]> granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c
Extend yesterday's patch so that the bgwriter is also told to forget
[postgresql] / src / backend / storage / smgr / md.c
1 /*-------------------------------------------------------------------------
2  *
3  * md.c
4  *        This code manages relations that reside on magnetic disk.
5  *
6  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.127 2007/01/17 16:25:01 tgl Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <unistd.h>
18 #include <fcntl.h>
19 #include <sys/file.h>
20
21 #include "catalog/catalog.h"
22 #include "miscadmin.h"
23 #include "postmaster/bgwriter.h"
24 #include "storage/fd.h"
25 #include "storage/bufmgr.h"
26 #include "storage/smgr.h"
27 #include "utils/hsearch.h"
28 #include "utils/memutils.h"
29
30
/*
 * Interval for calling AbsorbFsyncRequests in mdsync.
 * NOTE(review): mdsync is outside this view; presumably this is the number
 * of fsyncs performed between absorb calls --- confirm against mdsync.
 */
#define FSYNCS_PER_ABSORB               10

/*
 * Special values for the segno arg to RememberFsyncRequest.  They are taken
 * from the top of the BlockNumber range (InvalidBlockNumber and the value
 * just below it) and act as request codes rather than segment numbers.
 */
#define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
37
/*
 * FILE_POSSIBLY_DELETED(err) -- could this errno mean "file is gone"?
 *
 * Everywhere, ENOENT qualifies.  On Windows, EACCES has to be treated as a
 * possible synonym, because that is what you get for a file that has been
 * unlinked but is not yet gone.  Ugh.  Callers are designed so that these
 * cases are not actually believed to be okay without further evidence
 * (namely, a pending fsync request getting revoked ... see mdsync).
 */
#ifdef WIN32
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
#else
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
#endif
50
51 /*
52  *      The magnetic disk storage manager keeps track of open file
53  *      descriptors in its own descriptor pool.  This is done to make it
54  *      easier to support relations that are larger than the operating
55  *      system's file size limit (often 2GBytes).  In order to do that,
56  *      we break relations up into "segment" files that are each shorter than
57  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
58  *      configuration constant in pg_config_manual.h.
59  *
60  *      On disk, a relation must consist of consecutively numbered segment
61  *      files in the pattern
62  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
63  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
64  *              -- Optionally, any number of inactive segments of size 0 blocks.
65  *      The full and partial segments are collectively the "active" segments.
66  *      Inactive segments are those that once contained data but are currently
67  *      not needed because of an mdtruncate() operation.  The reason for leaving
68  *      them present at size zero, rather than unlinking them, is that other
69  *      backends and/or the bgwriter might be holding open file references to
70  *      such segments.  If the relation expands again after mdtruncate(), such
71  *      that a deactivated segment becomes active again, it is important that
72  *      such file references still be valid --- else data might get written
73  *      out to an unlinked old copy of a segment file that will eventually
74  *      disappear.
75  *
76  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
77  *      cache is, therefore, just the head of a list of MdfdVec objects, one
78  *      per segment.  But note the md_fd pointer can be NULL, indicating
79  *      relation not open.
80  *
81  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
82  *      doesn't have another segment after this one; we may just not have
83  *      opened the next segment yet.  (We could not have "all segments are
84  *      in the chain" as an invariant anyway, since another backend could
85  *      extend the relation when we weren't looking.)  We do not make chain
86  *      entries for inactive segments, however; as soon as we find a partial
87  *      segment, we assume that any subsequent segments are inactive.
88  *
89  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
90  *
91  *      Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
92  *      for use on machines that support large files.  Beware that that
93  *      code has not been tested in a long time and is probably bit-rotted.
94  */
95
96 typedef struct _MdfdVec
97 {
98         File            mdfd_vfd;               /* fd number in fd.c's pool */
99         BlockNumber mdfd_segno;         /* segment number, from 0 */
100 #ifndef LET_OS_MANAGE_FILESIZE  /* for large relations */
101         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
102 #endif
103 } MdfdVec;
104
/*
 * Memory context for all md.c allocations.  mdinit() creates it as a child
 * of TopMemoryContext (named "MdSmgr"); MdfdVec objects and the pending-ops
 * hash table are allocated in it.
 */
static MemoryContext MdCxt;             /* context for all md.c allocations */
106
107
108 /*
109  * In some contexts (currently, standalone backends and the bgwriter process)
110  * we keep track of pending fsync operations: we need to remember all relation
111  * segments that have been written since the last checkpoint, so that we can
112  * fsync them down to disk before completing the next checkpoint.  This hash
113  * table remembers the pending operations.      We use a hash table mostly as
114  * a convenient way of eliminating duplicate requests.
115  *
116  * (Regular backends do not track pending operations locally, but forward
117  * them to the bgwriter.)
118  */
119 typedef struct
120 {
121         RelFileNode rnode;                      /* the targeted relation */
122         BlockNumber segno;                      /* which segment */
123 } PendingOperationTag;
124
125 typedef struct
126 {
127         PendingOperationTag tag;        /* hash table key (must be first!) */
128         int                     failures;               /* number of failed attempts to fsync */
129 } PendingOperationEntry;
130
131 static HTAB *pendingOpsTable = NULL;
132
133
/*
 * What mdopen and _mdfd_getseg should do when the file or segment they are
 * asked for is not present.
 */
typedef enum
{
	EXTENSION_FAIL = 0,			/* ereport if segment not present */
	EXTENSION_RETURN_NULL = 1,	/* return NULL if not present */
	EXTENSION_CREATE = 2		/* create new segments as needed */
} ExtensionBehavior;
140
141 /* local routines */
142 static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
143 static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
144 static MdfdVec *_fdvec_alloc(void);
145
146 #ifndef LET_OS_MANAGE_FILESIZE
147 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
148                           int oflags);
149 #endif
150 static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
151                                                          bool isTemp, ExtensionBehavior behavior);
152 static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);
153
154
155 /*
156  *      mdinit() -- Initialize private state for magnetic disk storage manager.
157  */
158 void
159 mdinit(void)
160 {
161         MdCxt = AllocSetContextCreate(TopMemoryContext,
162                                                                   "MdSmgr",
163                                                                   ALLOCSET_DEFAULT_MINSIZE,
164                                                                   ALLOCSET_DEFAULT_INITSIZE,
165                                                                   ALLOCSET_DEFAULT_MAXSIZE);
166
167         /*
168          * Create pending-operations hashtable if we need it.  Currently, we need
169          * it if we are standalone (not under a postmaster) OR if we are a
170          * bootstrap-mode subprocess of a postmaster (that is, a startup or
171          * bgwriter process).
172          */
173         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
174         {
175                 HASHCTL         hash_ctl;
176
177                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
178                 hash_ctl.keysize = sizeof(PendingOperationTag);
179                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
180                 hash_ctl.hash = tag_hash;
181                 hash_ctl.hcxt = MdCxt;
182                 pendingOpsTable = hash_create("Pending Ops Table",
183                                                                           100L,
184                                                                           &hash_ctl,
185                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
186         }
187 }
188
189 /*
190  *      mdcreate() -- Create a new relation on magnetic disk.
191  *
192  * If isRedo is true, it's okay for the relation to exist already.
193  */
194 void
195 mdcreate(SMgrRelation reln, bool isRedo)
196 {
197         char       *path;
198         File            fd;
199
200         if (isRedo && reln->md_fd != NULL)
201                 return;                                 /* created and opened already... */
202
203         Assert(reln->md_fd == NULL);
204
205         path = relpath(reln->smgr_rnode);
206
207         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
208
209         if (fd < 0)
210         {
211                 int                     save_errno = errno;
212
213                 /*
214                  * During bootstrap, there are cases where a system relation will be
215                  * accessed (by internal backend processes) before the bootstrap
216                  * script nominally creates it.  Therefore, allow the file to exist
217                  * already, even if isRedo is not set.  (See also mdopen)
218                  */
219                 if (isRedo || IsBootstrapProcessingMode())
220                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
221                 if (fd < 0)
222                 {
223                         pfree(path);
224                         /* be sure to report the error reported by create, not open */
225                         errno = save_errno;
226                         ereport(ERROR,
227                                         (errcode_for_file_access(),
228                                          errmsg("could not create relation %u/%u/%u: %m",
229                                                         reln->smgr_rnode.spcNode,
230                                                         reln->smgr_rnode.dbNode,
231                                                         reln->smgr_rnode.relNode)));
232                 }
233         }
234
235         pfree(path);
236
237         reln->md_fd = _fdvec_alloc();
238
239         reln->md_fd->mdfd_vfd = fd;
240         reln->md_fd->mdfd_segno = 0;
241 #ifndef LET_OS_MANAGE_FILESIZE
242         reln->md_fd->mdfd_chain = NULL;
243 #endif
244 }
245
246 /*
247  *      mdunlink() -- Unlink a relation.
248  *
249  * Note that we're passed a RelFileNode --- by the time this is called,
250  * there won't be an SMgrRelation hashtable entry anymore.
251  *
252  * If isRedo is true, it's okay for the relation to be already gone.
253  * Also, any failure should be reported as WARNING not ERROR, because
254  * we are usually not in a transaction anymore when this is called.
255  */
256 void
257 mdunlink(RelFileNode rnode, bool isRedo)
258 {
259         char       *path;
260
261         /*
262          * We have to clean out any pending fsync requests for the doomed relation,
263          * else the next mdsync() will fail.
264          */
265         ForgetRelationFsyncRequests(rnode);
266
267         path = relpath(rnode);
268
269         /* Delete the first segment, or only segment if not doing segmenting */
270         if (unlink(path) < 0)
271         {
272                 if (!isRedo || errno != ENOENT)
273                         ereport(WARNING,
274                                         (errcode_for_file_access(),
275                                          errmsg("could not remove relation %u/%u/%u: %m",
276                                                         rnode.spcNode,
277                                                         rnode.dbNode,
278                                                         rnode.relNode)));
279         }
280
281 #ifndef LET_OS_MANAGE_FILESIZE
282         /* Delete the additional segments, if any */
283         else
284         {
285                 char       *segpath = (char *) palloc(strlen(path) + 12);
286                 BlockNumber segno;
287
288                 /*
289                  * Note that because we loop until getting ENOENT, we will
290                  * correctly remove all inactive segments as well as active ones.
291                  */
292                 for (segno = 1;; segno++)
293                 {
294                         sprintf(segpath, "%s.%u", path, segno);
295                         if (unlink(segpath) < 0)
296                         {
297                                 /* ENOENT is expected after the last segment... */
298                                 if (errno != ENOENT)
299                                         ereport(WARNING,
300                                                         (errcode_for_file_access(),
301                                                          errmsg("could not remove segment %u of relation %u/%u/%u: %m",
302                                                                         segno,
303                                                                         rnode.spcNode,
304                                                                         rnode.dbNode,
305                                                                         rnode.relNode)));
306                                 break;
307                         }
308                 }
309                 pfree(segpath);
310         }
311 #endif
312
313         pfree(path);
314 }
315
316 /*
317  *      mdextend() -- Add a block to the specified relation.
318  *
319  *              The semantics are nearly the same as mdwrite(): write at the
320  *              specified position.  However, this is to be used for the case of
321  *              extending a relation (i.e., blocknum is at or beyond the current
322  *              EOF).  Note that we assume writing a block beyond current EOF
323  *              causes intervening file space to become filled with zeroes.
324  */
325 void
326 mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
327 {
328         long            seekpos;
329         int                     nbytes;
330         MdfdVec    *v;
331
332         /* This assert is too expensive to have on normally ... */
333 #ifdef CHECK_WRITE_VS_EXTEND
334         Assert(blocknum >= mdnblocks(reln));
335 #endif
336
337         /*
338          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it
339          * any more --- we mustn't create a block whose number
340          * actually is InvalidBlockNumber.
341          */
342         if (blocknum == InvalidBlockNumber)
343                 ereport(ERROR,
344                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
345                                  errmsg("cannot extend relation %u/%u/%u beyond %u blocks",
346                                                 reln->smgr_rnode.spcNode,
347                                                 reln->smgr_rnode.dbNode,
348                                                 reln->smgr_rnode.relNode,
349                                                 InvalidBlockNumber)));
350
351         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
352
353 #ifndef LET_OS_MANAGE_FILESIZE
354         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
355         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
356 #else
357         seekpos = (long) (BLCKSZ * (blocknum));
358 #endif
359
360         /*
361          * Note: because caller usually obtained blocknum by calling mdnblocks,
362          * which did a seek(SEEK_END), this seek is often redundant and will be
363          * optimized away by fd.c.  It's not redundant, however, if there is a
364          * partial page at the end of the file. In that case we want to try to
365          * overwrite the partial page with a full page.  It's also not redundant
366          * if bufmgr.c had to dump another buffer of the same file to make room
367          * for the new page's buffer.
368          */
369         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
370                 ereport(ERROR,
371                                 (errcode_for_file_access(),
372                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
373                                                 blocknum,
374                                                 reln->smgr_rnode.spcNode,
375                                                 reln->smgr_rnode.dbNode,
376                                                 reln->smgr_rnode.relNode)));
377
378         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
379         {
380                 if (nbytes < 0)
381                         ereport(ERROR,
382                                         (errcode_for_file_access(),
383                                          errmsg("could not extend relation %u/%u/%u: %m",
384                                                         reln->smgr_rnode.spcNode,
385                                                         reln->smgr_rnode.dbNode,
386                                                         reln->smgr_rnode.relNode),
387                                          errhint("Check free disk space.")));
388                 /* short write: complain appropriately */
389                 ereport(ERROR,
390                                 (errcode(ERRCODE_DISK_FULL),
391                                  errmsg("could not extend relation %u/%u/%u: wrote only %d of %d bytes at block %u",
392                                                 reln->smgr_rnode.spcNode,
393                                                 reln->smgr_rnode.dbNode,
394                                                 reln->smgr_rnode.relNode,
395                                                 nbytes, BLCKSZ, blocknum),
396                                  errhint("Check free disk space.")));
397         }
398
399         if (!isTemp)
400                 register_dirty_segment(reln, v);
401
402 #ifndef LET_OS_MANAGE_FILESIZE
403         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
404 #endif
405 }
406
407 /*
408  *      mdopen() -- Open the specified relation.
409  *
410  * Note we only open the first segment, when there are multiple segments.
411  *
412  * If first segment is not present, either ereport or return NULL according
413  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
414  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
415  * invent one out of whole cloth.
416  */
417 static MdfdVec *
418 mdopen(SMgrRelation reln, ExtensionBehavior behavior)
419 {
420         MdfdVec    *mdfd;
421         char       *path;
422         File            fd;
423
424         /* No work if already open */
425         if (reln->md_fd)
426                 return reln->md_fd;
427
428         path = relpath(reln->smgr_rnode);
429
430         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
431
432         if (fd < 0)
433         {
434                 /*
435                  * During bootstrap, there are cases where a system relation will be
436                  * accessed (by internal backend processes) before the bootstrap
437                  * script nominally creates it.  Therefore, accept mdopen() as a
438                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
439                  */
440                 if (IsBootstrapProcessingMode())
441                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
442                 if (fd < 0)
443                 {
444                         pfree(path);
445                         if (behavior == EXTENSION_RETURN_NULL &&
446                                 FILE_POSSIBLY_DELETED(errno))
447                                 return NULL;
448                         ereport(ERROR,
449                                         (errcode_for_file_access(),
450                                          errmsg("could not open relation %u/%u/%u: %m",
451                                                         reln->smgr_rnode.spcNode,
452                                                         reln->smgr_rnode.dbNode,
453                                                         reln->smgr_rnode.relNode)));
454                 }
455         }
456
457         pfree(path);
458
459         reln->md_fd = mdfd = _fdvec_alloc();
460
461         mdfd->mdfd_vfd = fd;
462         mdfd->mdfd_segno = 0;
463 #ifndef LET_OS_MANAGE_FILESIZE
464         mdfd->mdfd_chain = NULL;
465         Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
466 #endif
467
468         return mdfd;
469 }
470
471 /*
472  *      mdclose() -- Close the specified relation, if it isn't closed already.
473  */
474 void
475 mdclose(SMgrRelation reln)
476 {
477         MdfdVec    *v = reln->md_fd;
478
479         /* No work if already closed */
480         if (v == NULL)
481                 return;
482
483         reln->md_fd = NULL;                     /* prevent dangling pointer after error */
484
485 #ifndef LET_OS_MANAGE_FILESIZE
486         while (v != NULL)
487         {
488                 MdfdVec    *ov = v;
489
490                 /* if not closed already */
491                 if (v->mdfd_vfd >= 0)
492                         FileClose(v->mdfd_vfd);
493                 /* Now free vector */
494                 v = v->mdfd_chain;
495                 pfree(ov);
496         }
497 #else
498         if (v->mdfd_vfd >= 0)
499                 FileClose(v->mdfd_vfd);
500         pfree(v);
501 #endif
502 }
503
504 /*
505  *      mdread() -- Read the specified block from a relation.
506  */
507 void
508 mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
509 {
510         long            seekpos;
511         int                     nbytes;
512         MdfdVec    *v;
513
514         v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
515
516 #ifndef LET_OS_MANAGE_FILESIZE
517         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
518         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
519 #else
520         seekpos = (long) (BLCKSZ * (blocknum));
521 #endif
522
523         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
524                 ereport(ERROR,
525                                 (errcode_for_file_access(),
526                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
527                                                 blocknum,
528                                                 reln->smgr_rnode.spcNode,
529                                                 reln->smgr_rnode.dbNode,
530                                                 reln->smgr_rnode.relNode)));
531
532         if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
533         {
534                 if (nbytes < 0)
535                         ereport(ERROR,
536                                         (errcode_for_file_access(),
537                                          errmsg("could not read block %u of relation %u/%u/%u: %m",
538                                                         blocknum,
539                                                         reln->smgr_rnode.spcNode,
540                                                         reln->smgr_rnode.dbNode,
541                                                         reln->smgr_rnode.relNode)));
542                 /*
543                  * Short read: we are at or past EOF, or we read a partial block at
544                  * EOF.  Normally this is an error; upper levels should never try to
545                  * read a nonexistent block.  However, if zero_damaged_pages is ON
546                  * or we are InRecovery, we should instead return zeroes without
547                  * complaining.  This allows, for example, the case of trying to
548                  * update a block that was later truncated away.
549                  */
550                 if (zero_damaged_pages || InRecovery)
551                         MemSet(buffer, 0, BLCKSZ);
552                 else
553                         ereport(ERROR,
554                                         (errcode(ERRCODE_DATA_CORRUPTED),
555                                          errmsg("could not read block %u of relation %u/%u/%u: read only %d of %d bytes",
556                                                         blocknum,
557                                                         reln->smgr_rnode.spcNode,
558                                                         reln->smgr_rnode.dbNode,
559                                                         reln->smgr_rnode.relNode,
560                                                         nbytes, BLCKSZ)));
561         }
562 }
563
564 /*
565  *      mdwrite() -- Write the supplied block at the appropriate location.
566  *
567  *              This is to be used only for updating already-existing blocks of a
568  *              relation (ie, those before the current EOF).  To extend a relation,
569  *              use mdextend().
570  */
571 void
572 mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
573 {
574         long            seekpos;
575         int                     nbytes;
576         MdfdVec    *v;
577
578         /* This assert is too expensive to have on normally ... */
579 #ifdef CHECK_WRITE_VS_EXTEND
580         Assert(blocknum < mdnblocks(reln));
581 #endif
582
583         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
584
585 #ifndef LET_OS_MANAGE_FILESIZE
586         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
587         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
588 #else
589         seekpos = (long) (BLCKSZ * (blocknum));
590 #endif
591
592         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
593                 ereport(ERROR,
594                                 (errcode_for_file_access(),
595                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
596                                                 blocknum,
597                                                 reln->smgr_rnode.spcNode,
598                                                 reln->smgr_rnode.dbNode,
599                                                 reln->smgr_rnode.relNode)));
600
601         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
602         {
603                 if (nbytes < 0)
604                         ereport(ERROR,
605                                         (errcode_for_file_access(),
606                                          errmsg("could not write block %u of relation %u/%u/%u: %m",
607                                                         blocknum,
608                                                         reln->smgr_rnode.spcNode,
609                                                         reln->smgr_rnode.dbNode,
610                                                         reln->smgr_rnode.relNode)));
611                 /* short write: complain appropriately */
612                 ereport(ERROR,
613                                 (errcode(ERRCODE_DISK_FULL),
614                                  errmsg("could not write block %u of relation %u/%u/%u: wrote only %d of %d bytes",
615                                                 blocknum,
616                                                 reln->smgr_rnode.spcNode,
617                                                 reln->smgr_rnode.dbNode,
618                                                 reln->smgr_rnode.relNode,
619                                                 nbytes, BLCKSZ),
620                                  errhint("Check free disk space.")));
621         }
622
623         if (!isTemp)
624                 register_dirty_segment(reln, v);
625 }
626
627 /*
628  *      mdnblocks() -- Get the number of blocks stored in a relation.
629  *
630  *              Important side effect: all active segments of the relation are opened
631  *              and added to the mdfd_chain list.  If this routine has not been
632  *              called, then only segments up to the last one actually touched
633  *              are present in the chain.
634  */
635 BlockNumber
636 mdnblocks(SMgrRelation reln)
637 {
638         MdfdVec    *v = mdopen(reln, EXTENSION_FAIL);
639
640 #ifndef LET_OS_MANAGE_FILESIZE
641         BlockNumber nblocks;
642         BlockNumber segno = 0;
643
644         /*
645          * Skip through any segments that aren't the last one, to avoid redundant
646          * seeks on them.  We have previously verified that these segments are
647          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
648          *
649          * NOTE: this assumption could only be wrong if another backend has
650          * truncated the relation.      We rely on higher code levels to handle that
651          * scenario by closing and re-opening the md fd, which is handled via
652          * relcache flush.  (Since the bgwriter doesn't participate in relcache
653          * flush, it could have segment chain entries for inactive segments;
654          * that's OK because the bgwriter never needs to compute relation size.)
655          */
656         while (v->mdfd_chain != NULL)
657         {
658                 segno++;
659                 v = v->mdfd_chain;
660         }
661
662         for (;;)
663         {
664                 nblocks = _mdnblocks(reln, v);
665                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
666                         elog(FATAL, "segment too big");
667                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
668                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
669
670                 /*
671                  * If segment is exactly RELSEG_SIZE, advance to next one.
672                  */
673                 segno++;
674
675                 if (v->mdfd_chain == NULL)
676                 {
677                         /*
678                          * Because we pass O_CREAT, we will create the next segment (with
679                          * zero length) immediately, if the last segment is of length
680                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
681                          * the logic simple.
682                          */
683                         v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
684                         if (v->mdfd_chain == NULL)
685                                 ereport(ERROR,
686                                                 (errcode_for_file_access(),
687                                                  errmsg("could not open segment %u of relation %u/%u/%u: %m",
688                                                                 segno,
689                                                                 reln->smgr_rnode.spcNode,
690                                                                 reln->smgr_rnode.dbNode,
691                                                                 reln->smgr_rnode.relNode)));
692                 }
693
694                 v = v->mdfd_chain;
695         }
696 #else
697         return _mdnblocks(reln, v);
698 #endif
699 }
700
701 /*
702  *      mdtruncate() -- Truncate relation to specified number of blocks.
703  */
704 void
705 mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
706 {
707         MdfdVec    *v;
708         BlockNumber curnblk;
709
710 #ifndef LET_OS_MANAGE_FILESIZE
711         BlockNumber priorblocks;
712 #endif
713
714         /*
715          * NOTE: mdnblocks makes sure we have opened all active segments, so
716          * that truncation loop will get them all!
717          */
718         curnblk = mdnblocks(reln);
719         if (nblocks > curnblk)
720         {
721                 /* Bogus request ... but no complaint if InRecovery */
722                 if (InRecovery)
723                         return;
724                 ereport(ERROR,
725                                 (errmsg("could not truncate relation %u/%u/%u to %u blocks: it's only %u blocks now",
726                                                 reln->smgr_rnode.spcNode,
727                                                 reln->smgr_rnode.dbNode,
728                                                 reln->smgr_rnode.relNode,
729                                                 nblocks, curnblk)));
730         }
731         if (nblocks == curnblk)
732                 return;                                 /* no work */
733
734         v = mdopen(reln, EXTENSION_FAIL);
735
736 #ifndef LET_OS_MANAGE_FILESIZE
737         priorblocks = 0;
738         while (v != NULL)
739         {
740                 MdfdVec    *ov = v;
741
742                 if (priorblocks > nblocks)
743                 {
744                         /*
745                          * This segment is no longer active (and has already been
746                          * unlinked from the mdfd_chain). We truncate the file, but do
747                          * not delete it, for reasons explained in the header comments.
748                          */
749                         if (FileTruncate(v->mdfd_vfd, 0) < 0)
750                                 ereport(ERROR,
751                                                 (errcode_for_file_access(),
752                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
753                                                                 reln->smgr_rnode.spcNode,
754                                                                 reln->smgr_rnode.dbNode,
755                                                                 reln->smgr_rnode.relNode,
756                                                                 nblocks)));
757                         if (!isTemp)
758                                 register_dirty_segment(reln, v);
759                         v = v->mdfd_chain;
760                         Assert(ov != reln->md_fd);      /* we never drop the 1st segment */
761                         pfree(ov);
762                 }
763                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
764                 {
765                         /*
766                          * This is the last segment we want to keep. Truncate the file to
767                          * the right length, and clear chain link that points to any
768                          * remaining segments (which we shall zap). NOTE: if nblocks is
769                          * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
770                          * segment to 0 length but keep it. This adheres to the invariant
771                          * given in the header comments.
772                          */
773                         BlockNumber lastsegblocks = nblocks - priorblocks;
774
775                         if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
776                                 ereport(ERROR,
777                                                 (errcode_for_file_access(),
778                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
779                                                                 reln->smgr_rnode.spcNode,
780                                                                 reln->smgr_rnode.dbNode,
781                                                                 reln->smgr_rnode.relNode,
782                                                                 nblocks)));
783                         if (!isTemp)
784                                 register_dirty_segment(reln, v);
785                         v = v->mdfd_chain;
786                         ov->mdfd_chain = NULL;
787                 }
788                 else
789                 {
790                         /*
791                          * We still need this segment and 0 or more blocks beyond it, so
792                          * nothing to do here.
793                          */
794                         v = v->mdfd_chain;
795                 }
796                 priorblocks += RELSEG_SIZE;
797         }
798 #else
799         if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
800                 ereport(ERROR,
801                                 (errcode_for_file_access(),
802                           errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
803                                          reln->smgr_rnode.spcNode,
804                                          reln->smgr_rnode.dbNode,
805                                          reln->smgr_rnode.relNode,
806                                          nblocks)));
807         if (!isTemp)
808                 register_dirty_segment(reln, v);
809 #endif
810 }
811
812 /*
813  *      mdimmedsync() -- Immediately sync a relation to stable storage.
814  *
815  * Note that only writes already issued are synced; this routine knows
816  * nothing of dirty buffers that may exist inside the buffer manager.
817  */
818 void
819 mdimmedsync(SMgrRelation reln)
820 {
821         MdfdVec    *v;
822         BlockNumber curnblk;
823
824         /*
825          * NOTE: mdnblocks makes sure we have opened all active segments, so
826          * that fsync loop will get them all!
827          */
828         curnblk = mdnblocks(reln);
829
830         v = mdopen(reln, EXTENSION_FAIL);
831
832 #ifndef LET_OS_MANAGE_FILESIZE
833         while (v != NULL)
834         {
835                 if (FileSync(v->mdfd_vfd) < 0)
836                         ereport(ERROR,
837                                         (errcode_for_file_access(),
838                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
839                                                         v->mdfd_segno,
840                                                         reln->smgr_rnode.spcNode,
841                                                         reln->smgr_rnode.dbNode,
842                                                         reln->smgr_rnode.relNode)));
843                 v = v->mdfd_chain;
844         }
845 #else
846         if (FileSync(v->mdfd_vfd) < 0)
847                 ereport(ERROR,
848                                 (errcode_for_file_access(),
849                                  errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
850                                                 v->mdfd_segno,
851                                                 reln->smgr_rnode.spcNode,
852                                                 reln->smgr_rnode.dbNode,
853                                                 reln->smgr_rnode.relNode)));
854 #endif
855 }
856
857 /*
858  *      mdsync() -- Sync previous writes to stable storage.
859  *
860  * This is only called during checkpoints, and checkpoints should only
861  * occur in processes that have created a pendingOpsTable.
862  */
863 void
864 mdsync(void)
865 {
866         bool            need_retry;
867
868         if (!pendingOpsTable)
869                 elog(ERROR, "cannot sync without a pendingOpsTable");
870
871         /*
872          * The fsync table could contain requests to fsync relations that have
873          * been deleted (unlinked) by the time we get to them.  Rather than
874          * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
875          * what we will do is retry the whole process after absorbing fsync
876          * request messages again.  Since mdunlink() queues a "revoke" message
877          * before actually unlinking, the fsync request is guaranteed to be gone
878          * the second time if it really was this case.  DROP DATABASE likewise
879          * has to tell us to forget fsync requests before it starts deletions.
880          */
881         do {
882                 HASH_SEQ_STATUS hstat;
883                 PendingOperationEntry *entry;
884                 int                     absorb_counter;
885
886                 need_retry = false;
887
888                 /*
889                  * If we are in the bgwriter, the sync had better include all fsync
890                  * requests that were queued by backends before the checkpoint REDO
891                  * point was determined. We go that a little better by accepting all
892                  * requests queued up to the point where we start fsync'ing.
893                  */
894                 AbsorbFsyncRequests();
895
896                 absorb_counter = FSYNCS_PER_ABSORB;
897                 hash_seq_init(&hstat, pendingOpsTable);
898                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
899                 {
900                         /*
901                          * If fsync is off then we don't have to bother opening the file
902                          * at all.  (We delay checking until this point so that changing
903                          * fsync on the fly behaves sensibly.)
904                          */
905                         if (enableFsync)
906                         {
907                                 SMgrRelation reln;
908                                 MdfdVec    *seg;
909
910                                 /*
911                                  * If in bgwriter, we want to absorb pending requests every so
912                                  * often to prevent overflow of the fsync request queue.  This
913                                  * could result in deleting the current entry out from under
914                                  * our hashtable scan, so the procedure is to fall out of the
915                                  * scan and start over from the top of the function.
916                                  */
917                                 if (--absorb_counter <= 0)
918                                 {
919                                         need_retry = true;
920                                         break;
921                                 }
922
923                                 /*
924                                  * Find or create an smgr hash entry for this relation. This
925                                  * may seem a bit unclean -- md calling smgr?  But it's really
926                                  * the best solution.  It ensures that the open file reference
927                                  * isn't permanently leaked if we get an error here. (You may
928                                  * say "but an unreferenced SMgrRelation is still a leak!" Not
929                                  * really, because the only case in which a checkpoint is done
930                                  * by a process that isn't about to shut down is in the
931                                  * bgwriter, and it will periodically do smgrcloseall(). This
932                                  * fact justifies our not closing the reln in the success path
933                                  * either, which is a good thing since in non-bgwriter cases
934                                  * we couldn't safely do that.)  Furthermore, in many cases
935                                  * the relation will have been dirtied through this same smgr
936                                  * relation, and so we can save a file open/close cycle.
937                                  */
938                                 reln = smgropen(entry->tag.rnode);
939
940                                 /*
941                                  * It is possible that the relation has been dropped or
942                                  * truncated since the fsync request was entered.  Therefore,
943                                  * allow ENOENT, but only if we didn't fail once already on
944                                  * this file.  This applies both during _mdfd_getseg() and
945                                  * during FileSync, since fd.c might have closed the file
946                                  * behind our back.
947                                  */
948                                 seg = _mdfd_getseg(reln,
949                                                                    entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
950                                                                    false, EXTENSION_RETURN_NULL);
951                                 if (seg == NULL ||
952                                         FileSync(seg->mdfd_vfd) < 0)
953                                 {
954                                         /*
955                                          * XXX is there any point in allowing more than one try?
956                                          * Don't see one at the moment, but easy to change the
957                                          * test here if so.
958                                          */
959                                         if (!FILE_POSSIBLY_DELETED(errno) ||
960                                                 ++(entry->failures) > 1)
961                                                 ereport(ERROR,
962                                                                 (errcode_for_file_access(),
963                                                                  errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
964                                                                                 entry->tag.segno,
965                                                                                 entry->tag.rnode.spcNode,
966                                                                                 entry->tag.rnode.dbNode,
967                                                                                 entry->tag.rnode.relNode)));
968                                         else
969                                                 ereport(DEBUG1,
970                                                                 (errcode_for_file_access(),
971                                                                  errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
972                                                                                 entry->tag.segno,
973                                                                                 entry->tag.rnode.spcNode,
974                                                                                 entry->tag.rnode.dbNode,
975                                                                                 entry->tag.rnode.relNode)));
976                                         need_retry = true;
977                                         continue;       /* don't delete the hashtable entry */
978                                 }
979                         }
980
981                         /* Okay, delete this entry */
982                         if (hash_search(pendingOpsTable, &entry->tag,
983                                                         HASH_REMOVE, NULL) == NULL)
984                                 elog(ERROR, "pendingOpsTable corrupted");
985                 }
986         } while (need_retry);
987 }
988
989 /*
990  * register_dirty_segment() -- Mark a relation segment as needing fsync
991  *
992  * If there is a local pending-ops table, just make an entry in it for
993  * mdsync to process later.  Otherwise, try to pass off the fsync request
994  * to the background writer process.  If that fails, just do the fsync
995  * locally before returning (we expect this will not happen often enough
996  * to be a performance problem).
997  */
998 static void
999 register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
1000 {
1001         if (pendingOpsTable)
1002         {
1003                 /* push it into local pending-ops table */
1004                 RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
1005         }
1006         else
1007         {
1008                 if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
1009                         return;                         /* passed it off successfully */
1010
1011                 if (FileSync(seg->mdfd_vfd) < 0)
1012                         ereport(ERROR,
1013                                         (errcode_for_file_access(),
1014                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1015                                                         seg->mdfd_segno,
1016                                                         reln->smgr_rnode.spcNode,
1017                                                         reln->smgr_rnode.dbNode,
1018                                                         reln->smgr_rnode.relNode)));
1019         }
1020 }
1021
1022 /*
1023  * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1024  *
1025  * We stuff the fsync request into the local hash table for execution
1026  * during the bgwriter's next checkpoint.
1027  *
1028  * The range of possible segment numbers is way less than the range of
1029  * BlockNumber, so we can reserve high values of segno for special purposes.
1030  * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
1031  * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
1032  * a whole database.  (These are a tad slow because the hash table has to be
1033  * searched linearly, but it doesn't seem worth rethinking the table structure
1034  * for them.)
1035  */
1036 void
1037 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
1038 {
1039         Assert(pendingOpsTable);
1040
1041         if (segno == FORGET_RELATION_FSYNC)
1042         {
1043                 /* Remove any pending requests for the entire relation */
1044                 HASH_SEQ_STATUS hstat;
1045                 PendingOperationEntry *entry;
1046
1047                 hash_seq_init(&hstat, pendingOpsTable);
1048                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1049                 {
1050                         if (RelFileNodeEquals(entry->tag.rnode, rnode))
1051                         {
1052                                 /* Okay, delete this entry */
1053                                 if (hash_search(pendingOpsTable, &entry->tag,
1054                                                                 HASH_REMOVE, NULL) == NULL)
1055                                         elog(ERROR, "pendingOpsTable corrupted");
1056                         }
1057                 }
1058         }
1059         else if (segno == FORGET_DATABASE_FSYNC)
1060         {
1061                 /* Remove any pending requests for the entire database */
1062                 HASH_SEQ_STATUS hstat;
1063                 PendingOperationEntry *entry;
1064
1065                 hash_seq_init(&hstat, pendingOpsTable);
1066                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1067                 {
1068                         if (entry->tag.rnode.dbNode == rnode.dbNode)
1069                         {
1070                                 /* Okay, delete this entry */
1071                                 if (hash_search(pendingOpsTable, &entry->tag,
1072                                                                 HASH_REMOVE, NULL) == NULL)
1073                                         elog(ERROR, "pendingOpsTable corrupted");
1074                         }
1075                 }
1076         }
1077         else
1078         {
1079                 /* Normal case: enter a request to fsync this segment */
1080                 PendingOperationTag key;
1081                 PendingOperationEntry *entry;
1082                 bool            found;
1083
1084                 /* ensure any pad bytes in the hash key are zeroed */
1085                 MemSet(&key, 0, sizeof(key));
1086                 key.rnode = rnode;
1087                 key.segno = segno;
1088
1089                 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1090                                                                                                           &key,
1091                                                                                                           HASH_ENTER,
1092                                                                                                           &found);
1093                 if (!found)                             /* new entry, so initialize it */
1094                         entry->failures = 0;
1095         }
1096 }
1097
1098 /*
1099  * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten
1100  */
1101 void
1102 ForgetRelationFsyncRequests(RelFileNode rnode)
1103 {
1104         if (pendingOpsTable)
1105         {
1106                 /* standalone backend or startup process: fsync state is local */
1107                 RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
1108         }
1109         else if (IsUnderPostmaster)
1110         {
1111                 /*
1112                  * Notify the bgwriter about it.  If we fail to queue the revoke
1113                  * message, we have to sleep and try again ... ugly, but hopefully
1114                  * won't happen often.
1115                  *
1116                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with
1117                  * an error would leave the no-longer-used file still present on
1118                  * disk, which would be bad, so I'm inclined to assume that the
1119                  * bgwriter will always empty the queue soon.
1120                  */
1121                 while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
1122                         pg_usleep(10000L);      /* 10 msec seems a good number */
1123                 /*
1124                  * Note we don't wait for the bgwriter to actually absorb the
1125                  * revoke message; see mdsync() for the implications.
1126                  */
1127         }
1128 }
1129
1130 /*
1131  * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten
1132  */
1133 void
1134 ForgetDatabaseFsyncRequests(Oid dbid)
1135 {
1136         RelFileNode rnode;
1137
1138         rnode.dbNode = dbid;
1139         rnode.spcNode = 0;
1140         rnode.relNode = 0;
1141
1142         if (pendingOpsTable)
1143         {
1144                 /* standalone backend or startup process: fsync state is local */
1145                 RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
1146         }
1147         else if (IsUnderPostmaster)
1148         {
1149                 /* see notes in ForgetRelationFsyncRequests */
1150                 while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
1151                         pg_usleep(10000L);      /* 10 msec seems a good number */
1152         }
1153 }
1154
1155
1156 /*
1157  *      _fdvec_alloc() -- Make a MdfdVec object.
1158  */
1159 static MdfdVec *
1160 _fdvec_alloc(void)
1161 {
1162         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1163 }
1164
1165 #ifndef LET_OS_MANAGE_FILESIZE
1166
1167 /*
1168  * Open the specified segment of the relation,
1169  * and make a MdfdVec object for it.  Returns NULL on failure.
1170  */
1171 static MdfdVec *
1172 _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
1173 {
1174         MdfdVec    *v;
1175         int                     fd;
1176         char       *path,
1177                            *fullpath;
1178
1179         path = relpath(reln->smgr_rnode);
1180
1181         if (segno > 0)
1182         {
1183                 /* be sure we have enough space for the '.segno' */
1184                 fullpath = (char *) palloc(strlen(path) + 12);
1185                 sprintf(fullpath, "%s.%u", path, segno);
1186                 pfree(path);
1187         }
1188         else
1189                 fullpath = path;
1190
1191         /* open the file */
1192         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1193
1194         pfree(fullpath);
1195
1196         if (fd < 0)
1197                 return NULL;
1198
1199         /* allocate an mdfdvec entry for it */
1200         v = _fdvec_alloc();
1201
1202         /* fill the entry */
1203         v->mdfd_vfd = fd;
1204         v->mdfd_segno = segno;
1205         v->mdfd_chain = NULL;
1206         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
1207
1208         /* all done */
1209         return v;
1210 }
1211 #endif   /* LET_OS_MANAGE_FILESIZE */
1212
1213 /*
1214  *      _mdfd_getseg() -- Find the segment of the relation holding the
1215  *              specified block.
1216  *
1217  * If the segment doesn't exist, we ereport, return NULL, or create the
1218  * segment, according to "behavior".  Note: isTemp need only be correct
1219  * in the EXTENSION_CREATE case.
1220  */
1221 static MdfdVec *
1222 _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
1223                          ExtensionBehavior behavior)
1224 {
1225         MdfdVec    *v = mdopen(reln, behavior);
1226
1227 #ifndef LET_OS_MANAGE_FILESIZE
1228         BlockNumber targetseg;
1229         BlockNumber nextsegno;
1230
1231         if (!v)
1232                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1233
1234         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1235         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1236         {
1237                 Assert(nextsegno == v->mdfd_segno + 1);
1238
1239                 if (v->mdfd_chain == NULL)
1240                 {
1241                         /*
1242                          * Normally we will create new segments only if authorized by
1243                          * the caller (i.e., we are doing mdextend()).  But when doing
1244                          * WAL recovery, create segments anyway; this allows cases such as
1245                          * replaying WAL data that has a write into a high-numbered
1246                          * segment of a relation that was later deleted.  We want to go
1247                          * ahead and create the segments so we can finish out the replay.
1248                          *
1249                          * We have to maintain the invariant that segments before the
1250                          * last active segment are of size RELSEG_SIZE; therefore, pad
1251                          * them out with zeroes if needed.  (This only matters if caller
1252                          * is extending the relation discontiguously, but that can happen
1253                          * in hash indexes.)
1254                          */
1255                         if (behavior == EXTENSION_CREATE || InRecovery)
1256                         {
1257                                 if (_mdnblocks(reln, v) < RELSEG_SIZE)
1258                                 {
1259                                         char   *zerobuf = palloc0(BLCKSZ);
1260
1261                                         mdextend(reln, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1262                                                          zerobuf, isTemp);
1263                                         pfree(zerobuf);
1264                                 }
1265                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, O_CREAT);
1266                         }
1267                         else
1268                         {
1269                                 /* We won't create segment if not existent */
1270                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, 0);
1271                         }
1272                         if (v->mdfd_chain == NULL)
1273                         {
1274                                 if (behavior == EXTENSION_RETURN_NULL &&
1275                                         FILE_POSSIBLY_DELETED(errno))
1276                                         return NULL;
1277                                 ereport(ERROR,
1278                                                 (errcode_for_file_access(),
1279                                                  errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
1280                                                                 nextsegno,
1281                                                                 reln->smgr_rnode.spcNode,
1282                                                                 reln->smgr_rnode.dbNode,
1283                                                                 reln->smgr_rnode.relNode,
1284                                                                 blkno)));
1285                         }
1286                 }
1287                 v = v->mdfd_chain;
1288         }
1289 #endif
1290
1291         return v;
1292 }
1293
1294 /*
1295  * Get number of blocks present in a single disk file
1296  */
1297 static BlockNumber
1298 _mdnblocks(SMgrRelation reln, MdfdVec *seg)
1299 {
1300         long            len;
1301
1302         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1303         if (len < 0)
1304                 ereport(ERROR,
1305                                 (errcode_for_file_access(),
1306                                  errmsg("could not seek to end of segment %u of relation %u/%u/%u: %m",
1307                                                 seg->mdfd_segno,
1308                                                 reln->smgr_rnode.spcNode,
1309                                                 reln->smgr_rnode.dbNode,
1310                                                 reln->smgr_rnode.relNode)));
1311         /* note that this calculation will ignore any partial block at EOF */
1312         return (BlockNumber) (len / BLCKSZ);
1313 }