granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * md.c--
   4  *        This code manages relations that reside on magnetic disk.
   5  *
   6  * Copyright (c) 1994, Regents of the University of California
   7  *
   8  *
   9  * IDENTIFICATION
  10  *        $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.34 1998/07/24 03:31:35 scrappy Exp $
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #include <unistd.h>
  15 #include <stdio.h>                              /* for sprintf() */
  16 #include <string.h>
  17 #include <fcntl.h>                              /* for open() flags */
  18 #include <sys/file.h>
  19
  20 #include "postgres.h"
  21 #include "miscadmin.h"                  /* for DataDir */
  22
  23 #include "catalog/catalog.h"
  24 #include "storage/block.h"
  25 #include "storage/fd.h"
  26 #include "storage/smgr.h"               /* where the declarations go */
  27 #include "utils/mcxt.h"
  28 #include "utils/rel.h"
  29
  30 #undef DIAGNOSTIC
  31
  32 /*
  33  *      The magnetic disk storage manager keeps track of open file descriptors
  34  *      in its own descriptor pool.  This happens for two reasons.      First, at
  35  *      transaction boundaries, we walk the list of descriptors and flush
  36  *      anything that we've dirtied in the current transaction.  Second, we
  37  *      have to support relations of > 4GBytes.  In order to do this, we break
  38  *      relations up into chunks of < 2GBytes and store one chunk in each of
  39  *      several files that represent the relation.
  40  */
  41
  42 typedef struct _MdfdVec
  43 {
  44         int                     mdfd_vfd;               /* fd number in vfd pool */
  45         uint16          mdfd_flags;             /* clean, dirty, free */
  46         int                     mdfd_lstbcnt;   /* most recent block count */
  47         int                     mdfd_nextFree;  /* next free vector */
  48 #ifndef LET_OS_MANAGE_FILESIZE
  49         struct _MdfdVec *mdfd_chain;/* for large relations */
  50 #endif
  51 } MdfdVec;
  52
  53 static int      Nfds = 100;
  54 static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
  55 static int      Md_Free = -1;
  56 static int      CurFd = 0;
  57 static MemoryContext MdCxt;
  58
  59 #define MDFD_DIRTY              (uint16) 0x01
  60 #define MDFD_FREE               (uint16) 0x02
  61
  62 /*
  63  * RELSEG_SIZE appears to be the number of segments that can
  64  * be in a disk file.  It was defined as 262144 based on 8k
  65  * blocks, but now that the block size can be changed, this
  66  * has to be calculated at compile time.  Otherwise, the file
  67  * size limit would not work out to 2-gig (2147483648).
  68  *
  69  * The number needs to be (2 ** 31) / BLCKSZ, but to be keep
  70  * the math under MAXINT, pre-divide by 256 and use ...
  71  *
  72  *                       (((2 ** 23) / BLCKSZ) * (2 ** 8))
  73  *
  74  * 07 Jan 98  darrenk
  75  *
  76  * Now possibly let the OS handle it...
  77  *
  78  * 19 Mar 98  darrenk
  79  *
  80  */
  81
  82 #ifndef LET_OS_MANAGE_FILESIZE
  83 #define RELSEG_SIZE             ((8388608 / BLCKSZ) * 256)
  84 #endif
  85
  86 /* routines declared here */
  87 static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
  88 static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
  89 static int      _fdvec_alloc(void);
  90 static void _fdvec_free(int);
  91 static BlockNumber _mdnblocks(File file, Size blcksz);
  92
  93 /*
  94  *      mdinit() -- Initialize private state for magnetic disk storage manager.
  95  *
  96  *              We keep a private table of all file descriptors.  Whenever we do
  97  *              a write to one, we mark it dirty in our table.  Whenever we force
  98  *              changes to disk, we mark the file descriptor clean.  At transaction
  99  *              commit, we force changes to disk for all dirty file descriptors.
 100  *              This routine allocates and initializes the table.
 101  *
 102  *              Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
 103  */
 104 int
 105 mdinit()
 106 {
 107         MemoryContext oldcxt;
 108         int                     i;
 109
 110         MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
 111         if (MdCxt == (MemoryContext) NULL)
 112                 return (SM_FAIL);
 113
 114         oldcxt = MemoryContextSwitchTo(MdCxt);
 115         Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
 116         MemoryContextSwitchTo(oldcxt);
 117
 118         if (Md_fdvec == (MdfdVec *) NULL)
 119                 return (SM_FAIL);
 120
 121         MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
 122
 123         /* Set free list */
 124         for (i = 0; i < Nfds; i++)
 125         {
 126                 Md_fdvec[i].mdfd_nextFree = i + 1;
 127                 Md_fdvec[i].mdfd_flags = MDFD_FREE;
 128         }
 129         Md_Free = 0;
 130         Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
 131
 132         return (SM_SUCCESS);
 133 }
 134
 135 int
 136 mdcreate(Relation reln)
 137 {
 138         int                     fd,
 139                                 vfd;
 140         char       *path;
 141
 142         path = relpath(reln->rd_rel->relname.data);
 143         fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);
 144
 145         /*
 146          * If the file already exists and is empty, we pretend that the create
 147          * succeeded.  During bootstrap processing, we skip that check,
 148          * because pg_time, pg_variable, and pg_log get created before their
 149          * .bki file entries are processed.
 150          *
 151          * As the result of this pretence it was possible to have in pg_class > 1
 152          * records with the same relname. Actually, it should be fixed in
 153          * upper levels, too, but... -  vadim 05/06/97
 154          */
 155
 156         if (fd < 0)
 157         {
 158                 if (!IsBootstrapProcessingMode())
 159                         return (-1);
 160                 fd = FileNameOpenFile(path, O_RDWR, 0600);              /* Bootstrap */
 161                 if (fd < 0)
 162                         return (-1);
 163         }
 164
 165         vfd = _fdvec_alloc();
 166         if (vfd < 0)
 167                 return (-1);
 168
 169         Md_fdvec[vfd].mdfd_vfd = fd;
 170         Md_fdvec[vfd].mdfd_flags = (uint16) 0;
 171 #ifndef LET_OS_MANAGE_FILESIZE
 172         Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
 173 #endif
 174         Md_fdvec[vfd].mdfd_lstbcnt = 0;
 175
 176         return (vfd);
 177 }
 178
 179 /*
 180  *      mdunlink() -- Unlink a relation.
 181  */
 182 int
 183 mdunlink(Relation reln)
 184 {
 185         int                     fd;
 186         int                     i;
 187         MdfdVec    *v,
 188                            *ov;
 189         MemoryContext oldcxt;
 190         char            fname[NAMEDATALEN];
 191         char            tname[NAMEDATALEN + 10];                /* leave room for overflow
 192                                                                                                  * suffixes */
 193
 194         /*
 195          * On Windows NT you can't unlink a file if it is open so we have * to
 196          * do this.
 197          */
 198
 199         StrNCpy(fname, RelationGetRelationName(reln)->data, NAMEDATALEN);
 200
 201         if (FileNameUnlink(fname) < 0)
 202                 return (SM_FAIL);
 203
 204         /* unlink all the overflow files for large relations */
 205         for (i = 1;; i++)
 206         {
 207                 sprintf(tname, "%s.%d", fname, i);
 208                 if (FileNameUnlink(tname) < 0)
 209                         break;
 210         }
 211
 212         /* finally, clean out the mdfd vector */
 213         fd = RelationGetFile(reln);
 214         Md_fdvec[fd].mdfd_flags = (uint16) 0;
 215
 216         oldcxt = MemoryContextSwitchTo(MdCxt);
 217 #ifndef LET_OS_MANAGE_FILESIZE
 218         for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
 219         {
 220                 FileUnlink(v->mdfd_vfd);
 221                 ov = v;
 222                 v = v->mdfd_chain;
 223                 if (ov != &Md_fdvec[fd])
 224                         pfree(ov);
 225         }
 226         Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
 227 #else
 228         v = &Md_fdvec[fd];
 229         if (v != (MdfdVec *) NULL)
 230                 FileUnlink(v->mdfd_vfd);
 231 #endif
 232         MemoryContextSwitchTo(oldcxt);
 233
 234         _fdvec_free(fd);
 235
 236         return (SM_SUCCESS);
 237 }
 238
 239 /*
 240  *      mdextend() -- Add a block to the specified relation.
 241  *
 242  *              This routine returns SM_FAIL or SM_SUCCESS, with errno set as
 243  *              appropriate.
 244  */
 245 int
 246 mdextend(Relation reln, char *buffer)
 247 {
 248         long            pos;
 249         int                     nblocks;
 250         MdfdVec    *v;
 251
 252         nblocks = mdnblocks(reln);
 253         v = _mdfd_getseg(reln, nblocks, O_CREAT);
 254
 255         if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0)
 256                 return (SM_FAIL);
 257
 258         if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
 259                 return (SM_FAIL);
 260
 261         /* remember that we did a write, so we can sync at xact commit */
 262         v->mdfd_flags |= MDFD_DIRTY;
 263
 264         /* try to keep the last block count current, though it's just a hint */
 265 #ifndef LET_OS_MANAGE_FILESIZE
 266         if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
 267                 v->mdfd_lstbcnt = RELSEG_SIZE;
 268
 269 #ifdef DIAGNOSTIC
 270         if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
 271                 || v->mdfd_lstbcnt > RELSEG_SIZE)
 272                 elog(FATAL, "segment too big!");
 273 #endif
 274 #else
 275         v->mdfd_lstbcnt = ++nblocks;
 276 #endif
 277
 278         return (SM_SUCCESS);
 279 }
 280
 281 /*
 282  *      mdopen() -- Open the specified relation.
 283  */
 284 int
 285 mdopen(Relation reln)
 286 {
 287         char       *path;
 288         int                     fd;
 289         int                     vfd;
 290
 291         path = relpath(reln->rd_rel->relname.data);
 292
 293         fd = FileNameOpenFile(path, O_RDWR, 0600);
 294
 295         /* this should only happen during bootstrap processing */
 296         if (fd < 0)
 297                 fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);
 298
 299         vfd = _fdvec_alloc();
 300         if (vfd < 0)
 301                 return (-1);
 302
 303         Md_fdvec[vfd].mdfd_vfd = fd;
 304         Md_fdvec[vfd].mdfd_flags = (uint16) 0;
 305         Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
 306 #ifndef LET_OS_MANAGE_FILESIZE
 307         Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
 308
 309 #ifdef DIAGNOSTIC
 310         if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
 311                 elog(FATAL, "segment too big on relopen!");
 312 #endif
 313 #endif
 314
 315         return (vfd);
 316 }
 317
 318 /*
 319  *      mdclose() -- Close the specified relation
 320  *
 321  *              AND FREE fd vector! It may be re-used for other relation!
 322  *              reln should be flushed from cache after closing !..
 323  *
 324  *              Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
 325  */
 326 int
 327 mdclose(Relation reln)
 328 {
 329         int                     fd;
 330         MdfdVec    *v,
 331                            *ov;
 332         MemoryContext oldcxt;
 333
 334         fd = RelationGetFile(reln);
 335
 336         oldcxt = MemoryContextSwitchTo(MdCxt);
 337 #ifndef LET_OS_MANAGE_FILESIZE
 338         for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
 339         {
 340                 /* if not closed already */
 341                 if (v->mdfd_vfd >= 0)
 342                 {
 343
 344                         /*
 345                          * We sync the file descriptor so that we don't need to reopen
 346                          * it at transaction commit to force changes to disk.
 347                          */
 348
 349                         FileSync(v->mdfd_vfd);
 350                         FileClose(v->mdfd_vfd);
 351
 352                         /* mark this file descriptor as clean in our private table */
 353                         v->mdfd_flags &= ~MDFD_DIRTY;
 354                 }
 355                 /* Now free vector */
 356                 ov = v;
 357                 v = v->mdfd_chain;
 358                 if (ov != &Md_fdvec[fd])
 359                         pfree(ov);
 360         }
 361
 362         Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
 363 #else
 364         v = &Md_fdvec[fd];
 365         if (v != (MdfdVec *) NULL)
 366         {
 367                 if (v->mdfd_vfd >= 0)
 368                 {
 369
 370                         /*
 371                          * We sync the file descriptor so that we don't need to reopen
 372                          * it at transaction commit to force changes to disk.
 373                          */
 374
 375                         FileSync(v->mdfd_vfd);
 376                         FileClose(v->mdfd_vfd);
 377
 378                         /* mark this file descriptor as clean in our private table */
 379                         v->mdfd_flags &= ~MDFD_DIRTY;
 380                 }
 381         }
 382 #endif
 383         MemoryContextSwitchTo(oldcxt);
 384
 385         _fdvec_free(fd);
 386
 387         return (SM_SUCCESS);
 388 }
 389
 390 /*
 391  *      mdread() -- Read the specified block from a relation.
 392  *
 393  *              Returns SM_SUCCESS or SM_FAIL.
 394  */
 395 int
 396 mdread(Relation reln, BlockNumber blocknum, char *buffer)
 397 {
 398         int                     status;
 399         long            seekpos;
 400         int                     nbytes;
 401         MdfdVec    *v;
 402
 403         v = _mdfd_getseg(reln, blocknum, 0);
 404
 405 #ifndef LET_OS_MANAGE_FILESIZE
 406         seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
 407
 408 #ifdef DIAGNOSTIC
 409         if (seekpos >= BLCKSZ * RELSEG_SIZE)
 410                 elog(FATAL, "seekpos too big!");
 411 #endif
 412 #else
 413         seekpos = (long) (BLCKSZ * (blocknum));
 414 #endif
 415
 416         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 417                 return (SM_FAIL);
 418
 419         status = SM_SUCCESS;
 420         if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 421         {
 422                 if (nbytes == 0)
 423                         MemSet(buffer, 0, BLCKSZ);
 424                 else
 425                         status = SM_FAIL;
 426         }
 427
 428         return (status);
 429 }
 430
 431 /*
 432  *      mdwrite() -- Write the supplied block at the appropriate location.
 433  *
 434  *              Returns SM_SUCCESS or SM_FAIL.
 435  */
 436 int
 437 mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
 438 {
 439         int                     status;
 440         long            seekpos;
 441         MdfdVec    *v;
 442
 443         v = _mdfd_getseg(reln, blocknum, 0);
 444
 445 #ifndef LET_OS_MANAGE_FILESIZE
 446         seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
 447 #ifdef DIAGNOSTIC
 448         if (seekpos >= BLCKSZ * RELSEG_SIZE)
 449                 elog(FATAL, "seekpos too big!");
 450 #endif
 451 #else
 452         seekpos = (long) (BLCKSZ * (blocknum));
 453 #endif
 454
 455         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 456                 return (SM_FAIL);
 457
 458         status = SM_SUCCESS;
 459         if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
 460                 status = SM_FAIL;
 461
 462         v->mdfd_flags |= MDFD_DIRTY;
 463
 464         return (status);
 465 }
 466
 467 /*
 468  *      mdflush() -- Synchronously write a block to disk.
 469  *
 470  *              This is exactly like mdwrite(), but doesn't return until the file
 471  *              system buffer cache has been flushed.
 472  */
 473 int
 474 mdflush(Relation reln, BlockNumber blocknum, char *buffer)
 475 {
 476         int                     status;
 477         long            seekpos;
 478         MdfdVec    *v;
 479
 480         v = _mdfd_getseg(reln, blocknum, 0);
 481
 482 #ifndef LET_OS_MANAGE_FILESIZE
 483         seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
 484 #ifdef DIAGNOSTIC
 485         if (seekpos >= BLCKSZ * RELSEG_SIZE)
 486                 elog(FATAL, "seekpos too big!");
 487 #endif
 488 #else
 489         seekpos = (long) (BLCKSZ * (blocknum));
 490 #endif
 491
 492         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 493                 return (SM_FAIL);
 494
 495         /* write and sync the block */
 496         status = SM_SUCCESS;
 497         if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
 498                 || FileSync(v->mdfd_vfd) < 0)
 499                 status = SM_FAIL;
 500
 501         /*
 502          * By here, the block is written and changes have been forced to
 503          * stable storage.      Mark the descriptor as clean until the next write,
 504          * so we don't sync it again unnecessarily at transaction commit.
 505          */
 506
 507         v->mdfd_flags &= ~MDFD_DIRTY;
 508
 509         return (status);
 510 }
 511
 512 /*
 513  *      mdblindwrt() -- Write a block to disk blind.
 514  *
 515  *              We have to be able to do this using only the name and OID of
 516  *              the database and relation in which the block belongs.  This
 517  *              is a synchronous write.
 518  */
 519 int
 520 mdblindwrt(char *dbstr,
 521                    char *relstr,
 522                    Oid dbid,
 523                    Oid relid,
 524                    BlockNumber blkno,
 525                    char *buffer)
 526 {
 527         int                     fd;
 528         int                     segno;
 529         long            seekpos;
 530         int                     status;
 531         char       *path;
 532
 533 #ifndef LET_OS_MANAGE_FILESIZE
 534         int                     nchars;
 535
 536         /* be sure we have enough space for the '.segno', if any */
 537         segno = blkno / RELSEG_SIZE;
 538         if (segno > 0)
 539                 nchars = 10;
 540         else
 541                 nchars = 0;
 542
 543         /* construct the path to the file and open it */
 544         /* system table? then put in system area... */
 545         if (dbid == (Oid) 0)
 546         {
 547                 path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
 548                 if (segno == 0)
 549                         sprintf(path, "%s/%s", DataDir, relstr);
 550                 else
 551                         sprintf(path, "%s/%s.%d", DataDir, relstr, segno);
 552         }
 553         /* user table? then put in user database area... */
 554         else if (dbid == MyDatabaseId)
 555         {
 556                 extern char *DatabasePath;
 557
 558                 path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2 + nchars);
 559                 if (segno == 0)
 560                         sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
 561                 else
 562                         sprintf(path, "%s%c%s.%d", DatabasePath, SEP_CHAR, relstr, segno);
 563         }
 564         else
 565 /* this is work arround only !!! */
 566         {
 567                 char            dbpath[MAXPGPATH + 1];
 568                 Oid                     owner,
 569                                         id;
 570                 char       *tmpPath;
 571 #ifdef MB
 572                 int        tmpEncoding;
 573 #endif
 574
 575 #ifdef MB
 576                 GetRawDatabaseInfo(dbstr, &owner, &id, dbpath, &tmpEncoding);
 577 #else
 578                 GetRawDatabaseInfo(dbstr, &owner, &id, dbpath);
 579 #endif
 580
 581                 if (id != dbid)
 582                         elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
 583                 tmpPath = ExpandDatabasePath(dbpath);
 584                 if (tmpPath == NULL)
 585                         elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
 586                 path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2 + nchars);
 587                 if (segno == 0)
 588                         sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
 589                 else
 590                         sprintf(path, "%s%c%s.%d", tmpPath, SEP_CHAR, relstr, segno);
 591                 pfree(tmpPath);
 592         }
 593 #else
 594         /* construct the path to the file and open it */
 595         /* system table? then put in system area... */
 596         if (dbid == (Oid) 0)
 597         {
 598                 path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2);
 599                 sprintf(path, "%s/%s", DataDir, relstr);
 600         }
 601         /* user table? then put in user database area... */
 602         else if (dbid == MyDatabaseId)
 603         {
 604                 extern char *DatabasePath;
 605
 606                 path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2);
 607                 sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
 608         }
 609         else
 610 /* this is work arround only !!! */
 611         {
 612                 char            dbpath[MAXPGPATH + 1];
 613                 Oid                     owner,
 614                                         id;
 615                 char       *tmpPath;
 616
 617 #ifdef MB
 618                 int        tmpEncoding;
 619 #endif
 620
 621 #ifdef MB
 622                 GetRawDatabaseInfo(dbstr, &owner, &id, dbpath, &tmpEncoding);
 623 #else
 624                 GetRawDatabaseInfo(dbstr, &owner, &id, dbpath);
 625 #endif
 626
 627                 if (id != dbid)
 628                         elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
 629                 tmpPath = ExpandDatabasePath(dbpath);
 630                 if (tmpPath == NULL)
 631                         elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
 632                 path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2);
 633                 sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
 634                 pfree(tmpPath);
 635         }
 636 #endif
 637
 638         if ((fd = open(path, O_RDWR, 0600)) < 0)
 639                 return (SM_FAIL);
 640
 641         /* seek to the right spot */
 642 #ifndef LET_OS_MANAGE_FILESIZE
 643         seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
 644 #else
 645         seekpos = (long) (BLCKSZ * (blkno));
 646 #endif
 647
 648         if (lseek(fd, seekpos, SEEK_SET) != seekpos)
 649         {
 650                 close(fd);
 651                 return (SM_FAIL);
 652         }
 653
 654         status = SM_SUCCESS;
 655
 656         /* write and sync the block */
 657         if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
 658                 status = SM_FAIL;
 659
 660         if (close(fd) < 0)
 661                 status = SM_FAIL;
 662
 663         pfree(path);
 664
 665         return (status);
 666 }
 667
 668 /*
 669  *      mdnblocks() -- Get the number of blocks stored in a relation.
 670  *
 671  *              Returns # of blocks or -1 on error.
 672  */
 673 int
 674 mdnblocks(Relation reln)
 675 {
 676         int                     fd;
 677         MdfdVec    *v;
 678         int                     nblocks;
 679         int                     segno;
 680
 681         fd = RelationGetFile(reln);
 682         v = &Md_fdvec[fd];
 683
 684 #ifndef LET_OS_MANAGE_FILESIZE
 685 #ifdef DIAGNOSTIC
 686         if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
 687                 elog(FATAL, "segment too big in getseg!");
 688 #endif
 689
 690         segno = 0;
 691         for (;;)
 692         {
 693                 if (v->mdfd_lstbcnt == RELSEG_SIZE
 694                         || (nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ)) == RELSEG_SIZE)
 695                 {
 696
 697                         v->mdfd_lstbcnt = RELSEG_SIZE;
 698                         segno++;
 699
 700                         if (v->mdfd_chain == (MdfdVec *) NULL)
 701                         {
 702                                 v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
 703                                 if (v->mdfd_chain == (MdfdVec *) NULL)
 704                                         elog(ERROR, "cannot count blocks for %s -- open failed",
 705                                                  RelationGetRelationName(reln));
 706                         }
 707
 708                         v = v->mdfd_chain;
 709                 }
 710                 else
 711                         return ((segno * RELSEG_SIZE) + nblocks);
 712         }
 713 #else
 714         return (_mdnblocks(v->mdfd_vfd, BLCKSZ));
 715 #endif
 716 }
 717
 718 /*
 719  *      mdtruncate() -- Truncate relation to specified number of blocks.
 720  *
 721  *              Returns # of blocks or -1 on error.
 722  */
 723 int
 724 mdtruncate(Relation reln, int nblocks)
 725 {
 726         int                     fd;
 727         MdfdVec    *v;
 728
 729 #ifndef LET_OS_MANAGE_FILESIZE
 730         int                     curnblk;
 731
 732         curnblk = mdnblocks(reln);
 733         if (curnblk / RELSEG_SIZE > 0)
 734         {
 735                 elog(NOTICE, "Can't truncate multi-segments relation %s",
 736                          reln->rd_rel->relname.data);
 737                 return (curnblk);
 738         }
 739 #endif
 740
 741         fd = RelationGetFile(reln);
 742         v = &Md_fdvec[fd];
 743
 744         if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
 745                 return (-1);
 746
 747         return (nblocks);
 748
 749 }       /* mdtruncate */
 750
 751 /*
 752  *      mdcommit() -- Commit a transaction.
 753  *
 754  *              All changes to magnetic disk relations must be forced to stable
 755  *              storage.  This routine makes a pass over the private table of
 756  *              file descriptors.  Any descriptors to which we have done writes,
 757  *              but not synced, are synced here.
 758  *
 759  *              Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
 760  */
 761 int
 762 mdcommit()
 763 {
 764         int                     i;
 765         MdfdVec    *v;
 766
 767         for (i = 0; i < CurFd; i++)
 768         {
 769 #ifndef LET_OS_MANAGE_FILESIZE
 770                 for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
 771 #else
 772                 v = &Md_fdvec[i];
 773                 if (v != (MdfdVec *) NULL)
 774 #endif
 775                 {
 776                         if (v->mdfd_flags & MDFD_DIRTY)
 777                         {
 778                                 if (FileSync(v->mdfd_vfd) < 0)
 779                                         return (SM_FAIL);
 780
 781                                 v->mdfd_flags &= ~MDFD_DIRTY;
 782                         }
 783                 }
 784         }
 785
 786         return (SM_SUCCESS);
 787 }
 788
 789 /*
 790  *      mdabort() -- Abort a transaction.
 791  *
 792  *              Changes need not be forced to disk at transaction abort.  We mark
 793  *              all file descriptors as clean here.  Always returns SM_SUCCESS.
 794  */
 795 int
 796 mdabort()
 797 {
 798         int                     i;
 799         MdfdVec    *v;
 800
 801         for (i = 0; i < CurFd; i++)
 802         {
 803 #ifndef LET_OS_MANAGE_FILESIZE
 804                 for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
 805 #else
 806                 v = &Md_fdvec[i];
 807                 if (v != (MdfdVec *) NULL)
 808 #endif
 809                         v->mdfd_flags &= ~MDFD_DIRTY;
 810         }
 811
 812         return (SM_SUCCESS);
 813 }
 814
 815 /*
 816  *      _fdvec_alloc () -- grab a free (or new) md file descriptor vector.
 817  *
 818  */
 819 static
 820 int
 821 _fdvec_alloc()
 822 {
 823         MdfdVec    *nvec;
 824         int                     fdvec,
 825                                 i;
 826         MemoryContext oldcxt;
 827
 828         if (Md_Free >= 0)                       /* get from free list */
 829         {
 830                 fdvec = Md_Free;
 831                 Md_Free = Md_fdvec[fdvec].mdfd_nextFree;
 832                 Assert(Md_fdvec[fdvec].mdfd_flags == MDFD_FREE);
 833                 Md_fdvec[fdvec].mdfd_flags = 0;
 834                 if (fdvec >= CurFd)
 835                 {
 836                         Assert(fdvec == CurFd);
 837                         CurFd++;
 838                 }
 839                 return (fdvec);
 840         }
 841
 842         /* Must allocate more room */
 843
 844         if (Nfds != CurFd)
 845                 elog(FATAL, "_fdvec_alloc error");
 846
 847         Nfds *= 2;
 848
 849         oldcxt = MemoryContextSwitchTo(MdCxt);
 850
 851         nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
 852         MemSet(nvec, 0, Nfds * sizeof(MdfdVec));
 853         memmove(nvec, (char *) Md_fdvec, CurFd * sizeof(MdfdVec));
 854         pfree(Md_fdvec);
 855
 856         MemoryContextSwitchTo(oldcxt);
 857
 858         Md_fdvec = nvec;
 859
 860         /* Set new free list */
 861         for (i = CurFd; i < Nfds; i++)
 862         {
 863                 Md_fdvec[i].mdfd_nextFree = i + 1;
 864                 Md_fdvec[i].mdfd_flags = MDFD_FREE;
 865         }
 866         Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
 867         Md_Free = CurFd + 1;
 868
 869         fdvec = CurFd;
 870         CurFd++;
 871         Md_fdvec[fdvec].mdfd_flags = 0;
 872
 873         return (fdvec);
 874 }
 875
 876 /*
 877  *      _fdvec_free () -- free md file descriptor vector.
 878  *
 879  */
 880 static
 881 void
 882 _fdvec_free(int fdvec)
 883 {
 884
 885         Assert(Md_Free < 0 || Md_fdvec[Md_Free].mdfd_flags == MDFD_FREE);
 886         Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
 887         Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
 888         Md_Free = fdvec;
 889
 890 }
 891
 892 static MdfdVec *
 893 _mdfd_openseg(Relation reln, int segno, int oflags)
 894 {
 895         MemoryContext oldcxt;
 896         MdfdVec    *v;
 897         int                     fd;
 898         bool            dofree;
 899         char       *path,
 900                            *fullpath;
 901
 902         /* be sure we have enough space for the '.segno', if any */
 903         path = relpath(RelationGetRelationName(reln)->data);
 904
 905         dofree = false;
 906         if (segno > 0)
 907         {
 908                 dofree = true;
 909                 fullpath = (char *) palloc(strlen(path) + 12);
 910                 sprintf(fullpath, "%s.%d", path, segno);
 911         }
 912         else
 913                 fullpath = path;
 914
 915         /* open the file */
 916         fd = PathNameOpenFile(fullpath, O_RDWR | oflags, 0600);
 917
 918         if (dofree)
 919                 pfree(fullpath);
 920
 921         if (fd < 0)
 922                 return ((MdfdVec *) NULL);
 923
 924         /* allocate an mdfdvec entry for it */
 925         oldcxt = MemoryContextSwitchTo(MdCxt);
 926         v = (MdfdVec *) palloc(sizeof(MdfdVec));
 927         MemoryContextSwitchTo(oldcxt);
 928
 929         /* fill the entry */
 930         v->mdfd_vfd = fd;
 931         v->mdfd_flags = (uint16) 0;
 932         v->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
 933 #ifndef LET_OS_MANAGE_FILESIZE
 934         v->mdfd_chain = (MdfdVec *) NULL;
 935
 936 #ifdef DIAGNOSTIC
 937         if (v->mdfd_lstbcnt > RELSEG_SIZE)
 938                 elog(FATAL, "segment too big on open!");
 939 #endif
 940 #endif
 941
 942         /* all done */
 943         return (v);
 944 }
 945
 946 static MdfdVec *
 947 _mdfd_getseg(Relation reln, int blkno, int oflag)
 948 {
 949         MdfdVec    *v;
 950         int                     segno;
 951         int                     fd;
 952         int                     i;
 953
 954         fd = RelationGetFile(reln);
 955         if (fd < 0)
 956         {
 957                 if ((fd = mdopen(reln)) < 0)
 958                         elog(ERROR, "cannot open relation %s",
 959                                  RelationGetRelationName(reln));
 960                 reln->rd_fd = fd;
 961         }
 962
 963 #ifndef LET_OS_MANAGE_FILESIZE
 964         for (v = &Md_fdvec[fd], segno = blkno / RELSEG_SIZE, i = 1;
 965                  segno > 0;
 966                  i++, segno--)
 967         {
 968
 969                 if (v->mdfd_chain == (MdfdVec *) NULL)
 970                 {
 971                         v->mdfd_chain = _mdfd_openseg(reln, i, oflag);
 972
 973                         if (v->mdfd_chain == (MdfdVec *) NULL)
 974                                 elog(ERROR, "cannot open segment %d of relation %s",
 975                                          i, RelationGetRelationName(reln));
 976                 }
 977                 v = v->mdfd_chain;
 978         }
 979 #else
 980         v = &Md_fdvec[fd];
 981 #endif
 982
 983         return (v);
 984 }
 985
 986 static BlockNumber
 987 _mdnblocks(File file, Size blcksz)
 988 {
 989         long            len;
 990
 991         len = FileSeek(file, 0L, SEEK_END) - 1;
 992         return ((BlockNumber) ((len < 0) ? 0 : 1 + len / blcksz));
 993 }