1 /*-------------------------------------------------------------------------
4 * This code manages relations that reside on magnetic disk.
6 * Copyright (c) 1994, Regents of the University of California
10 * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.3 1996/07/15 19:22:12 scrappy Exp $
12 *-------------------------------------------------------------------------
14 #include <stdio.h> /* for sprintf() */
18 #include "miscadmin.h" /* for DataDir */
21 #include "storage/smgr.h" /* where the declarations go */
22 #include "storage/block.h"
23 #include "storage/fd.h"
24 #include "utils/mcxt.h"
25 #include "utils/rel.h"
26 #include "utils/elog.h"
27 #include "utils/palloc.h"
28 #include "catalog/catalog.h"
33 * The magnetic disk storage manager keeps track of open file descriptors
34 * in its own descriptor pool. This happens for two reasons. First, at
35 * transaction boundaries, we walk the list of descriptors and flush
36 * anything that we've dirtied in the current transaction. Second, we
37 * have to support relations of > 4GBytes. In order to do this, we break
38 * relations up into chunks of < 2GBytes and store one chunk in each of
39 * several files that represent the relation.
/*
 * Per-file descriptor record.  One of these describes a single open file
 * belonging to a relation; the extra files of a large relation hang off
 * mdfd_chain as a singly linked list.
 * NOTE(review): the closing line of this typedef is not visible in this
 * listing.
 */
42 typedef struct _MdfdVec {
43 int mdfd_vfd; /* fd number in vfd pool */
44 uint16 mdfd_flags; /* clean, dirty */
45 int mdfd_lstbcnt; /* most recent block count */
46 struct _MdfdVec *mdfd_chain; /* for large relations */
/* Number of entries the Md_fdvec table is allocated for. */
49 static int Nfds = 100;
/* Table of descriptor entries, allocated while MdCxt is the current context. */
50 static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
/* Memory context used for Md_fdvec and for chained segment entries. */
52 static MemoryContext MdCxt;
/* Bit in mdfd_flags: the file was written to and has not been synced since. */
54 #define MDFD_DIRTY (uint16) 0x01
/* Upper limit on the number of blocks kept in one segment file. */
56 #define RELSEG_SIZE 262144 /* (2 ** 31) / 8192 -- 2GB file */
58 /* routines declared here */
/* open segment file number segno of reln and return a new entry for it */
59 static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
/* return the entry for the segment that contains block blkno */
60 static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
/* extend the descriptor table; callers compare the result to SM_FAIL */
61 static int _fdvec_ext(void);
/* number of blcksz-sized blocks in file, derived from the file length */
62 static BlockNumber _mdnblocks(File file, Size blcksz);
65 * mdinit() -- Initialize private state for magnetic disk storage manager.
67 * We keep a private table of all file descriptors. Whenever we do
68 * a write to one, we mark it dirty in our table. Whenever we force
69 * changes to disk, we mark the file descriptor clean. At transaction
70 * commit, we force changes to disk for all dirty file descriptors.
71 * This routine allocates and initializes the table.
73 * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
/*
 * NOTE(review): the function header, local declarations and return
 * statements are not visible in this listing; the lines below are assumed
 * to be the body of mdinit().
 */
/* create the memory context that will own the descriptor table */
80 MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
81 if (MdCxt == (MemoryContext) NULL)
/* allocate the table with MdCxt current, then restore the previous context */
84 oldcxt = MemoryContextSwitchTo(MdCxt);
85 Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
86 (void) MemoryContextSwitchTo(oldcxt);
88 if (Md_fdvec == (MdfdVec *) NULL)
/* every entry starts out zeroed */
91 memset(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
/*
 * Create the disk file for reln and record it in the Md_fdvec slot indexed
 * by CurFd (CurFd is defined outside the visible lines).
 * NOTE(review): several lines of this function, including its local
 * declarations and return statements, are not visible in this listing.
 */
97 mdcreate(Relation reln)
102 extern bool IsBootstrapProcessingMode();
/* first attempt: O_CREAT|O_EXCL requests an exclusive create */
104 path = relpath(&(reln->rd_rel->relname.data[0]));
105 fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);
108 * If the file already exists and is empty, we pretend that the
109 * create succeeded. During bootstrap processing, we skip that check,
110 * because pg_time, pg_variable, and pg_log get created before their
111 * .bki file entries are processed.
/* reopen without O_CREAT; a nonzero FileRead result is the non-empty case */
115 if ((fd = FileNameOpenFile(path, O_RDWR, 0600)) >= 0) {
116 if (!IsBootstrapProcessingMode() &&
117 FileRead(fd, (char *) &tmp, sizeof(tmp)) != 0) {
/* make sure the descriptor table has room before filling in a slot */
125 if (_fdvec_ext() == SM_FAIL)
/* the new entry starts clean, unchained, and with a block count of zero */
129 Md_fdvec[CurFd].mdfd_vfd = fd;
130 Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
131 Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
132 Md_fdvec[CurFd].mdfd_lstbcnt = 0;
140 * mdunlink() -- Unlink a relation.
/*
 * Remove the file of reln plus its numbered overflow files, then reset the
 * Md_fdvec entry for the relation.
 * NOTE(review): loop headers, several declarations and the return
 * statements are not visible in this listing.
 */
143 mdunlink(Relation reln)
148 MemoryContext oldcxt;
/*
 * NOTE(review): fname holds exactly NAMEDATALEN bytes and the strncpy below
 * copies up to NAMEDATALEN bytes, so a relation name that fills the whole
 * buffer leaves fname without a terminating NUL before it is used as a
 * string by FileNameUnlink and sprintf -- confirm whether names can be that
 * long and, if so, size fname as NAMEDATALEN + 1.
 */
149 char fname[NAMEDATALEN];
150 char tname[NAMEDATALEN+10]; /* leave room for overflow suffixes*/
152 /* On Windows NT you can't unlink a file if it is open so we have
156 (void) mdclose(reln);
160 memset(fname,0, NAMEDATALEN);
161 strncpy(fname, RelationGetRelationName(reln)->data, NAMEDATALEN);
163 if (FileNameUnlink(fname) < 0)
166 /* unlink all the overflow files for large relations */
169 (void) mdclose(reln);
/* overflow files are named by appending a dot and the number i to fname */
171 sprintf(tname, "%s.%d", fname, i);
172 if (FileNameUnlink(tname) < 0)
176 /* finally, clean out the mdfd vector */
177 fd = RelationGetFile(reln);
178 Md_fdvec[fd].mdfd_flags = (uint16) 0;
/*
 * Walk the chain starting at the table slot.  The slot itself is part of
 * the Md_fdvec array, so it is excluded by the ov test below; what is done
 * with the other entries is on lines not visible here (presumably they are
 * freed -- confirm).
 */
180 oldcxt = MemoryContextSwitchTo(MdCxt);
181 for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; ) {
184 if (ov != &Md_fdvec[fd])
/* the table slot no longer points at any overflow entries */
187 Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
188 (void) MemoryContextSwitchTo(oldcxt);
194 * mdextend() -- Add a block to the specified relation.
196 * This routine returns SM_FAIL or SM_SUCCESS, with errno set as
/*
 * Append one block, taken from buffer, to the end of reln.
 * NOTE(review): local declarations and the returns that follow the seek
 * and write checks are not visible in this listing.
 */
200 mdextend(Relation reln, char *buffer)
/*
 * Find the segment that holds the first block past the current end.
 * O_CREAT is passed down so that a missing segment file can be created.
 */
206 nblocks = mdnblocks(reln);
207 v = _mdfd_getseg(reln, nblocks, O_CREAT);
/* position at the end of that segment file and write BLCKSZ bytes there */
209 if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0)
212 if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
215 /* remember that we did a write, so we can sync at xact commit */
216 v->mdfd_flags |= MDFD_DIRTY;
218 /* try to keep the last block count current, though it's just a hint */
/* an exactly full segment is recorded as RELSEG_SIZE rather than as 0 */
219 if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
220 v->mdfd_lstbcnt = RELSEG_SIZE;
/* sanity check: neither the file nor the hint may exceed one segment */
223 if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
224 || v->mdfd_lstbcnt > RELSEG_SIZE)
225 elog(FATAL, "segment too big!");
232 * mdopen() -- Open the specified relation.
/*
 * Open the file of reln and fill in the Md_fdvec slot indexed by CurFd
 * (CurFd is defined outside the visible lines).
 * NOTE(review): the return statements and the condition guarding the
 * second open attempt are not visible in this listing.
 */
235 mdopen(Relation reln)
/* extend the descriptor table first; give up if that fails */
242 if (_fdvec_ext() == SM_FAIL)
246 path = relpath(&(reln->rd_rel->relname.data[0]));
248 fd = FileNameOpenFile(path, O_RDWR, 0600);
250 /* this should only happen during bootstrap processing */
252 fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);
/* record the descriptor; the block count hint comes from the file length */
254 Md_fdvec[CurFd].mdfd_vfd = fd;
255 Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
256 Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
257 Md_fdvec[CurFd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
/* a single file must not hold more than RELSEG_SIZE blocks */
260 if (Md_fdvec[CurFd].mdfd_lstbcnt > RELSEG_SIZE)
261 elog(FATAL, "segment too big on relopen!");
270 * mdclose() -- Close the specified relation.
272 * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
/*
 * Sync and close the files of reln and mark their entries clean.
 * NOTE(review): local declarations, the test that goes with the
 * already-closed remark, and the return statement are not visible in this
 * listing.
 */
275 mdclose(Relation reln)
280 fd = RelationGetFile(reln);
/* visit the table slot and then every entry chained behind it */
282 for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
284 /* may be closed already */
289 * We sync the file descriptor so that we don't need to reopen it at
290 * transaction commit to force changes to disk.
293 FileSync(v->mdfd_vfd);
294 FileClose(v->mdfd_vfd);
296 /* mark this file descriptor as clean in our private table */
297 v->mdfd_flags &= ~MDFD_DIRTY;
304 * mdread() -- Read the specified block from a relation.
306 * Returns SM_SUCCESS or SM_FAIL.
/*
 * Read block blocknum of reln into buffer, which must hold BLCKSZ bytes.
 * NOTE(review): local declarations, the failure return after the seek
 * check and the final return are not visible in this listing.
 */
309 mdread(Relation reln, BlockNumber blocknum, char *buffer)
/* oflag 0: no extra open flags, so a missing segment is not created */
316 v = _mdfd_getseg(reln, blocknum, 0);
/* byte offset of the block inside its segment file */
318 seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
/*
 * NOTE(review): if BLCKSZ is the int constant 8192 (as the RELSEG_SIZE
 * comment suggests), BLCKSZ * RELSEG_SIZE equals 2 ** 31 and overflows a
 * 32-bit int -- confirm the type of BLCKSZ.
 */
321 if (seekpos >= BLCKSZ * RELSEG_SIZE)
322 elog(FATAL, "seekpos too big!");
325 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
/*
 * A read that does not return BLCKSZ bytes can end with the buffer being
 * zero-filled; the condition that guards the memset is on a line not
 * visible here.
 */
330 if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) {
332 memset(buffer, 0, BLCKSZ);
342 * mdwrite() -- Write the supplied block at the appropriate location.
344 * Returns SM_SUCCESS or SM_FAIL.
/*
 * Write BLCKSZ bytes from buffer over block blocknum of reln and flag the
 * file as dirty; no sync is visible here.
 * NOTE(review): local declarations and the return statements are not
 * visible in this listing.
 */
347 mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
/* oflag 0: no extra open flags, so a missing segment is not created */
353 v = _mdfd_getseg(reln, blocknum, 0);
/* byte offset of the block inside its segment file */
355 seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
/*
 * NOTE(review): BLCKSZ * RELSEG_SIZE may overflow a 32-bit int if BLCKSZ
 * is the int constant 8192 -- confirm.
 */
357 if (seekpos >= BLCKSZ * RELSEG_SIZE)
358 elog(FATAL, "seekpos too big!");
361 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
366 if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
/* flagged entries are the ones the commit pass syncs */
369 v->mdfd_flags |= MDFD_DIRTY;
375 * mdflush() -- Synchronously write a block to disk.
377 * This is exactly like mdwrite(), but doesn't return until the file
378 * system buffer cache has been flushed.
/*
 * Write BLCKSZ bytes from buffer over block blocknum of reln, sync the
 * file, and clear its dirty flag.
 * NOTE(review): local declarations and the return statements are not
 * visible in this listing.
 */
381 mdflush(Relation reln, BlockNumber blocknum, char *buffer)
/* oflag 0: no extra open flags, so a missing segment is not created */
387 v = _mdfd_getseg(reln, blocknum, 0);
/* byte offset of the block inside its segment file */
389 seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
/*
 * NOTE(review): BLCKSZ * RELSEG_SIZE may overflow a 32-bit int if BLCKSZ
 * is the int constant 8192 -- confirm.
 */
391 if (seekpos >= BLCKSZ * RELSEG_SIZE)
392 elog(FATAL, "seekpos too big!");
395 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
399 /* write and sync the block */
/* the sync is only attempted when the write transferred a full block */
401 if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
402 || FileSync(v->mdfd_vfd) < 0)
406 * By here, the block is written and changes have been forced to stable
407 * storage. Mark the descriptor as clean until the next write, so we
408 * don't sync it again unnecessarily at transaction commit.
411 v->mdfd_flags &= ~MDFD_DIRTY;
417 * mdblindwrt() -- Write a block to disk blind.
419 * We have to be able to do this using only the name and OID of
420 * the database and relation in which the block belongs. This
421 * is a synchronous write.
/*
 * Write one block given only names and ids.  The visible code builds the
 * path by hand and uses open(), lseek(), write() and fsync() directly
 * instead of the File routines used elsewhere in this file.
 * NOTE(review): the rest of the parameter list, the local declarations,
 * the computation of nchars and the return statements are not visible in
 * this listing.
 * NOTE(review): no close of fd is visible here -- confirm that every exit
 * path closes the descriptor.
 */
424 mdblindwrt(char *dbstr,
438 /* be sure we have enough space for the '.segno', if any */
439 segno = blkno / RELSEG_SIZE;
445 /* construct the path to the file and open it */
/*
 * dbid 0: the file sits directly under DataDir; otherwise it sits under
 * DataDir/base/dbstr.  Each case has a form without and a form with a
 * trailing segment number; the test choosing between them is not visible.
 */
446 if (dbid == (Oid) 0) {
447 path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
449 sprintf(path, "%s/%.*s", DataDir, NAMEDATALEN, relstr);
451 sprintf(path, "%s/%.*s.%d", DataDir, NAMEDATALEN, relstr, segno);
453 path = (char *) palloc(strlen(DataDir) + strlen("/base/") + 2 * sizeof(NameData) + 2 + nchars);
455 sprintf(path, "%s/base/%.*s/%.*s", DataDir, NAMEDATALEN,
456 dbstr, NAMEDATALEN, relstr);
458 sprintf(path, "%s/base/%.*s/%.*s.%d", DataDir, NAMEDATALEN, dbstr,
459 NAMEDATALEN, relstr, segno);
/* no O_CREAT: the file must already exist */
462 if ((fd = open(path, O_RDWR, 0600)) < 0)
465 /* seek to the right spot */
466 seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
467 if (lseek(fd, seekpos, SEEK_SET) != seekpos) {
474 /* write and sync the block */
/* builds with OPENLINK_PATCHES sync through pg_fsync instead of fsync */
475 #ifdef OPENLINK_PATCHES
476 if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
478 if (write(fd, buffer, BLCKSZ) != BLCKSZ || fsync(fd) < 0)
491 * mdnblocks() -- Get the number of blocks stored in a relation.
493 * Returns # of blocks or -1 on error.
/*
 * Count the blocks of reln by walking its chain of segment entries.
 * NOTE(review): local declarations, the loop structure and the lines that
 * advance v and segno are not visible in this listing.
 */
496 mdnblocks(Relation reln)
503 fd = RelationGetFile(reln);
/* sanity check on the size of the current segment file */
507 if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
508 elog(FATAL, "segment too big in getseg!");
/* full segment, according to either the cached hint or the file length */
513 if (v->mdfd_lstbcnt == RELSEG_SIZE
514 || (nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ)) == RELSEG_SIZE) {
516 v->mdfd_lstbcnt = RELSEG_SIZE;
/*
 * Open the following segment if it is not chained yet.
 * NOTE(review): O_CREAT is passed, so counting blocks can create a new
 * segment file -- confirm this side effect is intended.
 */
519 if (v->mdfd_chain == (MdfdVec *) NULL) {
520 v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
521 if (v->mdfd_chain == (MdfdVec *) NULL)
522 elog(WARN, "cannot count blocks for %.16s -- open failed",
523 RelationGetRelationName(reln));
/* blocks in the segno full segments plus those counted in the last one */
528 return ((segno * RELSEG_SIZE) + nblocks);
534 * mdcommit() -- Commit a transaction.
536 * All changes to magnetic disk relations must be forced to stable
537 * storage. This routine makes a pass over the private table of
538 * file descriptors. Any descriptors to which we have done writes,
539 * but not synced, are synced here.
541 * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
/*
 * NOTE(review): the function header, local declarations and return
 * statements are not visible in this listing; the lines below are assumed
 * to be the body of mdcommit().
 */
/* visit table slots 0 .. CurFd - 1 and every entry chained behind each */
549 for (i = 0; i < CurFd; i++) {
550 for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
/* only entries written to since their last sync need work */
551 if (v->mdfd_flags & MDFD_DIRTY) {
552 if (FileSync(v->mdfd_vfd) < 0)
555 v->mdfd_flags &= ~MDFD_DIRTY;
564 * mdabort() -- Abort a transaction.
566 * Changes need not be forced to disk at transaction abort. We mark
567 * all file descriptors as clean here. Always returns SM_SUCCESS.
/*
 * NOTE(review): the function header, local declarations and return
 * statement are not visible in this listing; the lines below are assumed
 * to be the body of mdabort().
 */
/* clear the dirty flag on every entry without syncing anything */
575 for (i = 0; i < CurFd; i++) {
576 for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
577 v->mdfd_flags &= ~MDFD_DIRTY;
585 * _fdvec_ext() -- Extend the md file descriptor vector.
587 * The file descriptor vector must be large enough to hold at least
/*
 * NOTE(review): the function header, the test that decides whether the
 * table must be extended, the handling of the old table, the installation
 * of nvec and the return statements are not visible in this listing; the
 * lines below are assumed to be the body of _fdvec_ext().
 */
594 MemoryContext oldcxt;
/* the replacement table is allocated with MdCxt as the current context */
598 oldcxt = MemoryContextSwitchTo(MdCxt);
600 nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
601 memset(nvec, 0, Nfds * sizeof(MdfdVec));
/*
 * Copy the old entries into the front of the zeroed new table.
 * NOTE(review): the count of Nfds / 2 implies that Nfds was doubled on a
 * line not visible here -- confirm.
 */
602 memmove(nvec, (char *) Md_fdvec, (Nfds / 2) * sizeof(MdfdVec));
605 (void) MemoryContextSwitchTo(oldcxt);
/*
 * Open one segment file of reln and return a freshly allocated entry that
 * describes it, or NULL (the condition for the NULL return is on a line
 * not visible here; presumably a failed open).
 * NOTE(review): some declarations, the test that decides whether a
 * segment suffix is appended, the assignment of mdfd_vfd and the final
 * return are not visible in this listing.
 */
613 _mdfd_openseg(Relation reln, int segno, int oflags)
615 MemoryContext oldcxt;
619 char *path, *fullpath;
621 /* be sure we have enough space for the '.segno', if any */
622 path = relpath(RelationGetRelationName(reln)->data);
/* 12 extra bytes: a dot, up to ten digits, and the terminating NUL */
627 fullpath = (char *) palloc(strlen(path) + 12);
628 sprintf(fullpath, "%s.%d", path, segno);
/* caller-supplied flags such as O_CREAT are added to O_RDWR */
633 fd = PathNameOpenFile(fullpath, O_RDWR|oflags, 0600);
639 return ((MdfdVec *) NULL);
641 /* allocate an mdfdvec entry for it */
642 oldcxt = MemoryContextSwitchTo(MdCxt);
643 v = (MdfdVec *) palloc(sizeof(MdfdVec));
644 (void) MemoryContextSwitchTo(oldcxt);
/* new entry: clean, end of chain, block count taken from the file length */
648 v->mdfd_flags = (uint16) 0;
649 v->mdfd_chain = (MdfdVec *) NULL;
650 v->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
/* a single segment file must not hold more than RELSEG_SIZE blocks */
653 if (v->mdfd_lstbcnt > RELSEG_SIZE)
654 elog(FATAL, "segment too big on open!");
/*
 * Return the entry for the segment of reln that contains block blkno,
 * opening chained segments with oflag along the way as needed.
 * NOTE(review): local declarations, the test that triggers mdopen, the
 * rest of the loop header and the return statement are not visible in
 * this listing.
 */
662 _mdfd_getseg(Relation reln, int blkno, int oflag)
669 fd = RelationGetFile(reln);
/* open the relation here when needed; the guarding test is not visible */
671 if ((fd = mdopen(reln)) < 0)
672 elog(WARN, "cannot open relation %.16s",
673 RelationGetRelationName(reln));
/* start at the table slot; segno is the index of the wanted segment */
677 for (v = &Md_fdvec[fd], segno = blkno / RELSEG_SIZE, i = 1;
/* segment i is not chained yet: open it and link it in */
681 if (v->mdfd_chain == (MdfdVec *) NULL) {
682 v->mdfd_chain = _mdfd_openseg(reln, i, oflag);
684 if (v->mdfd_chain == (MdfdVec *) NULL)
685 elog(WARN, "cannot open segment %d of relation %.16s",
686 i, RelationGetRelationName(reln));
/*
 * Return the number of blcksz-sized blocks in file, computed from the
 * offset that a seek to the end of the file reports.
 * NOTE(review): the declaration of len is not visible in this listing.
 */
695 _mdnblocks(File file, Size blcksz)
/* len is the offset of the last byte; it is negative for an empty file */
699 len = FileSeek(file, 0L, SEEK_END) - 1;
/* rounds up, so a partial trailing block counts; negative len yields 0 */
700 return((BlockNumber)((len < 0) ? 0 : 1 + len / blcksz));