1 /*-------------------------------------------------------------------------
4 * This code manages relations that reside on magnetic disk.
6 * Copyright (c) 1994, Regents of the University of California
10 * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.6 1996/11/03 05:07:55 scrappy Exp $
12 *-------------------------------------------------------------------------
14 #include <stdio.h> /* for sprintf() */
18 #include "miscadmin.h" /* for DataDir */
20 #include "storage/block.h"
21 #include "storage/smgr.h" /* where the declarations go */
22 #include "storage/fd.h"
23 #include "utils/mcxt.h"
24 #include "utils/rel.h"
25 #include "utils/palloc.h"
26 #include "catalog/catalog.h"
31 * The magnetic disk storage manager keeps track of open file descriptors
32 * in its own descriptor pool. This happens for two reasons. First, at
33 * transaction boundaries, we walk the list of descriptors and flush
34 * anything that we've dirtied in the current transaction. Second, we
35 * have to support relations of > 4GBytes. In order to do this, we break
36 * relations up into chunks of < 2GBytes and store one chunk in each of
37 * several files that represent the relation.
/*
 * One entry of the private descriptor table.  The first entry for a
 * relation lives in the Md_fdvec array; further segment files of the same
 * relation hang off mdfd_chain as separately allocated entries.
 * NOTE(review): the closing line of this typedef is not part of this listing.
 */
40 typedef struct _MdfdVec {
41 int mdfd_vfd; /* fd number in vfd pool */
42 uint16 mdfd_flags; /* clean, dirty */
43 int mdfd_lstbcnt; /* most recent block count */
44 struct _MdfdVec *mdfd_chain; /* for large relations */
/*
 * File-scope state.  Nfds is the number of entries allocated for Md_fdvec,
 * and MdCxt is the memory context in which the table and the chained
 * segment entries are allocated.
 * NOTE(review): CurFd is used throughout the file but its declaration is
 * not part of this listing.
 */
47 static int Nfds = 100;
48 static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
50 static MemoryContext MdCxt;
/* set in mdfd_flags after a write, cleared once the file has been synced */
52 #define MDFD_DIRTY (uint16) 0x01
/* number of blocks stored in one segment file of a relation */
54 #define RELSEG_SIZE 262144 /* (2 ** 31) / 8192 -- 2GB file */
56 /* routines declared here */
57 static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
58 static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
59 static int _fdvec_ext(void);
60 static BlockNumber _mdnblocks(File file, Size blcksz);
63 * mdinit() -- Initialize private state for magnetic disk storage manager.
65 * We keep a private table of all file descriptors. Whenever we do
66 * a write to one, we mark it dirty in our table. Whenever we force
67 * changes to disk, we mark the file descriptor clean. At transaction
68 * commit, we force changes to disk for all dirty file descriptors.
69 * This routine allocates and initializes the table.
71 * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
/*
 * mdinit body: create the "MdSmgr" global memory context, allocate the
 * Nfds-entry descriptor table inside it and zero the table.
 * NOTE(review): the function header, the bodies of both NULL checks and
 * the final return are not part of this listing.
 */
78 MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
79 if (MdCxt == (MemoryContext) NULL)
82 oldcxt = MemoryContextSwitchTo(MdCxt);
83 Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
84 (void) MemoryContextSwitchTo(oldcxt);
86 if (Md_fdvec == (MdfdVec *) NULL)
/* every slot starts with flags 0 and a NULL chain */
89 memset(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
/*
 * mdcreate -- create the first file of a relation and register it in
 * Md_fdvec[CurFd].
 *
 * The file is opened with O_CREAT|O_EXCL and mode 0600 under the path
 * returned by relpath() for the relation name.  A second open without
 * O_CREAT handles an already existing file; outside bootstrap mode a
 * nonzero FileRead result is tested, which per the comment below means
 * the existing file is not empty.
 * NOTE(review): the conditions guarding the second open, the error
 * returns and the update of CurFd are not part of this listing.
 */
95 mdcreate(Relation reln)
100 extern bool IsBootstrapProcessingMode();
102 path = relpath(&(reln->rd_rel->relname.data[0]));
103 fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);
106 * If the file already exists and is empty, we pretend that the
107 * create succeeded. During bootstrap processing, we skip that check,
108 * because pg_time, pg_variable, and pg_log get created before their
109 * .bki file entries are processed.
113 if ((fd = FileNameOpenFile(path, O_RDWR, 0600)) >= 0) {
114 if (!IsBootstrapProcessingMode() &&
115 FileRead(fd, (char *) &tmp, sizeof(tmp)) != 0) {
/* make sure the descriptor table has room before the slot is filled */
123 if (_fdvec_ext() == SM_FAIL)
/* fresh entry: nothing written yet, single segment, zero blocks */
127 Md_fdvec[CurFd].mdfd_vfd = fd;
128 Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
129 Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
130 Md_fdvec[CurFd].mdfd_lstbcnt = 0;
138 * mdunlink() -- Unlink a relation.
/*
 * mdunlink -- remove the files of a relation and reset its table entry.
 *
 * The relation is closed first, its name is copied into fname, the base
 * file is unlinked, then overflow files named fname.N are unlinked, and
 * finally the Md_fdvec entry and its chain are cleaned up while MdCxt is
 * the current memory context.
 *
 * NOTE(review): fname holds NAMEDATALEN bytes and strncpy copies up to
 * NAMEDATALEN bytes, so a name that fills the whole field leaves fname
 * without a terminating NUL before it is passed to FileNameUnlink and
 * sprintf -- confirm that relation names are always shorter.
 * NOTE(review): the bare relation name is unlinked here, while mdcreate
 * and mdopen go through relpath() -- confirm this is intended.
 * NOTE(review): the overflow loop, the chain-walk body and the returns
 * are not part of this listing.
 */
141 mdunlink(Relation reln)
146 MemoryContext oldcxt;
147 char fname[NAMEDATALEN];
148 char tname[NAMEDATALEN+10]; /* leave room for overflow suffixes*/
150 /* On Windows NT you can't unlink a file if it is open so we have
154 (void) mdclose(reln);
158 memset(fname,0, NAMEDATALEN);
159 strncpy(fname, RelationGetRelationName(reln)->data, NAMEDATALEN);
161 if (FileNameUnlink(fname) < 0)
164 /* unlink all the overflow files for large relations */
167 (void) mdclose(reln);
/* overflow segment files carry a numeric suffix after a dot */
169 sprintf(tname, "%s.%d", fname, i);
170 if (FileNameUnlink(tname) < 0)
174 /* finally, clean out the mdfd vector */
175 fd = RelationGetFile(reln);
176 Md_fdvec[fd].mdfd_flags = (uint16) 0;
/* the head entry lives in the Md_fdvec array, so it is skipped below */
178 oldcxt = MemoryContextSwitchTo(MdCxt);
179 for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; ) {
182 if (ov != &Md_fdvec[fd])
185 Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
186 (void) MemoryContextSwitchTo(oldcxt);
192 * mdextend() -- Add a block to the specified relation.
194 * This routine returns SM_FAIL or SM_SUCCESS, with errno set as
/*
 * mdextend -- append one BLCKSZ-byte block from buffer to the relation.
 *
 * The current block count selects the segment to extend (created on
 * demand via O_CREAT); the block is written at the end of that segment
 * file and the entry is marked dirty.
 * NOTE(review): the failure returns and any later use of pos are not
 * part of this listing.
 */
198 mdextend(Relation reln, char *buffer)
204 nblocks = mdnblocks(reln);
205 v = _mdfd_getseg(reln, nblocks, O_CREAT);
207 if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0)
210 if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
213 /* remember that we did a write, so we can sync at xact commit */
214 v->mdfd_flags |= MDFD_DIRTY;
216 /* try to keep the last block count current, though it's just a hint */
/* a remainder of 0 means the segment is exactly full, so store RELSEG_SIZE */
217 if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
218 v->mdfd_lstbcnt = RELSEG_SIZE;
/* sanity check: neither the file nor the hint may exceed one segment */
221 if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
222 || v->mdfd_lstbcnt > RELSEG_SIZE)
223 elog(FATAL, "segment too big!");
230 * mdopen() -- Open the specified relation.
/*
 * mdopen -- open the first file of a relation and register it in
 * Md_fdvec[CurFd].
 *
 * The descriptor table is extended first, then the file at relpath() of
 * the relation name is opened read/write.  A second open with
 * O_CREAT|O_EXCL exists as a fallback, and the block count hint is taken
 * from the file size.
 * NOTE(review): the condition guarding the fallback open, the error
 * returns and the update of CurFd are not part of this listing.
 */
233 mdopen(Relation reln)
240 if (_fdvec_ext() == SM_FAIL)
244 path = relpath(&(reln->rd_rel->relname.data[0]));
246 fd = FileNameOpenFile(path, O_RDWR, 0600);
248 /* this should only happen during bootstrap processing */
250 fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);
252 Md_fdvec[CurFd].mdfd_vfd = fd;
253 Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
254 Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
255 Md_fdvec[CurFd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
/* sanity check: a single segment file may hold at most RELSEG_SIZE blocks */
258 if (Md_fdvec[CurFd].mdfd_lstbcnt > RELSEG_SIZE)
259 elog(FATAL, "segment too big on relopen!");
268 * mdclose() -- Close the specified relation.
270 * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
/*
 * mdclose -- sync and close every segment file of a relation.
 *
 * Walks the chain that starts at the relation's Md_fdvec entry; each
 * descriptor is synced, closed and marked clean.
 * NOTE(review): the test that skips already closed descriptors and the
 * return statement are not part of this listing.
 */
273 mdclose(Relation reln)
278 fd = RelationGetFile(reln);
280 for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
282 /* may be closed already */
287 * We sync the file descriptor so that we don't need to reopen it at
288 * transaction commit to force changes to disk.
291 FileSync(v->mdfd_vfd);
292 FileClose(v->mdfd_vfd);
294 /* mark this file descriptor as clean in our private table */
295 v->mdfd_flags &= ~MDFD_DIRTY;
302 * mdread() -- Read the specified block from a relation.
304 * Returns SM_SUCCESS or SM_FAIL.
/*
 * mdread -- read block blocknum of the relation into buffer.
 *
 * The segment holding the block is located with _mdfd_getseg (no create
 * flag); the byte offset inside that segment is BLCKSZ times the block
 * index within the segment.  The buffer is zero-filled on one of the
 * short-read paths.
 * NOTE(review): if BLCKSZ is 8192, as the RELSEG_SIZE comment suggests,
 * BLCKSZ * RELSEG_SIZE equals 2^31 and would not fit in a 32-bit int --
 * confirm the type of BLCKSZ.
 * NOTE(review): the seek-failure branch, the exact short-read condition
 * for the zero fill and the returns are not part of this listing.
 */
307 mdread(Relation reln, BlockNumber blocknum, char *buffer)
314 v = _mdfd_getseg(reln, blocknum, 0);
316 seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
319 if (seekpos >= BLCKSZ * RELSEG_SIZE)
320 elog(FATAL, "seekpos too big!");
323 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
328 if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) {
330 memset(buffer, 0, BLCKSZ);
340 * mdwrite() -- Write the supplied block at the appropriate location.
342 * Returns SM_SUCCESS or SM_FAIL.
/*
 * mdwrite -- write buffer over block blocknum of the relation.
 *
 * Locates the segment with _mdfd_getseg (no create flag), seeks to the
 * block offset inside it, writes BLCKSZ bytes and marks the entry dirty.
 * No sync is issued here.
 * NOTE(review): if BLCKSZ is 8192, as the RELSEG_SIZE comment suggests,
 * BLCKSZ * RELSEG_SIZE equals 2^31 and would not fit in a 32-bit int --
 * confirm the type of BLCKSZ.
 * NOTE(review): the failure branches and returns are not part of this
 * listing.
 */
345 mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
351 v = _mdfd_getseg(reln, blocknum, 0);
353 seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
355 if (seekpos >= BLCKSZ * RELSEG_SIZE)
356 elog(FATAL, "seekpos too big!");
359 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
364 if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
367 v->mdfd_flags |= MDFD_DIRTY;
373 * mdflush() -- Synchronously write a block to disk.
375 * This is exactly like mdwrite(), but doesn't return until the file
376 * system buffer cache has been flushed.
/*
 * mdflush -- write buffer over block blocknum and sync the file.
 *
 * Same seek and write sequence as mdwrite, followed by FileSync on the
 * segment descriptor; afterwards the dirty flag is cleared.
 * NOTE(review): if BLCKSZ is 8192, as the RELSEG_SIZE comment suggests,
 * BLCKSZ * RELSEG_SIZE equals 2^31 and would not fit in a 32-bit int --
 * confirm the type of BLCKSZ.
 * NOTE(review): the failure branches and returns are not part of this
 * listing.
 */
379 mdflush(Relation reln, BlockNumber blocknum, char *buffer)
385 v = _mdfd_getseg(reln, blocknum, 0);
387 seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
389 if (seekpos >= BLCKSZ * RELSEG_SIZE)
390 elog(FATAL, "seekpos too big!");
393 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
397 /* write and sync the block */
399 if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
400 || FileSync(v->mdfd_vfd) < 0)
404 * By here, the block is written and changes have been forced to stable
405 * storage. Mark the descriptor as clean until the next write, so we
406 * don't sync it again unnecessarily at transaction commit.
409 v->mdfd_flags &= ~MDFD_DIRTY;
415 * mdblindwrt() -- Write a block to disk blind.
417 * We have to be able to do this using only the name and OID of
418 * the database and relation in which the block belongs. This
419 * is a synchronous write.
/*
 * mdblindwrt -- write one block using only database and relation names.
 *
 * The path is built by hand: directly under DataDir when dbid is 0,
 * otherwise under DataDir/base/<dbstr>/.  Two sprintf forms exist per
 * case, one without and one with a numeric segment suffix.  The file is
 * accessed with plain open/lseek/write and synced with fsync (pg_fsync
 * when OPENLINK_PATCHES is defined), not through the File routines used
 * elsewhere in this file.
 * NOTE(review): the rest of the parameter list, the computation of
 * nchars, the condition choosing between the suffixed and unsuffixed
 * path, and the close and error handling are not part of this listing.
 */
422 mdblindwrt(char *dbstr,
436 /* be sure we have enough space for the '.segno', if any */
437 segno = blkno / RELSEG_SIZE;
443 /* construct the path to the file and open it */
444 if (dbid == (Oid) 0) {
445 path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
447 sprintf(path, "%s/%.*s", DataDir, NAMEDATALEN, relstr);
449 sprintf(path, "%s/%.*s.%d", DataDir, NAMEDATALEN, relstr, segno);
451 path = (char *) palloc(strlen(DataDir) + strlen("/base/") + 2 * sizeof(NameData) + 2 + nchars);
453 sprintf(path, "%s/base/%.*s/%.*s", DataDir, NAMEDATALEN,
454 dbstr, NAMEDATALEN, relstr);
456 sprintf(path, "%s/base/%.*s/%.*s.%d", DataDir, NAMEDATALEN, dbstr,
457 NAMEDATALEN, relstr, segno);
460 if ((fd = open(path, O_RDWR, 0600)) < 0)
463 /* seek to the right spot */
464 seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
465 if (lseek(fd, seekpos, SEEK_SET) != seekpos) {
472 /* write and sync the block */
473 #ifdef OPENLINK_PATCHES
474 if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
476 if (write(fd, buffer, BLCKSZ) != BLCKSZ || fsync(fd) < 0)
489 * mdnblocks() -- Get the number of blocks stored in a relation.
491 * Returns # of blocks or -1 on error.
/*
 * mdnblocks -- count the blocks of a relation across all its segments.
 *
 * A segment that holds exactly RELSEG_SIZE blocks is recorded as full and
 * the next chain entry is examined, being opened with O_CREAT when it is
 * not chained yet.  The result is segno * RELSEG_SIZE plus the block
 * count of the segment reached last.
 * NOTE(review): the FATAL text below says "in getseg!" although it is
 * raised from mdnblocks -- confirm whether the message should be updated.
 * NOTE(review): the loop construct, the initialisation of v and segno and
 * their advancement are not part of this listing.
 */
494 mdnblocks(Relation reln)
501 fd = RelationGetFile(reln);
505 if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
506 elog(FATAL, "segment too big in getseg!");
/* the cached hint is tested first; the file size is measured otherwise */
511 if (v->mdfd_lstbcnt == RELSEG_SIZE
512 || (nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ)) == RELSEG_SIZE) {
514 v->mdfd_lstbcnt = RELSEG_SIZE;
517 if (v->mdfd_chain == (MdfdVec *) NULL) {
518 v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
519 if (v->mdfd_chain == (MdfdVec *) NULL)
520 elog(WARN, "cannot count blocks for %.16s -- open failed",
521 RelationGetRelationName(reln));
526 return ((segno * RELSEG_SIZE) + nblocks);
532 * mdcommit() -- Commit a transaction.
534 * All changes to magnetic disk relations must be forced to stable
535 * storage. This routine makes a pass over the private table of
536 * file descriptors. Any descriptors to which we have done writes,
537 * but not synced, are synced here.
539 * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
/*
 * mdcommit body: visit table slots 0 .. CurFd-1 and every entry chained
 * to them; each entry flagged dirty is synced and then marked clean.
 * NOTE(review): the function header, the FileSync failure branch and the
 * return are not part of this listing.
 */
547 for (i = 0; i < CurFd; i++) {
548 for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
549 if (v->mdfd_flags & MDFD_DIRTY) {
550 if (FileSync(v->mdfd_vfd) < 0)
553 v->mdfd_flags &= ~MDFD_DIRTY;
562 * mdabort() -- Abort a transaction.
564 * Changes need not be forced to disk at transaction abort. We mark
565 * all file descriptors as clean here. Always returns SM_SUCCESS.
/*
 * mdabort body: clear the dirty flag of table slots 0 .. CurFd-1 and of
 * every entry chained to them, without syncing anything.
 * NOTE(review): the function header and return are not part of this
 * listing.
 */
573 for (i = 0; i < CurFd; i++) {
574 for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
575 v->mdfd_flags &= ~MDFD_DIRTY;
583 * _fdvec_ext() -- Extend the md file descriptor vector.
585 * The file descriptor vector must be large enough to hold at least
/*
 * _fdvec_ext body: allocate a zeroed table of Nfds entries in MdCxt and
 * copy the first Nfds / 2 entries of the current table into it.
 * NOTE(review): copying Nfds / 2 entries suggests Nfds was doubled on a
 * line that is not part of this listing -- confirm.  The test that
 * decides whether growth is needed, the release of the old table, the
 * assignment to Md_fdvec and the return are not shown either.
 */
592 MemoryContext oldcxt;
596 oldcxt = MemoryContextSwitchTo(MdCxt);
598 nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
599 memset(nvec, 0, Nfds * sizeof(MdfdVec));
600 memmove(nvec, (char *) Md_fdvec, (Nfds / 2) * sizeof(MdfdVec));
603 (void) MemoryContextSwitchTo(oldcxt);
/*
 * _mdfd_openseg -- open one segment file of a relation and return a newly
 * allocated table entry for it, or NULL on the failure path shown below.
 *
 * The entry is allocated in MdCxt, starts clean and unchained, and takes
 * its block count hint from the size of the opened file.
 * NOTE(review): the condition that selects the suffixed path, the test
 * that leads to the NULL return, the assignment of mdfd_vfd and the final
 * return are not part of this listing.
 */
611 _mdfd_openseg(Relation reln, int segno, int oflags)
613 MemoryContext oldcxt;
617 char *path, *fullpath;
619 /* be sure we have enough space for the '.segno', if any */
620 path = relpath(RelationGetRelationName(reln)->data);
/* 12 extra bytes: one dot, up to ten decimal digits and the NUL */
625 fullpath = (char *) palloc(strlen(path) + 12);
626 sprintf(fullpath, "%s.%d", path, segno);
631 fd = PathNameOpenFile(fullpath, O_RDWR|oflags, 0600);
637 return ((MdfdVec *) NULL);
639 /* allocate an mdfdvec entry for it */
640 oldcxt = MemoryContextSwitchTo(MdCxt);
641 v = (MdfdVec *) palloc(sizeof(MdfdVec));
642 (void) MemoryContextSwitchTo(oldcxt);
646 v->mdfd_flags = (uint16) 0;
647 v->mdfd_chain = (MdfdVec *) NULL;
648 v->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
/* sanity check: a single segment file may hold at most RELSEG_SIZE blocks */
651 if (v->mdfd_lstbcnt > RELSEG_SIZE)
652 elog(FATAL, "segment too big on open!");
/*
 * _mdfd_getseg -- find the table entry of the segment that holds block
 * blkno, opening segments along the chain as needed.
 *
 * The target segment number is blkno / RELSEG_SIZE.  Starting from the
 * relation's Md_fdvec entry, a missing chain link is filled in by
 * _mdfd_openseg with the caller's oflag; an open failure raises
 * elog(WARN).
 * NOTE(review): the test that decides when mdopen is called, the loop
 * condition and step, and the return are not part of this listing.
 */
660 _mdfd_getseg(Relation reln, int blkno, int oflag)
667 fd = RelationGetFile(reln);
669 if ((fd = mdopen(reln)) < 0)
670 elog(WARN, "cannot open relation %.16s",
671 RelationGetRelationName(reln));
675 for (v = &Md_fdvec[fd], segno = blkno / RELSEG_SIZE, i = 1;
679 if (v->mdfd_chain == (MdfdVec *) NULL) {
680 v->mdfd_chain = _mdfd_openseg(reln, i, oflag);
682 if (v->mdfd_chain == (MdfdVec *) NULL)
683 elog(WARN, "cannot open segment %d of relation %.16s",
684 i, RelationGetRelationName(reln));
/*
 * _mdnblocks -- number of blcksz-sized blocks in an open file.
 *
 * len is the file size minus one, so for a non-empty file the result is
 * the size divided by blcksz rounded up; a trailing partial block counts
 * as a block.  An empty file yields 0, and so does a negative FileSeek
 * result.
 * NOTE(review): the declaration of len is not part of this listing.
 */
693 _mdnblocks(File file, Size blcksz)
697 len = FileSeek(file, 0L, SEEK_END) - 1;
698 return((BlockNumber)((len < 0) ? 0 : 1 + len / blcksz));