1 /*-------------------------------------------------------------------------
4 * This code manages relations that reside on magnetic disk.
6 * Copyright (c) 1994, Regents of the University of California
10 * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.34 1998/07/24 03:31:35 scrappy Exp $
12 *-------------------------------------------------------------------------
15 #include <stdio.h> /* for sprintf() */
17 #include <fcntl.h> /* for open() flags */
21 #include "miscadmin.h" /* for DataDir */
23 #include "catalog/catalog.h"
24 #include "storage/block.h"
25 #include "storage/fd.h"
26 #include "storage/smgr.h" /* where the declarations go */
27 #include "utils/mcxt.h"
28 #include "utils/rel.h"
33 * The magnetic disk storage manager keeps track of open file descriptors
34 * in its own descriptor pool. This happens for two reasons. First, at
35 * transaction boundaries, we walk the list of descriptors and flush
36 * anything that we've dirtied in the current transaction. Second, we
37 * have to support relations of > 4GBytes. In order to do this, we break
38 * relations up into chunks of < 2GBytes and store one chunk in each of
39 * several files that represent the relation.
/*
 * One entry of the md descriptor table.  Table entries live in the
 * Md_fdvec array; entries for further segment files of a relation are
 * linked through mdfd_chain when LET_OS_MANAGE_FILESIZE is not defined.
 */
42 typedef struct _MdfdVec
44 int mdfd_vfd; /* fd number in vfd pool */
45 uint16 mdfd_flags; /* MDFD_DIRTY / MDFD_FREE bits; 0 = clean and in use */
46 int mdfd_lstbcnt; /* most recent block count (only a hint) */
47 int mdfd_nextFree; /* index of next free table entry, -1 ends the list */
48 #ifndef LET_OS_MANAGE_FILESIZE
49 struct _MdfdVec *mdfd_chain;/* entry for the next segment file, or NULL */
/* number of entries allocated for the Md_fdvec table */
53 static int Nfds = 100;
/* the descriptor table itself; allocated in mdinit() */
54 static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
/* index of the first entry on the free list, negative when the list is empty */
55 static int Md_Free = -1;
/* memory context in which the table and segment entries are palloc'd */
57 static MemoryContext MdCxt;
/* mdfd_flags bit: the file has been written since it was last synced */
59 #define MDFD_DIRTY (uint16) 0x01
/* mdfd_flags value: the table entry is unused and sits on the free list */
60 #define MDFD_FREE (uint16) 0x02
63 * RELSEG_SIZE is the number of blocks that can be stored in one
64 * segment file. It was defined as 262144 based on 8k
65 * blocks, but now that the block size can be changed, this
66 * has to be calculated at compile time. Otherwise, the file
67 * size limit would not work out to 2-gig (2147483648).
69 * The number needs to be (2 ** 31) / BLCKSZ, but to keep
70 * the math under MAXINT, pre-divide by 256 and use ...
72 * (((2 ** 23) / BLCKSZ) * (2 ** 8))
76 * Now possibly let the OS handle it...
82 #ifndef LET_OS_MANAGE_FILESIZE
/*
 * 8388608 is 2 ** 23, so for a power-of-two BLCKSZ this equals
 * (2 ** 31) / BLCKSZ without ever forming 2 ** 31 in int arithmetic.
 */
83 #define RELSEG_SIZE ((8388608 / BLCKSZ) * 256)
86 /* routines declared here */
87 static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
88 static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
89 static int _fdvec_alloc(void);
90 static void _fdvec_free(int);
91 static BlockNumber _mdnblocks(File file, Size blcksz);
94 * mdinit() -- Initialize private state for magnetic disk storage manager.
96 * We keep a private table of all file descriptors. Whenever we do
97 * a write to one, we mark it dirty in our table. Whenever we force
98 * changes to disk, we mark the file descriptor clean. At transaction
99 * commit, we force changes to disk for all dirty file descriptors.
100 * This routine allocates and initializes the table.
102 * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
107 MemoryContext oldcxt;
/* create the private "MdSmgr" memory context used for md bookkeeping */
110 MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
111 if (MdCxt == (MemoryContext) NULL)
/* allocate the Nfds-entry table inside MdCxt, then restore the old context */
114 oldcxt = MemoryContextSwitchTo(MdCxt);
115 Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
116 MemoryContextSwitchTo(oldcxt);
118 if (Md_fdvec == (MdfdVec *) NULL)
121 MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
/* thread every entry onto the free list: entry i links to entry i + 1 */
124 for (i = 0; i < Nfds; i++)
126 Md_fdvec[i].mdfd_nextFree = i + 1;
127 Md_fdvec[i].mdfd_flags = MDFD_FREE;
/* the last entry terminates the free list */
130 Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
/*
 * mdcreate() -- create the disk file for a relation and record it in
 * the md descriptor table.
 */
136 mdcreate(Relation reln)
142 path = relpath(reln->rd_rel->relname.data);
/* O_CREAT | O_EXCL: this open fails if the file already exists */
143 fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);
146 * If the file already exists and is empty, we pretend that the create
147 * succeeded. During bootstrap processing, we skip that check,
148 * because pg_time, pg_variable, and pg_log get created before their
149 * .bki file entries are processed.
151 * As the result of this pretence it was possible to have in pg_class > 1
152 * records with the same relname. Actually, it should be fixed in
153 * upper levels, too, but... - vadim 05/06/97
158 if (!IsBootstrapProcessingMode())
160 fd = FileNameOpenFile(path, O_RDWR, 0600); /* Bootstrap */
/* take a table entry and initialise it: clean, no chained segment, 0 blocks */
165 vfd = _fdvec_alloc();
169 Md_fdvec[vfd].mdfd_vfd = fd;
170 Md_fdvec[vfd].mdfd_flags = (uint16) 0;
171 #ifndef LET_OS_MANAGE_FILESIZE
172 Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
174 Md_fdvec[vfd].mdfd_lstbcnt = 0;
180 * mdunlink() -- Unlink a relation.
/*
 * mdunlink() -- remove a relation's files by name, then unlink the
 * virtual file descriptors recorded for it in the md table.
 */
183 mdunlink(Relation reln)
189 MemoryContext oldcxt;
190 char fname[NAMEDATALEN];
191 char tname[NAMEDATALEN + 10]; /* leave room for overflow
195 * On Windows NT you can't unlink a file if it is open so we have to
199 StrNCpy(fname, RelationGetRelationName(reln)->data, NAMEDATALEN);
201 if (FileNameUnlink(fname) < 0)
204 /* unlink all the overflow files for large relations */
/* overflow (segment) files are named "<relname>.<number>" */
207 sprintf(tname, "%s.%d", fname, i);
208 if (FileNameUnlink(tname) < 0)
212 /* finally, clean out the mdfd vector */
213 fd = RelationGetFile(reln);
214 Md_fdvec[fd].mdfd_flags = (uint16) 0;
216 oldcxt = MemoryContextSwitchTo(MdCxt);
217 #ifndef LET_OS_MANAGE_FILESIZE
/* starting at the relation's table entry, FileUnlink each entry's vfd */
218 for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
220 FileUnlink(v->mdfd_vfd);
/* entries other than the table slot itself get separate treatment */
223 if (ov != &Md_fdvec[fd])
226 Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
229 if (v != (MdfdVec *) NULL)
230 FileUnlink(v->mdfd_vfd);
232 MemoryContextSwitchTo(oldcxt);
240 * mdextend() -- Add a block to the specified relation.
242 * This routine returns SM_FAIL or SM_SUCCESS, with errno set as
/*
 * mdextend() -- append the BLCKSZ-byte block in buffer to the end of
 * the relation and mark the written segment dirty.
 */
246 mdextend(Relation reln, char *buffer)
/*
 * The current block count is also the number of the block being added.
 * O_CREAT is handed down so a missing segment file can be created.
 */
252 nblocks = mdnblocks(reln);
253 v = _mdfd_getseg(reln, nblocks, O_CREAT);
/* position at the end of the segment file so the write appends */
255 if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0)
258 if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
261 /* remember that we did a write, so we can sync at xact commit */
262 v->mdfd_flags |= MDFD_DIRTY;
264 /* try to keep the last block count current, though it's just a hint */
265 #ifndef LET_OS_MANAGE_FILESIZE
/* the modulo is 0 exactly when the segment has just become full */
266 if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
267 v->mdfd_lstbcnt = RELSEG_SIZE;
/* sanity check: neither the file nor the cached count may exceed a segment */
270 if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
271 || v->mdfd_lstbcnt > RELSEG_SIZE)
272 elog(FATAL, "segment too big!");
275 v->mdfd_lstbcnt = ++nblocks;
282 * mdopen() -- Open the specified relation.
/*
 * mdopen() -- open the relation's file and record it in the md table.
 *
 * _mdfd_getseg() treats a negative return value as failure to open.
 */
285 mdopen(Relation reln)
291 path = relpath(reln->rd_rel->relname.data);
293 fd = FileNameOpenFile(path, O_RDWR, 0600);
295 /* this should only happen during bootstrap processing */
297 fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);
/* fill a table entry; the block count is taken from the file's current size */
299 vfd = _fdvec_alloc();
303 Md_fdvec[vfd].mdfd_vfd = fd;
304 Md_fdvec[vfd].mdfd_flags = (uint16) 0;
305 Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
306 #ifndef LET_OS_MANAGE_FILESIZE
307 Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
/* a single segment file must never hold more than RELSEG_SIZE blocks */
310 if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
311 elog(FATAL, "segment too big on relopen!");
319 * mdclose() -- Close the specified relation
321 * AND FREE fd vector! It may be re-used for other relation!
322 * reln should be flushed from cache after closing !..
324 * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
/*
 * mdclose() -- sync and close every still-open file descriptor recorded
 * for the relation, clearing MDFD_DIRTY on each one.
 */
327 mdclose(Relation reln)
332 MemoryContext oldcxt;
334 fd = RelationGetFile(reln);
336 oldcxt = MemoryContextSwitchTo(MdCxt);
337 #ifndef LET_OS_MANAGE_FILESIZE
/* start at the relation's table entry and continue while v is non-NULL */
338 for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
340 /* if not closed already */
341 if (v->mdfd_vfd >= 0)
345 * We sync the file descriptor so that we don't need to reopen
346 * it at transaction commit to force changes to disk.
349 FileSync(v->mdfd_vfd);
350 FileClose(v->mdfd_vfd);
352 /* mark this file descriptor as clean in our private table */
353 v->mdfd_flags &= ~MDFD_DIRTY;
355 /* Now free vector */
/* entries other than the table slot itself get separate treatment */
358 if (ov != &Md_fdvec[fd])
362 Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
/* same sync / close / mark-clean sequence applied to a single entry */
365 if (v != (MdfdVec *) NULL)
367 if (v->mdfd_vfd >= 0)
371 * We sync the file descriptor so that we don't need to reopen
372 * it at transaction commit to force changes to disk.
375 FileSync(v->mdfd_vfd);
376 FileClose(v->mdfd_vfd);
378 /* mark this file descriptor as clean in our private table */
379 v->mdfd_flags &= ~MDFD_DIRTY;
383 MemoryContextSwitchTo(oldcxt);
391 * mdread() -- Read the specified block from a relation.
393 * Returns SM_SUCCESS or SM_FAIL.
/*
 * mdread() -- read block blocknum of the relation into buffer.
 *
 * Finds the block's segment entry, seeks to the block's byte offset and
 * reads BLCKSZ bytes from it.
 */
396 mdread(Relation reln, BlockNumber blocknum, char *buffer)
/* oflag 0: no additional open flags for segment files */
403 v = _mdfd_getseg(reln, blocknum, 0);
405 #ifndef LET_OS_MANAGE_FILESIZE
/* byte offset of the block inside its own segment file */
406 seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
409 if (seekpos >= BLCKSZ * RELSEG_SIZE)
410 elog(FATAL, "seekpos too big!");
/* offset without the per-segment modulo (other branch of the #ifndef -- TODO confirm) */
413 seekpos = (long) (BLCKSZ * (blocknum));
/* FileSeek must report exactly the requested position */
416 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
420 if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
/* NOTE(review): the exact condition guarding this zero-fill is not visible here */
423 MemSet(buffer, 0, BLCKSZ);
432 * mdwrite() -- Write the supplied block at the appropriate location.
434 * Returns SM_SUCCESS or SM_FAIL.
/*
 * mdwrite() -- write the BLCKSZ-byte block in buffer at block blocknum
 * of the relation and mark the segment dirty; no sync is done here.
 */
437 mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
/* oflag 0: no additional open flags for segment files */
443 v = _mdfd_getseg(reln, blocknum, 0);
445 #ifndef LET_OS_MANAGE_FILESIZE
/* byte offset of the block inside its own segment file */
446 seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
448 if (seekpos >= BLCKSZ * RELSEG_SIZE)
449 elog(FATAL, "seekpos too big!");
/* offset without the per-segment modulo (other branch of the #ifndef -- TODO confirm) */
452 seekpos = (long) (BLCKSZ * (blocknum));
455 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
459 if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
/* flag the segment so that mdcommit() will sync it */
462 v->mdfd_flags |= MDFD_DIRTY;
468 * mdflush() -- Synchronously write a block to disk.
470 * This is exactly like mdwrite(), but doesn't return until the file
471 * system buffer cache has been flushed.
/*
 * mdflush() -- write the block like mdwrite(), then FileSync() the
 * segment and clear its dirty flag.
 */
474 mdflush(Relation reln, BlockNumber blocknum, char *buffer)
/* oflag 0: no additional open flags for segment files */
480 v = _mdfd_getseg(reln, blocknum, 0);
482 #ifndef LET_OS_MANAGE_FILESIZE
/* byte offset of the block inside its own segment file */
483 seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
485 if (seekpos >= BLCKSZ * RELSEG_SIZE)
486 elog(FATAL, "seekpos too big!");
/* offset without the per-segment modulo (other branch of the #ifndef -- TODO confirm) */
489 seekpos = (long) (BLCKSZ * (blocknum));
492 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
495 /* write and sync the block */
/* the sync is attempted only if the full block was written */
497 if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
498 || FileSync(v->mdfd_vfd) < 0)
502 * By here, the block is written and changes have been forced to
503 * stable storage. Mark the descriptor as clean until the next write,
504 * so we don't sync it again unnecessarily at transaction commit.
507 v->mdfd_flags &= ~MDFD_DIRTY;
513 * mdblindwrt() -- Write a block to disk blind.
515 * We have to be able to do this using only the name and OID of
516 * the database and relation in which the block belongs. This
517 * is a synchronous write.
/*
 * mdblindwrt() -- write one block given only database / relation names
 * and OIDs.
 *
 * Builds the file's path by hand, then uses plain open(), lseek(),
 * write() and pg_fsync() instead of the md descriptor table.
 */
520 mdblindwrt(char *dbstr,
533 #ifndef LET_OS_MANAGE_FILESIZE
536 /* be sure we have enough space for the '.segno', if any */
/* number of the segment file that holds the block */
537 segno = blkno / RELSEG_SIZE;
543 /* construct the path to the file and open it */
544 /* system table? then put in system area... */
/* nchars: presumably the room reserved for the '.segno' suffix -- TODO confirm */
547 path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
549 sprintf(path, "%s/%s", DataDir, relstr);
551 sprintf(path, "%s/%s.%d", DataDir, relstr, segno);
553 /* user table? then put in user database area... */
554 else if (dbid == MyDatabaseId)
556 extern char *DatabasePath;
558 path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2 + nchars);
560 sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
562 sprintf(path, "%s%c%s.%d", DatabasePath, SEP_CHAR, relstr, segno);
565 /* this is a workaround only !!! */
567 char dbpath[MAXPGPATH + 1];
/* look the database up by name to obtain its oid and path */
576 GetRawDatabaseInfo(dbstr, &owner, &id, dbpath, &tmpEncoding);
578 GetRawDatabaseInfo(dbstr, &owner, &id, dbpath);
582 elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
583 tmpPath = ExpandDatabasePath(dbpath);
585 elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
586 path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2 + nchars);
588 sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
590 sprintf(path, "%s%c%s.%d", tmpPath, SEP_CHAR, relstr, segno);
/* below: the same three cases again, without any '.segno' suffix */
594 /* construct the path to the file and open it */
595 /* system table? then put in system area... */
598 path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2);
599 sprintf(path, "%s/%s", DataDir, relstr);
601 /* user table? then put in user database area... */
602 else if (dbid == MyDatabaseId)
604 extern char *DatabasePath;
606 path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2);
607 sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
610 /* this is a workaround only !!! */
612 char dbpath[MAXPGPATH + 1];
622 GetRawDatabaseInfo(dbstr, &owner, &id, dbpath, &tmpEncoding);
624 GetRawDatabaseInfo(dbstr, &owner, &id, dbpath);
628 elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
629 tmpPath = ExpandDatabasePath(dbpath);
631 elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
632 path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2);
633 sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
/* the file is opened without O_CREAT, so it must already exist */
638 if ((fd = open(path, O_RDWR, 0600)) < 0)
641 /* seek to the right spot */
642 #ifndef LET_OS_MANAGE_FILESIZE
643 seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
645 seekpos = (long) (BLCKSZ * (blkno));
648 if (lseek(fd, seekpos, SEEK_SET) != seekpos)
656 /* write and sync the block */
657 if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
669 * mdnblocks() -- Get the number of blocks stored in a relation.
671 * Returns # of blocks or -1 on error.
/*
 * mdnblocks() -- count the blocks of a relation.
 *
 * With segmented files, full segments are skipped and the partial
 * count of the final segment is added; otherwise the size of the one
 * file is measured.
 */
674 mdnblocks(Relation reln)
681 fd = RelationGetFile(reln);
684 #ifndef LET_OS_MANAGE_FILESIZE
686 if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
687 elog(FATAL, "segment too big in getseg!");
/* full segment: either the cached count or the measured size says so */
693 if (v->mdfd_lstbcnt == RELSEG_SIZE
694 || (nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ)) == RELSEG_SIZE)
697 v->mdfd_lstbcnt = RELSEG_SIZE;
/* O_CREAT: the following segment file is created if it does not exist yet */
700 if (v->mdfd_chain == (MdfdVec *) NULL)
702 v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
703 if (v->mdfd_chain == (MdfdVec *) NULL)
704 elog(ERROR, "cannot count blocks for %s -- open failed",
705 RelationGetRelationName(reln));
/* blocks in the segno full segments plus those in the current one */
711 return ((segno * RELSEG_SIZE) + nblocks);
714 return (_mdnblocks(v->mdfd_vfd, BLCKSZ));
719 * mdtruncate() -- Truncate relation to specified number of blocks.
721 * Returns # of blocks or -1 on error.
/*
 * mdtruncate() -- truncate the relation's file to nblocks blocks.
 */
724 mdtruncate(Relation reln, int nblocks)
729 #ifndef LET_OS_MANAGE_FILESIZE
/* a relation holding at least RELSEG_SIZE blocks spans several segment files */
732 curnblk = mdnblocks(reln);
733 if (curnblk / RELSEG_SIZE > 0)
735 elog(NOTICE, "Can't truncate multi-segments relation %s",
736 reln->rd_rel->relname.data);
741 fd = RelationGetFile(reln);
/* the new length is given to FileTruncate in bytes */
744 if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
752 * mdcommit() -- Commit a transaction.
754 * All changes to magnetic disk relations must be forced to stable
755 * storage. This routine makes a pass over the private table of
756 * file descriptors. Any descriptors to which we have done writes,
757 * but not synced, are synced here.
759 * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
/* visit the table entries below CurFd */
767 for (i = 0; i < CurFd; i++)
769 #ifndef LET_OS_MANAGE_FILESIZE
/* segmented case: follow mdfd_chain through every segment entry */
770 for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
773 if (v != (MdfdVec *) NULL)
/* only descriptors written since their last sync need a FileSync */
776 if (v->mdfd_flags & MDFD_DIRTY)
778 if (FileSync(v->mdfd_vfd) < 0)
781 v->mdfd_flags &= ~MDFD_DIRTY;
790 * mdabort() -- Abort a transaction.
792 * Changes need not be forced to disk at transaction abort. We mark
793 * all file descriptors as clean here. Always returns SM_SUCCESS.
/* visit the table entries below CurFd */
801 for (i = 0; i < CurFd; i++)
803 #ifndef LET_OS_MANAGE_FILESIZE
/* segmented case: follow mdfd_chain through every segment entry */
804 for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
807 if (v != (MdfdVec *) NULL)
/* clear the dirty flag without syncing anything */
809 v->mdfd_flags &= ~MDFD_DIRTY;
816 * _fdvec_alloc () -- grab a free (or new) md file descriptor vector.
826 MemoryContext oldcxt;
828 if (Md_Free >= 0) /* get from free list */
/* unlink the head entry from the free list and mark it in use */
831 Md_Free = Md_fdvec[fdvec].mdfd_nextFree;
832 Assert(Md_fdvec[fdvec].mdfd_flags == MDFD_FREE);
833 Md_fdvec[fdvec].mdfd_flags = 0;
836 Assert(fdvec == CurFd);
842 /* Must allocate more room */
845 elog(FATAL, "_fdvec_alloc error");
/*
 * Build a replacement table in MdCxt and copy the CurFd existing
 * entries into it.  NOTE(review): Nfds is presumably enlarged on a
 * line that is not visible here -- confirm.
 */
849 oldcxt = MemoryContextSwitchTo(MdCxt);
851 nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
852 MemSet(nvec, 0, Nfds * sizeof(MdfdVec));
853 memmove(nvec, (char *) Md_fdvec, CurFd * sizeof(MdfdVec));
856 MemoryContextSwitchTo(oldcxt);
860 /* Set new free list */
/* entries from CurFd up to Nfds - 1 are chained together as free */
861 for (i = CurFd; i < Nfds; i++)
863 Md_fdvec[i].mdfd_nextFree = i + 1;
864 Md_fdvec[i].mdfd_flags = MDFD_FREE;
866 Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
/* the entry being handed out is marked in use */
871 Md_fdvec[fdvec].mdfd_flags = 0;
877 * _fdvec_free () -- free md file descriptor vector.
/*
 * Return table entry fdvec to the free list: it is linked in front of
 * the current free-list head and flagged MDFD_FREE.
 */
882 _fdvec_free(int fdvec)
/* the current head, if there is one, must itself be marked free */
885 Assert(Md_Free < 0 || Md_fdvec[Md_Free].mdfd_flags == MDFD_FREE);
886 Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
887 Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
/*
 * _mdfd_openseg() -- open one segment file of a relation and build an
 * MdfdVec entry for it.
 *
 * oflags is OR'ed into the open flags (callers pass 0 or O_CREAT).
 * Callers treat a NULL result as failure to open the segment.
 */
893 _mdfd_openseg(Relation reln, int segno, int oflags)
895 MemoryContext oldcxt;
902 /* be sure we have enough space for the '.segno', if any */
903 path = relpath(RelationGetRelationName(reln)->data);
/* segment file name: the relation's path followed by ".<segno>" */
909 fullpath = (char *) palloc(strlen(path) + 12);
910 sprintf(fullpath, "%s.%d", path, segno);
916 fd = PathNameOpenFile(fullpath, O_RDWR | oflags, 0600);
922 return ((MdfdVec *) NULL);
924 /* allocate an mdfdvec entry for it */
/* the entry is palloc'd in MdCxt, not in the caller's memory context */
925 oldcxt = MemoryContextSwitchTo(MdCxt);
926 v = (MdfdVec *) palloc(sizeof(MdfdVec));
927 MemoryContextSwitchTo(oldcxt);
/* new entry starts clean; its block count comes from the file's size */
931 v->mdfd_flags = (uint16) 0;
932 v->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
933 #ifndef LET_OS_MANAGE_FILESIZE
934 v->mdfd_chain = (MdfdVec *) NULL;
937 if (v->mdfd_lstbcnt > RELSEG_SIZE)
938 elog(FATAL, "segment too big on open!");
/*
 * _mdfd_getseg() -- locate the MdfdVec entry for the segment file that
 * contains block blkno, opening the relation and chained segment files
 * where needed.  oflag is passed through to _mdfd_openseg().
 */
947 _mdfd_getseg(Relation reln, int blkno, int oflag)
954 fd = RelationGetFile(reln);
/* a failed mdopen() is reported as an ERROR */
957 if ((fd = mdopen(reln)) < 0)
958 elog(ERROR, "cannot open relation %s",
959 RelationGetRelationName(reln));
963 #ifndef LET_OS_MANAGE_FILESIZE
/* segno is the target segment; i counts chained segments starting at 1 */
964 for (v = &Md_fdvec[fd], segno = blkno / RELSEG_SIZE, i = 1;
/* open the next segment on demand when it is not chained yet */
969 if (v->mdfd_chain == (MdfdVec *) NULL)
971 v->mdfd_chain = _mdfd_openseg(reln, i, oflag);
973 if (v->mdfd_chain == (MdfdVec *) NULL)
974 elog(ERROR, "cannot open segment %d of relation %s",
975 i, RelationGetRelationName(reln));
/*
 * _mdnblocks() -- number of blcksz-byte blocks in one file, judged by
 * the position FileSeek() reports for end-of-file.
 */
987 _mdnblocks(File file, Size blcksz)
/* len is the end position minus one, so it is negative for an empty file */
991 len = FileSeek(file, 0L, SEEK_END) - 1;
/* negative len counts as 0 blocks; otherwise a partial last block counts as one */
992 return ((BlockNumber) ((len < 0) ? 0 : 1 + len / blcksz));