From 2692d329eba45b30a038309f3e0ddb8d03f6e830 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 10 Apr 2000 23:41:52 +0000 Subject: [PATCH] Tweak smgrblindwrt per advice from Vadim: add parameter indicating whether to do fsync or not, and if so (which should be seldom) just do the fsync immediately. This way we need not build data structures in md.c/fd.c for blind writes. --- src/backend/storage/buffer/bufmgr.c | 14 ++- src/backend/storage/smgr/md.c | 187 ++++++++++------------------ src/backend/storage/smgr/mm.c | 5 +- src/backend/storage/smgr/smgr.c | 13 +- src/include/storage/smgr.h | 11 +- 5 files changed, 95 insertions(+), 135 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 0887f3d1ec..b5eb53b03a 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.78 2000/04/09 04:43:18 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.79 2000/04/10 23:41:49 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1127,7 +1127,8 @@ BufferSync() bufHdr->blind.relname, bufdb, bufrel, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); + (char *) MAKE_PTR(bufHdr->data), + true); /* must fsync */ } else { @@ -1529,7 +1530,8 @@ BufferReplace(BufferDesc *bufHdr) status = smgrblindwrt(DEFAULT_SMGR, bufHdr->blind.dbname, bufHdr->blind.relname, bufdb, bufrel, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); + (char *) MAKE_PTR(bufHdr->data), + false); /* no fsync */ } #ifndef OPTIMIZE_SINGLE @@ -1544,9 +1546,11 @@ BufferReplace(BufferDesc *bufHdr) return FALSE; /* If we had marked this buffer as needing to be fsync'd, we can forget - * about that, because it's now the storage manager's responsibility. + * about that, because it's now the storage manager's responsibility + * (but only if we called smgrwrite, not smgrblindwrt). */ - ClearBufferDirtiedByMe(BufferDescriptorGetBuffer(bufHdr), bufHdr); + if (reln != (Relation) NULL) + ClearBufferDirtiedByMe(BufferDescriptorGetBuffer(bufHdr), bufHdr); BufferFlushCount++; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 233bbb0ac2..b30b0386af 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.65 2000/04/09 04:43:20 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.66 2000/04/10 23:41:51 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -48,11 +48,10 @@ typedef struct _MdfdVec { int mdfd_vfd; /* fd number in vfd pool */ - int mdfd_flags; /* free, temporary */ + int mdfd_flags; /* fd status flags */ /* these are the assigned bits in mdfd_flags: */ #define MDFD_FREE (1 << 0)/* unused entry */ -#define MDFD_TEMP (1 << 1)/* close this entry at transaction end */ int mdfd_lstbcnt; /* most recent block count */ int mdfd_nextFree; /* next free vector */ @@ -72,8 +71,8 @@ static void mdclose_fd(int fd); static int _mdfd_getrelnfd(Relation reln); static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags); static MdfdVec *_mdfd_getseg(Relation reln, int blkno); -static MdfdVec *_mdfd_blind_getseg(char *dbname, char *relname, - Oid dbid, Oid relid, int blkno); +static int _mdfd_blind_getseg(char *dbname, char *relname, + Oid dbid, Oid relid, int blkno); static int _fdvec_alloc(void); static void _fdvec_free(int); static BlockNumber _mdnblocks(File file, Size blcksz); @@ -572,7 +571,8 @@ mdflush(Relation reln, BlockNumber blocknum, char *buffer) * * We have to be able to do this using only the name and OID of * the database and relation in which the block belongs. Otherwise - * this is just like mdwrite(). + * this is much like mdwrite(). If dofsync is TRUE, then we fsync + * the file, making it more like mdflush(). */ int mdblindwrt(char *dbname, @@ -580,15 +580,16 @@ mdblindwrt(char *dbname, Oid dbid, Oid relid, BlockNumber blkno, - char *buffer) + char *buffer, + bool dofsync) { int status; long seekpos; - MdfdVec *v; + int fd; - v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); + fd = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); - if (v == NULL) + if (fd < 0) return SM_FAIL; #ifndef LET_OS_MANAGE_FILESIZE @@ -601,11 +602,22 @@ mdblindwrt(char *dbname, seekpos = (long) (BLCKSZ * (blkno)); #endif - if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) + if (lseek(fd, seekpos, SEEK_SET) != seekpos) + { + close(fd); return SM_FAIL; + } status = SM_SUCCESS; - if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) + + /* write and optionally sync the block */ + if (write(fd, buffer, BLCKSZ) != BLCKSZ) + status = SM_FAIL; + else if (dofsync && + pg_fsync(fd) < 0) + status = SM_FAIL; + + if (close(fd) < 0) status = SM_FAIL; return status; @@ -633,7 +645,8 @@ mdmarkdirty(Relation reln, BlockNumber blkno) * * We have to be able to do this using only the name and OID of * the database and relation in which the block belongs. Otherwise - * this is just like mdmarkdirty(). + * this is much like mdmarkdirty(). However, we do the fsync immediately + * rather than building md/fd datastructures to postpone it till later. */ int mdblindmarkdirty(char *dbname, @@ -642,16 +655,23 @@ mdblindmarkdirty(char *dbname, Oid relid, BlockNumber blkno) { - MdfdVec *v; + int status; + int fd; - v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); + fd = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); - if (v == NULL) + if (fd < 0) return SM_FAIL; - FileMarkDirty(v->mdfd_vfd); + status = SM_SUCCESS; - return SM_SUCCESS; + if (pg_fsync(fd) < 0) + status = SM_FAIL; + + if (close(fd) < 0) + status = SM_FAIL; + + return status; } /* @@ -820,24 +840,15 @@ mdcommit() v = &Md_fdvec[i]; if (v->mdfd_flags & MDFD_FREE) continue; - if (v->mdfd_flags & MDFD_TEMP) - { - /* Sync and close the file */ - mdclose_fd(i); - } - else - { - /* Sync, but keep the file entry */ - + /* Sync the file entry */ #ifndef LET_OS_MANAGE_FILESIZE - for ( ; v != (MdfdVec *) NULL; v = v->mdfd_chain) + for ( ; v != (MdfdVec *) NULL; v = v->mdfd_chain) #else - if (v != (MdfdVec *) NULL) + if (v != (MdfdVec *) NULL) #endif - { - if (FileSync(v->mdfd_vfd) < 0) - return SM_FAIL; - } + { + if (FileSync(v->mdfd_vfd) < 0) + return SM_FAIL; } } @@ -854,21 +865,9 @@ mdcommit() int mdabort() { - int i; - MdfdVec *v; - - for (i = 0; i < CurFd; i++) - { - v = &Md_fdvec[i]; - if (v->mdfd_flags & MDFD_FREE) - continue; - if (v->mdfd_flags & MDFD_TEMP) - { - /* Close the file */ - mdclose_fd(i); - } - } - + /* We don't actually have to do anything here. fd.c will discard + * fsync-needed bits in its AtEOXact_Files() routine. + */ return SM_SUCCESS; } @@ -1057,102 +1056,52 @@ _mdfd_getseg(Relation reln, int blkno) return v; } -/* Find the segment of the relation holding the specified block. - * This is the same as _mdfd_getseg() except that we must work - * "blind" with no Relation struct. +/* + * Find the segment of the relation holding the specified block. * - * NOTE: we have no easy way to tell whether a FD already exists for the - * target relation, so we always make a new one. This should probably - * be improved somehow, but I doubt it's a significant performance issue - * under normal circumstances. The FD is marked to be closed at end of xact - * so that we don't accumulate a lot of dead FDs. + * This performs the same work as _mdfd_getseg() except that we must work + * "blind" with no Relation struct. We assume that we are not likely to + * touch the same relation again soon, so we do not create an FD entry for + * the relation --- we just open a kernel file descriptor which will be + * used and promptly closed. The return value is the kernel descriptor, + * or -1 on failure. */ -static MdfdVec * +static int _mdfd_blind_getseg(char *dbname, char *relname, Oid dbid, Oid relid, int blkno) { - MdfdVec *v; char *path; int fd; - int vfd; #ifndef LET_OS_MANAGE_FILESIZE int segno; - int targsegno; #endif - /* construct the path to the file and open it */ + /* construct the path to the relation */ path = relpath_blind(dbname, relname, dbid, relid); -#ifndef __CYGWIN32__ - fd = FileNameOpenFile(path, O_RDWR, 0600); -#else - fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600); -#endif - - if (fd < 0) - return NULL; - - vfd = _fdvec_alloc(); - if (vfd < 0) - return NULL; - - Md_fdvec[vfd].mdfd_vfd = fd; - Md_fdvec[vfd].mdfd_flags = MDFD_TEMP; - Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); #ifndef LET_OS_MANAGE_FILESIZE - Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL; - -#ifdef DIAGNOSTIC - if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE) - elog(FATAL, "segment too big on relopen!"); -#endif - - targsegno = blkno / RELSEG_SIZE; - for (v = &Md_fdvec[vfd], segno = 1; segno <= targsegno; segno++) + /* append the '.segno', if needed */ + segno = blkno / RELSEG_SIZE; + if (segno > 0) { - char *segpath; - MdfdVec *newv; - MemoryContext oldcxt; + char *segpath = (char *) palloc(strlen(path) + 12); - segpath = (char *) palloc(strlen(path) + 12); sprintf(segpath, "%s.%d", path, segno); - -#ifndef __CYGWIN32__ - fd = FileNameOpenFile(segpath, O_RDWR | O_CREAT, 0600); -#else - fd = FileNameOpenFile(segpath, O_RDWR | O_BINARY | O_CREAT, 0600); + pfree(path); + path = segpath; + } #endif - pfree(segpath); - - if (fd < 0) - return (MdfdVec *) NULL; - - /* allocate an mdfdvec entry for it */ - oldcxt = MemoryContextSwitchTo(MdCxt); - newv = (MdfdVec *) palloc(sizeof(MdfdVec)); - MemoryContextSwitchTo(oldcxt); - - /* fill the entry */ - newv->mdfd_vfd = fd; - newv->mdfd_flags = MDFD_TEMP; - newv->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); - newv->mdfd_chain = (MdfdVec *) NULL; -#ifdef DIAGNOSTIC - if (newv->mdfd_lstbcnt > RELSEG_SIZE) - elog(FATAL, "segment too big on open!"); -#endif - v->mdfd_chain = newv; - v = newv; - } +#ifndef __CYGWIN32__ + fd = open(path, O_RDWR, 0600); #else - v = &Md_fdvec[vfd]; + fd = open(path, O_RDWR | O_BINARY, 0600); #endif pfree(path); - return v; + return fd; } static BlockNumber diff --git a/src/backend/storage/smgr/mm.c b/src/backend/storage/smgr/mm.c index fc3acead66..a5b22cbcc5 100644 --- a/src/backend/storage/smgr/mm.c +++ b/src/backend/storage/smgr/mm.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.18 2000/01/26 05:57:05 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.19 2000/04/10 23:41:51 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -478,7 +478,8 @@ mmblindwrt(char *dbstr, Oid dbid, Oid relid, BlockNumber blkno, - char *buffer) + char *buffer, + bool dofsync) { return SM_FAIL; } diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 839636b118..27cad952ae 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.33 2000/04/09 04:43:20 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.34 2000/04/10 23:41:52 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -38,7 +38,8 @@ typedef struct f_smgr char *buffer); int (*smgr_blindwrt) (char *dbname, char *relname, Oid dbid, Oid relid, - BlockNumber blkno, char *buffer); + BlockNumber blkno, char *buffer, + bool dofsync); int (*smgr_markdirty) (Relation reln, BlockNumber blkno); int (*smgr_blindmarkdirty) (char *dbname, char *relname, Oid dbid, Oid relid, @@ -293,7 +294,8 @@ smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer) * this case, the buffer manager will call smgrblindwrt() with * the name and OID of the database and the relation to which the * buffer belongs. Every storage manager must be able to force - * this page down to stable storage in this circumstance. + * this page down to stable storage in this circumstance. The + * write should be synchronous if dofsync is true. */ int smgrblindwrt(int16 which, @@ -302,7 +304,8 @@ smgrblindwrt(int16 which, Oid dbid, Oid relid, BlockNumber blkno, - char *buffer) + char *buffer, + bool dofsync) { char *dbstr; char *relstr; @@ -313,7 +316,7 @@ smgrblindwrt(int16 which, relstr = pstrdup(relname); status = (*(smgrsw[which].smgr_blindwrt)) (dbstr, relstr, dbid, relid, - blkno, buffer); + blkno, buffer, dofsync); if (status == SM_FAIL) elog(ERROR, "cannot write block %d of %s [%s] blind", diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 053a63196e..bc0ec04bb2 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: smgr.h,v 1.18 2000/04/09 04:43:18 tgl Exp $ + * $Id: smgr.h,v 1.19 2000/04/10 23:41:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -37,7 +37,8 @@ extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer); extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid, Oid relid, - BlockNumber blkno, char *buffer); + BlockNumber blkno, char *buffer, + bool dofsync); extern int smgrmarkdirty(int16 which, Relation reln, BlockNumber blkno); extern int smgrblindmarkdirty(int16 which, char *dbname, char *relname, Oid dbid, Oid relid, @@ -62,7 +63,8 @@ extern int mdread(Relation reln, BlockNumber blocknum, char *buffer); extern int mdwrite(Relation reln, BlockNumber blocknum, char *buffer); extern int mdflush(Relation reln, BlockNumber blocknum, char *buffer); extern int mdblindwrt(char *dbname, char *relname, Oid dbid, Oid relid, - BlockNumber blkno, char *buffer); + BlockNumber blkno, char *buffer, + bool dofsync); extern int mdmarkdirty(Relation reln, BlockNumber blkno); extern int mdblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid, BlockNumber blkno); @@ -84,7 +86,8 @@ extern int mmread(Relation reln, BlockNumber blocknum, char *buffer); extern int mmwrite(Relation reln, BlockNumber blocknum, char *buffer); extern int mmflush(Relation reln, BlockNumber blocknum, char *buffer); extern int mmblindwrt(char *dbname, char *relname, Oid dbid, Oid relid, - BlockNumber blkno, char *buffer); + BlockNumber blkno, char *buffer, + bool dofsync); extern int mmmarkdirty(Relation reln, BlockNumber blkno); extern int mmblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid, BlockNumber blkno); -- 2.40.0