From 9d645fd84c330610f85951457b71f56a709ab3de Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 16 Mar 2001 05:44:33 +0000 Subject: [PATCH] Support syncing WAL log to disk using either fsync(), fdatasync(), O_SYNC, or O_DSYNC (as available on a given platform). Add GUC parameter to control sync method. Also, add defense to XLogWrite to prevent it from going nuts if passed a target write position that's past the end of the buffers so far filled by XLogInsert. --- doc/src/sgml/runtime.sgml | 23 +- doc/src/sgml/wal.sgml | 29 ++- src/backend/access/transam/xlog.c | 231 ++++++++++++++++-- src/backend/utils/misc/guc.c | 50 ++-- src/backend/utils/misc/postgresql.conf.sample | 2 + src/include/access/xlog.h | 14 +- 6 files changed, 287 insertions(+), 62 deletions(-) diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml index f321cea669..f7b9f26f43 100644 --- a/doc/src/sgml/runtime.sgml +++ b/doc/src/sgml/runtime.sgml @@ -1,5 +1,5 @@ @@ -1224,8 +1224,8 @@ env PGOPTIONS='-c geqo=off' psql WAL_BUFFERS (integer) - Number of disk-page buffers for WAL log. This option can only be set - at server start. + Number of disk-page buffers in shared memory for WAL log. + This option can only be set at server start. @@ -1250,6 +1250,23 @@ env PGOPTIONS='-c geqo=off' psql + + + WAL_SYNC_METHOD (string) + + + Method used for forcing WAL updates out to disk. Possible + values are + FSYNC (call fsync() at each commit), + FDATASYNC (call fdatasync() at each commit), + OPEN_SYNC (write WAL files with open() option O_SYNC), or + OPEN_DATASYNC (write WAL files with open() option O_DSYNC). + Not all of these choices are available on all platforms. + This option can only be set at server start or in the + postgresql.conf file. + + + diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index c92ccd9d23..69616409e8 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -1,4 +1,4 @@ - + Write-Ahead Logging (<acronym>WAL</acronym>) @@ -281,15 +281,6 @@ CHECKPOINT. 
- - Setting the WAL_DEBUG parameter to any non-zero - value will result in each LogInsert and - LogFlush WAL call being - logged to standard error. At present, it makes no difference what - the non-zero value is. This option may be replaced by a more - general mechanism in the future. - - The COMMIT_DELAY parameter defines for how many microseconds the backend will sleep after writing a commit @@ -304,6 +295,24 @@ ten milliseconds, so that any nonzero COMMIT_DELAY setting between 1 and 10000 microseconds will have the same effect. + + + The WAL_SYNC_METHOD parameter determines how + Postgres will ask the kernel to force WAL updates out to disk. + All the options should be the same as far as reliability goes, + but it's quite platform-specific which one will be the fastest. + Note that this parameter is irrelevant if FSYNC + has been turned off. + + + + Setting the WAL_DEBUG parameter to any non-zero + value will result in each XLogInsert and + XLogFlush WAL call being + logged to standard error. At present, it makes no difference what + the non-zero value is. This option may be replaced by a more + general mechanism in the future. + diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6c2ce3f1bd..d2d75d652c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.58 2001/03/14 20:23:04 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.59 2001/03/16 05:44:33 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -42,6 +42,47 @@ #include "miscadmin.h" +/* + * This chunk of hackery attempts to determine which file sync methods + * are available on the current platform, and to choose an appropriate + * default method. 
We assume that fsync() is always available, and that + * configure determined whether fdatasync() is. + */ +#define SYNC_METHOD_FSYNC 0 +#define SYNC_METHOD_FDATASYNC 1 +#define SYNC_METHOD_OPEN 2 /* used for both O_SYNC and O_DSYNC */ + +#if defined(O_SYNC) +# define OPEN_SYNC_FLAG O_SYNC +#else +# if defined(O_FSYNC) +# define OPEN_SYNC_FLAG O_FSYNC +# endif +#endif + +#if defined(OPEN_SYNC_FLAG) +# if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG) +# define OPEN_DATASYNC_FLAG O_DSYNC +# endif +#endif + +#if defined(OPEN_DATASYNC_FLAG) +# define DEFAULT_SYNC_METHOD_STR "open_datasync" +# define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN +# define DEFAULT_SYNC_FLAGBIT OPEN_DATASYNC_FLAG +#else +# if defined(HAVE_FDATASYNC) +# define DEFAULT_SYNC_METHOD_STR "fdatasync" +# define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC +# define DEFAULT_SYNC_FLAGBIT 0 +# else +# define DEFAULT_SYNC_METHOD_STR "fsync" +# define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC +# define DEFAULT_SYNC_FLAGBIT 0 +# endif +#endif + + /* Max time to wait to acquire XLog activity locks */ #define XLOG_LOCK_TIMEOUT (5*60*1000000) /* 5 minutes */ /* Max time to wait to acquire checkpoint lock */ @@ -52,10 +93,18 @@ int CheckPointSegments = 3; int XLOGbuffers = 8; int XLOGfiles = 0; /* how many files to pre-allocate during ckpt */ int XLOG_DEBUG = 0; +char *XLOG_sync_method = NULL; +const char XLOG_sync_method_default[] = DEFAULT_SYNC_METHOD_STR; char XLOG_archive_dir[MAXPGPATH]; /* null string means delete 'em */ +/* these are derived from XLOG_sync_method by assign_xlog_sync_method */ +static int sync_method = DEFAULT_SYNC_METHOD; +static int open_sync_bit = DEFAULT_SYNC_FLAGBIT; + #define MinXLOGbuffers 4 +#define XLOG_SYNC_BIT (enableFsync ? 
open_sync_bit : 0) + /* * ThisStartUpID will be same in all backends --- it identifies current @@ -365,6 +414,7 @@ static void WriteControlFile(void); static void ReadControlFile(void); static char *str_time(time_t tnow); static void xlog_outrec(char *buf, XLogRecord *record); +static void issue_xlog_fsync(void); /* @@ -917,6 +967,15 @@ XLogWrite(XLogwrtRqst WriteRqst) while (XLByteLT(LogwrtResult.Write, WriteRqst.Write)) { + /* + * Make sure we're not ahead of the insert process. This could + * happen if we're passed a bogus WriteRqst.Write that is past the + * end of the last page that's been initialized by + * AdvanceXLInsertBuffer. + */ + if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx])) + elog(STOP, "XLogWrite: write request is past end of log"); + /* Advance LogwrtResult.Write to end of current buffer page */ LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx]; ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write); @@ -1004,9 +1063,7 @@ XLogWrite(XLogwrtRqst WriteRqst) */ if (openLogOff >= XLogSegSize && !ispartialpage) { - if (pg_fdatasync(openLogFile) != 0) - elog(STOP, "fsync(logfile %u seg %u) failed: %m", - openLogId, openLogSeg); + issue_xlog_fsync(); LogwrtResult.Flush = LogwrtResult.Write; /* end of current page */ } @@ -1030,24 +1087,24 @@ XLogWrite(XLogwrtRqst WriteRqst) * we might have no open file or the wrong one. However, we do * not need to fsync more than one file. 
*/ - if (openLogFile >= 0 && - !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg)) + if (sync_method != SYNC_METHOD_OPEN) { - if (close(openLogFile) != 0) - elog(STOP, "close(logfile %u seg %u) failed: %m", - openLogId, openLogSeg); - openLogFile = -1; - } - if (openLogFile < 0) - { - XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg); - openLogFile = XLogFileOpen(openLogId, openLogSeg, false); - openLogOff = 0; + if (openLogFile >= 0 && + !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg)) + { + if (close(openLogFile) != 0) + elog(STOP, "close(logfile %u seg %u) failed: %m", + openLogId, openLogSeg); + openLogFile = -1; + } + if (openLogFile < 0) + { + XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg); + openLogFile = XLogFileOpen(openLogId, openLogSeg, false); + openLogOff = 0; + } + issue_xlog_fsync(); } - - if (pg_fdatasync(openLogFile) != 0) - elog(STOP, "fsync(logfile %u seg %u) failed: %m", - openLogId, openLogSeg); LogwrtResult.Flush = LogwrtResult.Write; } @@ -1191,7 +1248,8 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent) */ if (*usexistent) { - fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT, + S_IRUSR | S_IWUSR); if (fd < 0) { if (errno != ENOENT) @@ -1208,6 +1266,7 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent) unlink(tpath); unlink(path); + /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */ fd = BasicOpenFile(tpath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) @@ -1220,8 +1279,8 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent) * allow "holes" in files, just seeking to the end doesn't allocate * intermediate space. This way, we know that we have all the space * and (after the fsync below) that all the indirect blocks are down - * on disk. Therefore, fdatasync(2) will be sufficient to sync future - * writes to the log file. + * on disk. 
Therefore, fdatasync(2) or O_DSYNC will be sufficient to + * sync future writes to the log file. */ MemSet(zbuffer, 0, sizeof(zbuffer)); for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer)) @@ -1261,7 +1320,8 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent) log, seg); #endif - fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT, + S_IRUSR | S_IWUSR); if (fd < 0) elog(STOP, "InitReopen(logfile %u seg %u) failed: %m", log, seg); @@ -1280,7 +1340,8 @@ XLogFileOpen(uint32 log, uint32 seg, bool econt) XLogFileName(path, log, seg); - fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT, + S_IRUSR | S_IWUSR); if (fd < 0) { if (econt && errno == ENOENT) @@ -1845,7 +1906,8 @@ WriteControlFile(void) memset(buffer, 0, BLCKSZ); memcpy(buffer, ControlFile, sizeof(ControlFileData)); - fd = BasicOpenFile(ControlFilePath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); + fd = BasicOpenFile(ControlFilePath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + S_IRUSR | S_IWUSR); if (fd < 0) elog(STOP, "WriteControlFile failed to create control file (%s): %m", ControlFilePath); @@ -2852,3 +2914,120 @@ xlog_outrec(char *buf, XLogRecord *record) sprintf(buf + strlen(buf), ": %s", RmgrTable[record->xl_rmid].rm_name); } + + +/* + * GUC support routines + */ + +bool +check_xlog_sync_method(const char *method) +{ + if (strcasecmp(method, "fsync") == 0) return true; +#ifdef HAVE_FDATASYNC + if (strcasecmp(method, "fdatasync") == 0) return true; +#endif +#ifdef OPEN_SYNC_FLAG + if (strcasecmp(method, "open_sync") == 0) return true; +#endif +#ifdef OPEN_DATASYNC_FLAG + if (strcasecmp(method, "open_datasync") == 0) return true; +#endif + return false; +} + +void +assign_xlog_sync_method(const char *method) +{ + int new_sync_method; + int new_sync_bit; + + if (strcasecmp(method, "fsync") == 0) + { + new_sync_method = 
SYNC_METHOD_FSYNC; + new_sync_bit = 0; + } +#ifdef HAVE_FDATASYNC + else if (strcasecmp(method, "fdatasync") == 0) + { + new_sync_method = SYNC_METHOD_FDATASYNC; + new_sync_bit = 0; + } +#endif +#ifdef OPEN_SYNC_FLAG + else if (strcasecmp(method, "open_sync") == 0) + { + new_sync_method = SYNC_METHOD_OPEN; + new_sync_bit = OPEN_SYNC_FLAG; + } +#endif +#ifdef OPEN_DATASYNC_FLAG + else if (strcasecmp(method, "open_datasync") == 0) + { + new_sync_method = SYNC_METHOD_OPEN; + new_sync_bit = OPEN_DATASYNC_FLAG; + } +#endif + else + { + /* Can't get here unless guc.c screwed up */ + elog(ERROR, "Bogus xlog sync method %s", method); + new_sync_method = 0; /* keep compiler quiet */ + new_sync_bit = 0; + } + + if (sync_method != new_sync_method || open_sync_bit != new_sync_bit) + { + /* + * To ensure that no blocks escape unsynced, force an fsync on + * the currently open log segment (if any). Also, if the open + * flag is changing, close the log file so it will be reopened + * (with new flag bit) at next use. 
+ */ + if (openLogFile >= 0) + { + if (pg_fsync(openLogFile) != 0) + elog(STOP, "fsync(logfile %u seg %u) failed: %m", + openLogId, openLogSeg); + if (open_sync_bit != new_sync_bit) + { + if (close(openLogFile) != 0) + elog(STOP, "close(logfile %u seg %u) failed: %m", + openLogId, openLogSeg); + openLogFile = -1; + } + } + sync_method = new_sync_method; + open_sync_bit = new_sync_bit; + } +} + + +/* + * Issue appropriate kind of fsync (if any) on the current XLOG output file + */ +static void +issue_xlog_fsync(void) +{ + switch (sync_method) + { + case SYNC_METHOD_FSYNC: + if (pg_fsync(openLogFile) != 0) + elog(STOP, "fsync(logfile %u seg %u) failed: %m", + openLogId, openLogSeg); + break; +#ifdef HAVE_FDATASYNC + case SYNC_METHOD_FDATASYNC: + if (pg_fdatasync(openLogFile) != 0) + elog(STOP, "fdatasync(logfile %u seg %u) failed: %m", + openLogId, openLogSeg); + break; +#endif + case SYNC_METHOD_OPEN: + /* write synced it already */ + break; + default: + elog(STOP, "bogus sync_method %d", sync_method); + break; + } +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 7e6769d3c8..d05bb75a29 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4,7 +4,7 @@ * Support for grand unified configuration scheme, including SET * command, configuration file, and command line options. * - * $Header: /cvsroot/pgsql/src/backend/utils/misc/guc.c,v 1.32 2001/03/13 01:17:06 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/misc/guc.c,v 1.33 2001/03/16 05:44:33 tgl Exp $ * * Copyright 2000 by PostgreSQL Global Development Group * Written by Peter Eisentraut . 
@@ -20,6 +20,7 @@ #include "utils/guc.h" +#include "access/xlog.h" #include "commands/async.h" #include "libpq/auth.h" #include "libpq/pqcomm.h" @@ -33,23 +34,17 @@ #include "tcop/tcopprot.h" -/* XXX should be in a header file */ +/* XXX these should be in other modules' header files */ extern bool Log_connections; - -extern int CheckPointSegments; extern int CheckPointTimeout; -extern int XLOGbuffers; -extern int XLOGfiles; -extern int XLOG_DEBUG; extern int CommitDelay; extern int CommitSiblings; - extern bool FixBTree; #ifdef ENABLE_SYSLOG extern char *Syslog_facility; extern char *Syslog_ident; -bool check_facility(const char *facility); +static bool check_facility(const char *facility); #endif /* @@ -138,7 +133,8 @@ struct config_string GucContext context; char **variable; const char *default_val; - bool (*parse_hook)(const char *); + bool (*parse_hook)(const char *proposed); + void (*assign_hook)(const char *newval); }; @@ -330,25 +326,29 @@ static struct config_string ConfigureNamesString[] = { {"krb_server_keyfile", PGC_POSTMASTER, &pg_krb_server_keyfile, - PG_KRB_SRVTAB, NULL}, - - {"unix_socket_group", PGC_POSTMASTER, &Unix_socket_group, - "", NULL}, + PG_KRB_SRVTAB, NULL, NULL}, #ifdef ENABLE_SYSLOG {"syslog_facility", PGC_POSTMASTER, &Syslog_facility, - "LOCAL0", check_facility}, + "LOCAL0", check_facility, NULL}, {"syslog_ident", PGC_POSTMASTER, &Syslog_ident, - "postgres", NULL}, + "postgres", NULL, NULL}, #endif + {"unix_socket_group", PGC_POSTMASTER, &Unix_socket_group, + "", NULL, NULL}, + {"unix_socket_directory", PGC_POSTMASTER, &UnixSocketDir, - "", NULL}, + "", NULL, NULL}, {"virtual_host", PGC_POSTMASTER, &VirtualHost, - "", NULL}, + "", NULL, NULL}, - {NULL, 0, NULL, NULL, NULL} + {"wal_sync_method", PGC_SIGHUP, &XLOG_sync_method, + XLOG_sync_method_default, + check_xlog_sync_method, assign_xlog_sync_method}, + + {NULL, 0, NULL, NULL, NULL, NULL} }; /******** end of options list ********/ @@ -723,7 +723,10 @@ set_config_option(const char * 
name, const char * value, GucContext elog(elevel, "out of memory"); return false; } - free(*conf->variable); + if (conf->assign_hook) + (conf->assign_hook)(str); + if (*conf->variable) + free(*conf->variable); *conf->variable = str; } } @@ -737,7 +740,10 @@ set_config_option(const char * name, const char * value, GucContext elog(elevel, "out of memory"); return false; } - free(*conf->variable); + if (conf->assign_hook) + (conf->assign_hook)(str); + if (*conf->variable) + free(*conf->variable); *conf->variable = str; } break; @@ -855,7 +861,7 @@ ParseLongOption(const char * string, char ** name, char ** value) #ifdef ENABLE_SYSLOG -bool +static bool check_facility(const char *facility) { if (strcasecmp(facility,"LOCAL0") == 0) return true; diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 88d1fe9437..d6641f5166 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -107,6 +107,8 @@ # #wal_buffers = 8 # min 4 #wal_files = 0 # range 0-64 +#wal_sync_method = fsync # fsync or fdatasync or open_sync or open_datasync +# Note: default wal_sync_method varies across platforms #wal_debug = 0 # range 0-16 #commit_delay = 0 # range 0-100000 #commit_siblings = 5 # range 1-1000 diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index fa51d68d39..d2dfb2e31c 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: xlog.h,v 1.20 2001/03/13 20:32:37 tgl Exp $ + * $Id: xlog.h,v 1.21 2001/03/16 05:44:33 tgl Exp $ */ #ifndef XLOG_H #define XLOG_H @@ -176,6 +176,15 @@ extern StartUpID ThisStartUpID; /* current SUI */ extern bool InRecovery; extern XLogRecPtr MyLastRecPtr; +/* these variables are GUC parameters related to XLOG */ +extern int CheckPointSegments; +extern int 
XLOGbuffers; +extern int XLOGfiles; +extern int XLOG_DEBUG; +extern char *XLOG_sync_method; +extern const char XLOG_sync_method_default[]; + + extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata); extern void XLogFlush(XLogRecPtr RecPtr); @@ -202,4 +211,7 @@ extern void GetRedoRecPtr(void); */ extern XLogRecPtr GetUndoRecPtr(void); +extern bool check_xlog_sync_method(const char *method); +extern void assign_xlog_sync_method(const char *method); + #endif /* XLOG_H */ -- 2.40.0