]> granicus.if.org Git - postgresql/commitdiff
Don't error out if recycling or removing an old WAL segment fails at the end
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>
Sun, 13 Sep 2009 18:32:08 +0000 (18:32 +0000)
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>
Sun, 13 Sep 2009 18:32:08 +0000 (18:32 +0000)
of checkpoint. Although the checkpoint has been written to WAL at that point
already, so that all data is safe, and we'll retry removing the WAL segment at
the next checkpoint, if such a failure persists we won't be able to remove any
other old WAL segments either and will eventually run out of disk space. It's
better to treat the failure as non-fatal, and move on to clean any other WAL
segment and continue with any other end-of-checkpoint cleanup.

We don't normally expect any such failures, but on Windows it can happen with
some anti-virus or backup software that lock files without FILE_SHARE_DELETE
flag.

Also, the loop in pgrename() to retry when the file is locked was broken. If a
file is locked on Windows, you get ERROR_SHARE_VIOLATION, not
ERROR_ACCESS_DENIED, at least on modern versions. Fix that, although I left
the check for ERROR_ACCESS_DENIED in there as well (presumably it was correct
in some environment), and added ERROR_LOCK_VIOLATION to be consistent with
similar checks in pgwin32_open(). Reduce the timeout on the loop from 30s to
10s, on the grounds that since it's been broken, we've effectively had a
timeout of 0s and no-one has complained, so a smaller timeout is actually
closer to the old behavior. A longer timeout would mean that if recycling a
WAL file fails because it's locked for some reason, InstallXLogFileSegment()
will hold ControlFileLock for longer, potentially blocking other backends, so
a long timeout isn't totally harmless.

While we're at it, set errno correctly in pgrename().

Backpatch to 8.2, which is the oldest version supported on Windows. The xlog.c
changes would make sense on other platforms and thus on older versions as
well, but since there's no such locking issues on other platforms, it's not
worth it.

src/backend/access/transam/xlog.c
src/port/dirmod.c

index 53ded25ed016c11d6ae505a028f47ce9c40b1aec..331809a3b91f01c28c5b71ac6eba21abdebe4d02 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.352 2009/09/10 09:42:10 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.353 2009/09/13 18:32:07 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -2262,12 +2262,14 @@ XLogFileInit(uint32 log, uint32 seg,
                                                                *use_existent, &max_advance,
                                                                use_lock))
        {
-               /* No need for any more future segments... */
+               /*
+                * No need for any more future segments, or InstallXLogFileSegment()
+                * failed to rename the file into place. If the rename failed, opening
+                * the file below will fail.
+                */
                unlink(tmppath);
        }
 
-       elog(DEBUG2, "done creating and filling new WAL file");
-
        /* Set flag to tell caller there was no existent file */
        *use_existent = false;
 
@@ -2280,6 +2282,8 @@ XLogFileInit(uint32 log, uint32 seg,
                   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
                                  path, log, seg)));
 
+       elog(DEBUG2, "done creating and filling new WAL file");
+
        return fd;
 }
 
@@ -2409,10 +2413,9 @@ XLogFileCopy(uint32 log, uint32 seg,
  * place.  This should be TRUE except during bootstrap log creation.  The
  * caller must *not* hold the lock at call.
  *
- * Returns TRUE if file installed, FALSE if not installed because of
- * exceeding max_advance limit.  On Windows, we also return FALSE if we
- * can't rename the file into place because someone's got it open.
- * (Any other kind of failure causes ereport().)
+ * Returns TRUE if the file was installed successfully.  FALSE indicates that
+ * max_advance limit was exceeded, or an error occurred while renaming the
+ * file into place.
  */
 static bool
 InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
@@ -2460,31 +2463,26 @@ InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
         */
 #if HAVE_WORKING_LINK
        if (link(tmppath, path) < 0)
-               ereport(ERROR,
+       {
+               if (use_lock)
+                       LWLockRelease(ControlFileLock);
+               ereport(LOG,
                                (errcode_for_file_access(),
                                 errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
                                                tmppath, path, *log, *seg)));
+               return false;
+       }
        unlink(tmppath);
 #else
        if (rename(tmppath, path) < 0)
        {
-#ifdef WIN32
-#if !defined(__CYGWIN__)
-               if (GetLastError() == ERROR_ACCESS_DENIED)
-#else
-               if (errno == EACCES)
-#endif
-               {
-                       if (use_lock)
-                               LWLockRelease(ControlFileLock);
-                       return false;
-               }
-#endif   /* WIN32 */
-
-               ereport(ERROR,
+               if (use_lock)
+                       LWLockRelease(ControlFileLock);
+               ereport(LOG,
                                (errcode_for_file_access(),
                                 errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
                                                tmppath, path, *log, *seg)));
+               return false;
        }
 #endif
 
@@ -3128,19 +3126,25 @@ RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
                                         */
                                        snprintf(newpath, MAXPGPATH, "%s.deleted", path);
                                        if (rename(path, newpath) != 0)
-                                               ereport(ERROR,
+                                       {
+                                               ereport(LOG,
                                                                (errcode_for_file_access(),
-                                                                errmsg("could not rename old transaction log file \"%s\"",
+                                                                errmsg("could not rename old transaction log file \"%s\": %m",
                                                                                path)));
+                                               continue;
+                                       }
                                        rc = unlink(newpath);
 #else
                                        rc = unlink(path);
 #endif
                                        if (rc != 0)
-                                               ereport(ERROR,
+                                       {
+                                               ereport(LOG,
                                                                (errcode_for_file_access(),
                                                                 errmsg("could not remove old transaction log file \"%s\": %m",
                                                                                path)));
+                                               continue;
+                                       }
                                        CheckpointStats.ckpt_segs_removed++;
                                }
 
index a31e74cbd31c314779235a529a6d4513119c9056..27b70644c47b36bb5a69ebc46a98c928a403e0b2 100644 (file)
@@ -10,7 +10,7 @@
  *     Win32 (NT4 and newer).
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/port/dirmod.c,v 1.58 2009/06/11 14:49:15 momjian Exp $
+ *       $PostgreSQL: pgsql/src/port/dirmod.c,v 1.59 2009/09/13 18:32:08 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -120,7 +120,8 @@ pgrename(const char *from, const char *to)
         * We need to loop because even though PostgreSQL uses flags that allow
         * rename while the file is open, other applications might have the file
         * open without those flags.  However, we won't wait indefinitely for
-        * someone else to close the file.
+        * someone else to close the file, as the caller might be holding locks
+        * and blocking other backends.
         */
 #if defined(WIN32) && !defined(__CYGWIN__)
        while (!MoveFileEx(from, to, MOVEFILE_REPLACE_EXISTING))
@@ -129,13 +130,28 @@ pgrename(const char *from, const char *to)
 #endif
        {
 #if defined(WIN32) && !defined(__CYGWIN__)
-               if (GetLastError() != ERROR_ACCESS_DENIED)
+               DWORD           err = GetLastError();
+
+               _dosmaperr(err);
+
+               /*
+                * Modern NT-based Windows versions return ERROR_SHARING_VIOLATION
+                * if another process has the file open without FILE_SHARE_DELETE.
+                * ERROR_LOCK_VIOLATION has also been seen with some anti-virus
+                * software. This used to check for just ERROR_ACCESS_DENIED, so
+                * presumably you can get that too with some OS versions. We don't
+                * expect real permission errors where we currently use rename().
+                */
+               if (err != ERROR_ACCESS_DENIED &&
+                       err != ERROR_SHARING_VIOLATION &&
+                       err != ERROR_LOCK_VIOLATION)
+                       return -1;
 #else
                if (errno != EACCES)
-#endif
-                       /* set errno? */
                        return -1;
-               if (++loops > 300)              /* time out after 30 sec */
+#endif
+
+               if (++loops > 100)              /* time out after 10 sec */
                        return -1;
                pg_usleep(100000);              /* us */
        }
@@ -155,14 +171,14 @@ pgunlink(const char *path)
         * We need to loop because even though PostgreSQL uses flags that allow
         * unlink while the file is open, other applications might have the file
         * open without those flags.  However, we won't wait indefinitely for
-        * someone else to close the file.
+        * someone else to close the file, as the caller might be holding locks
+        * and blocking other backends.
         */
        while (unlink(path))
        {
                if (errno != EACCES)
-                       /* set errno? */
                        return -1;
-               if (++loops > 300)              /* time out after 30 sec */
+               if (++loops > 100)              /* time out after 10 sec */
                        return -1;
                pg_usleep(100000);              /* us */
        }