]> granicus.if.org Git - postgresql/commitdiff
Fix fallback implementation of pg_atomic_write_u32().
authorAndres Freund <andres@anarazel.de>
Fri, 7 Oct 2016 23:55:15 +0000 (16:55 -0700)
committerAndres Freund <andres@anarazel.de>
Sat, 8 Oct 2016 00:00:17 +0000 (17:00 -0700)
I somehow had assumed that in the spinlock (in turn possibly using
semaphores) based fallback atomics implementation 32 bit writes could be
done without a lock. As far as the write goes that's correct, since
postgres supports only platforms with single-copy atomicity for aligned
32bit writes.  But writing without holding the spinlock breaks
read-modify-write operations like pg_atomic_compare_exchange_u32(),
since they'll potentially "miss" a concurrent write, which can't happen
in actual hardware implementations.

In 9.6+ when using the fallback atomics implementation this could lead
to buffer header locks not being properly marked as released, and
potentially some related state corruption.  I don't see a related danger
in 9.5 (earliest release with the API), because pg_atomic_write_u32()
wasn't used in a concurrent manner there.

The state variable of local buffers, before this change, were
manipulated using pg_atomic_write_u32(), to avoid unnecessary
synchronization overhead. As that'd not be the case anymore, introduce
and use pg_atomic_unlocked_write_u32(), which does not correctly
interact with RMW operations.

This bug only caused issues when postgres is compiled on platforms
without atomics support (i.e. no common new platform), or when compiled
with --disable-atomics, which explains why this wasn't noticed in
testing.

Reported-By: Tom Lane
Discussion: <14947.1475690465@sss.pgh.pa.us>
Backpatch: 9.5-, where the atomic operations API was introduced.

src/backend/port/atomics.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/localbuf.c
src/include/port/atomics.h
src/include/port/atomics/fallback.h
src/include/port/atomics/generic.h
src/include/storage/buf_internals.h

index 42169a33cf5e4953aefb1e4363cff12f1f8e7e27..d5970e42b046711ba9db7a32be73aac66fbc6d3c 100644 (file)
@@ -104,6 +104,19 @@ pg_atomic_init_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val_)
        ptr->value = val_;
 }
 
+void
+pg_atomic_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val)
+{
+       /*
+        * One might think that an unlocked write doesn't need to acquire the
+        * spinlock, but one would be wrong. Even an unlocked write has to cause a
+        * concurrent pg_atomic_compare_exchange_u32() (et al) to fail.
+        */
+       SpinLockAcquire((slock_t *) &ptr->sema);
+       ptr->value = val;
+       SpinLockRelease((slock_t *) &ptr->sema);
+}
+
 bool
 pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
                                                                        uint32 *expected, uint32 newval)
index 76ade3727cd0ee8dc7364fdae21789c6a26847d0..2616b2578f2a03b66f4987ea54e8498970cda49b 100644 (file)
@@ -821,7 +821,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
                        Assert(buf_state & BM_VALID);
                        buf_state &= ~BM_VALID;
-                       pg_atomic_write_u32(&bufHdr->state, buf_state);
+                       pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
                }
                else
                {
@@ -941,7 +941,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                uint32          buf_state = pg_atomic_read_u32(&bufHdr->state);
 
                buf_state |= BM_VALID;
-               pg_atomic_write_u32(&bufHdr->state, buf_state);
+               pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
        }
        else
        {
@@ -3167,7 +3167,7 @@ FlushRelationBuffers(Relation rel)
                                                  false);
 
                                buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
-                               pg_atomic_write_u32(&bufHdr->state, buf_state);
+                               pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
                                /* Pop the error context stack */
                                error_context_stack = errcallback.previous;
index ca2388789d72fe08ecef846d37ed348fca08a005..fa961ad9c84b9614963c22a4cdc3cfb7107bc6f4 100644 (file)
@@ -138,7 +138,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
                        if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
                        {
                                buf_state += BUF_USAGECOUNT_ONE;
-                               pg_atomic_write_u32(&bufHdr->state, buf_state);
+                               pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
                        }
                }
                LocalRefCount[b]++;
@@ -181,7 +181,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
                        if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0)
                        {
                                buf_state -= BUF_USAGECOUNT_ONE;
-                               pg_atomic_write_u32(&bufHdr->state, buf_state);
+                               pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
                                trycounter = NLocBuffer;
                        }
                        else
@@ -222,7 +222,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
                /* Mark not-dirty now in case we error out below */
                buf_state &= ~BM_DIRTY;
-               pg_atomic_write_u32(&bufHdr->state, buf_state);
+               pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
                pgBufferUsage.local_blks_written++;
        }
@@ -249,7 +249,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
                /* mark buffer invalid just in case hash insert fails */
                CLEAR_BUFFERTAG(bufHdr->tag);
                buf_state &= ~(BM_VALID | BM_TAG_VALID);
-               pg_atomic_write_u32(&bufHdr->state, buf_state);
+               pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
        }
 
        hresult = (LocalBufferLookupEnt *)
@@ -266,7 +266,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
        buf_state |= BM_TAG_VALID;
        buf_state &= ~BUF_USAGECOUNT_MASK;
        buf_state += BUF_USAGECOUNT_ONE;
-       pg_atomic_write_u32(&bufHdr->state, buf_state);
+       pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
        *foundPtr = FALSE;
        return bufHdr;
@@ -302,7 +302,7 @@ MarkLocalBufferDirty(Buffer buffer)
 
        buf_state |= BM_DIRTY;
 
-       pg_atomic_write_u32(&bufHdr->state, buf_state);
+       pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 }
 
 /*
@@ -351,7 +351,7 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
                        CLEAR_BUFFERTAG(bufHdr->tag);
                        buf_state &= ~BUF_FLAG_MASK;
                        buf_state &= ~BUF_USAGECOUNT_MASK;
-                       pg_atomic_write_u32(&bufHdr->state, buf_state);
+                       pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
                }
        }
 }
@@ -395,7 +395,7 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
                        CLEAR_BUFFERTAG(bufHdr->tag);
                        buf_state &= ~BUF_FLAG_MASK;
                        buf_state &= ~BUF_USAGECOUNT_MASK;
-                       pg_atomic_write_u32(&bufHdr->state, buf_state);
+                       pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
                }
        }
 }
index f7884d72c6e6ea18625bbf471b545e7d0e17416e..4e15ff8764dc422faf9c57b6699d147a1b85f13d 100644 (file)
@@ -255,10 +255,12 @@ pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
 }
 
 /*
- * pg_atomic_write_u32 - unlocked write to atomic variable.
+ * pg_atomic_write_u32 - write to atomic variable.
  *
  * The write is guaranteed to succeed as a whole, i.e. it's not possible to
- * observe a partial write for any reader.
+ * observe a partial write for any reader.  Note that this correctly interacts
+ * with pg_atomic_compare_exchange_u32, in contrast to
+ * pg_atomic_unlocked_write_u32().
  *
  * No barrier semantics.
  */
@@ -270,6 +272,25 @@ pg_atomic_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
        pg_atomic_write_u32_impl(ptr, val);
 }
 
+/*
+ * pg_atomic_unlocked_write_u32 - unlocked write to atomic variable.
+ *
+ * The write is guaranteed to succeed as a whole, i.e. it's not possible to
+ * observe a partial write for any reader.  But note that writing this way is
+ * not guaranteed to correctly interact with read-modify-write operations like
+ * pg_atomic_compare_exchange_u32.  This should only be used in cases where
+ * minor performance regressions due to atomics emulation are unacceptable.
+ *
+ * No barrier semantics.
+ */
+static inline void
+pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
+{
+       AssertPointerAlignment(ptr, 4);
+
+       pg_atomic_unlocked_write_u32_impl(ptr, val);
+}
+
 /*
  * pg_atomic_exchange_u32 - exchange newval with current value
  *
index bdaa795abe084f7a64d88c9c93a0848fcff59097..2290fff196ff486597cc8f5f5f38f46b44b9e1f9 100644 (file)
@@ -133,6 +133,9 @@ pg_atomic_unlocked_test_flag_impl(volatile pg_atomic_flag *ptr)
 #define PG_HAVE_ATOMIC_INIT_U32
 extern void pg_atomic_init_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val_);
 
+#define PG_HAVE_ATOMIC_WRITE_U32
+extern void pg_atomic_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val);
+
 #define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
 extern bool pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
                                                                                                uint32 *expected, uint32 newval);
index 32a01136e61077bce5359eddb47d8c1a7adfb3b7..1acd19242b2946296d6255d191023dd8a291d494 100644 (file)
@@ -58,6 +58,15 @@ pg_atomic_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val)
 }
 #endif
 
+#ifndef PG_HAVE_ATOMIC_UNLOCKED_WRITE_U32
+#define PG_HAVE_ATOMIC_UNLOCKED_WRITE_U32
+static inline void
+pg_atomic_unlocked_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val)
+{
+       ptr->value = val;
+}
+#endif
+
 /*
  * provide fallback for test_and_set using atomic_exchange if available
  */
index e0dfb2f5b03b99add82df846f193d28e55d430f8..c7da9f6ac2f168c09d6adb6b467ffca1f9c23e68 100644 (file)
@@ -168,7 +168,8 @@ typedef struct buftag
  * We use this same struct for local buffer headers, but the locks are not
  * used and not all of the flag bits are useful either. To avoid unnecessary
  * overhead, manipulations of the state field should be done without actual
- * atomic operations (i.e. only pg_atomic_read/write).
+ * atomic operations (i.e. only pg_atomic_read_u32() and
+ * pg_atomic_unlocked_write_u32()).
  *
  * Be careful to avoid increasing the size of the struct when adding or
  * reordering members.  Keeping it below 64 bytes (the most common CPU