From 0c8eda6258805223fa412ab55a1f130fbc51afa0 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 23 Sep 2011 17:52:43 -0400 Subject: [PATCH] Memory barrier support for PostgreSQL. This is not actually used anywhere yet, but it gets the basic infrastructure in place. It is fairly likely that there are bugs, and support for some important platforms may be missing, so we'll need to refine this as we go along. --- src/backend/storage/lmgr/README.barrier | 199 ++++++++++++++++++++++++ src/backend/storage/lmgr/s_lock.c | 1 + src/include/storage/barrier.h | 171 ++++++++++++++++++++ 3 files changed, 371 insertions(+) create mode 100644 src/backend/storage/lmgr/README.barrier create mode 100644 src/include/storage/barrier.h diff --git a/src/backend/storage/lmgr/README.barrier b/src/backend/storage/lmgr/README.barrier new file mode 100644 index 0000000000..f9f3593b77 --- /dev/null +++ b/src/backend/storage/lmgr/README.barrier @@ -0,0 +1,199 @@ +Memory Barriers +=============== + +Modern CPUs make extensive use of pipe-lining and out-of-order execution, +meaning that the CPU is often executing more than one instruction at a +time, and not necessarily in the order that the source code would suggest. +Furthermore, even before the CPU gets a chance to reorder operations, the +compiler may (and often does) reorganize the code for greater efficiency, +particularly at higher optimization levels. Optimizing compilers and +out-of-order execution are both critical for good performance, but they +can lead to surprising results when multiple processes access the same +memory space. + +Example +======= + +Suppose x is a pointer to a structure stored in shared memory, and that the +entire structure has been initialized to zero bytes. One backend executes +the following code fragment: + + x->foo = 1; + x->bar = 1; + +Meanwhile, at approximately the same time, another backend executes this +code fragment: + + bar = x->bar; + foo = x->foo; + +The second backend might end up with foo = 1 and bar = 1 (if it executes +both statements after the first backend), or with foo = 0 and bar = 0 (if +it executes both statements before the first backend), or with foo = 1 and +bar = 0 (if the first backend executes the first statement, the second +backend executes both statements, and then the first backend executes the +second statement). + +Surprisingly, however, the second backend could also end up with foo = 0 +and bar = 1. The compiler might swap the order of the two stores performed +by the first backend, or the two loads performed by the second backend. +Even if it doesn't, on a machine with weak memory ordering (such as PowerPC +or Itanium) the CPU might choose to execute either the loads or the stores +out of order. This surprising result can lead to bugs. + +A common pattern where this actually does result in a bug is when adding items +onto a queue. The writer does this: + + q->items[q->num_items] = new_item; + ++q->num_items; + +The reader does this: + + num_items = q->num_items; + for (i = 0; i < num_items; ++i) + /* do something with q->items[i] */ + +This code turns out to be unsafe, because the writer might increment +q->num_items before it finishes storing the new item into the appropriate slot. +More subtly, the reader might prefetch the contents of the q->items array +before reading q->num_items. Thus, there's still a bug here *even if the +writer does everything in the order we expect*. We need the writer to update +the array before bumping the item counter, and the reader to examine the item +counter before examining the array. + +Note that these types of highly counterintuitive bugs can *only* occur when +multiple processes are interacting with the same memory segment. A given +process always perceives its *own* writes to memory in program order. + +Avoiding Memory Ordering Bugs +============================= + +The simplest (and often best) way to avoid memory ordering bugs is to +protect the data structures involved with an lwlock. For more details, see +src/backend/storage/lmgr/README. For instance, in the above example, the +writer could acquire an lwlock in exclusive mode before appending to the +queue, and each reader could acquire the same lock in shared mode before +reading it. If the data structure is not heavily trafficked, this solution is +generally entirely adequate. + +However, in some cases, it is desirable to avoid the overhead of acquiring +and releasing locks. In this case, memory barriers may be used to ensure +that the apparent order of execution is as the programmer desires. In +PostgreSQL backend code, the pg_memory_barrier() macro may be used to achieve +this result. In the example above, we can prevent the reader from seeing a +garbage value by having the writer do this: + + q->items[q->num_items] = new_item; + pg_memory_barrier(); + ++q->num_items; + +And by having the reader do this: + + num_items = q->num_items; + pg_memory_barrier(); + for (i = 0; i < num_items; ++i) + /* do something with q->items[i] */ + +The pg_memory_barrier() macro will (1) prevent the compiler from rearranging +the code in such a way as to allow the memory accesses to occur out of order +and (2) generate any code (often, inline assembly) that is needed to prevent +the CPU from executing the memory accesses out of order. Specifically, the +barrier prevents loads and stores written after the barrier from being +performed before the barrier, and vice-versa. + +Although this code will work, it is needlessly inefficient. On systems with +strong memory ordering (such as x86), the CPU never reorders loads with other +loads, nor stores with other stores. It can, however, allow a load to +performed before a subsequent store. To avoid emitting unnecessary memory +instructions, we provide two additional primitives: pg_read_barrier(), and +pg_write_barrier(). When a memory barrier is being used to separate two +loads, use pg_read_barrier(); when it is separating two stores, use +pg_write_barrier(); when it is a separating a load and a store (in either +order), use pg_memory_barrier(). pg_memory_barrier() can always substitute +for either a read or a write barrier, but is typically more expensive, and +therefore should be used only when needed. + +With these guidelines in mind, the writer can do this: + + q->items[q->num_items] = new_item; + pg_write_barrier(); + ++q->num_items; + +And the reader can do this: + + num_items = q->num_items; + pg_read_barrier(); + for (i = 0; i < num_items; ++i) + /* do something with q->items[i] */ + +On machines with strong memory ordering, these weaker barriers will simply +prevent compiler rearrangement, without emitting any actual machine code. +On machines with weak memory ordering, they will will prevent compiler +reordering and also emit whatever hardware barrier may be required. Even +on machines with weak memory ordering, a read or write barrier may be able +to use a less expensive instruction than a full barrier. + +Weaknesses of Memory Barriers +============================= + +While memory barriers are a powerful tool, and much cheaper than locks, they +are also much less capable than locks. Here are some of the problems. + +1. Concurrent writers are unsafe. In the above example of a queue, using +memory barriers doesn't make it safe for two processes to add items to the +same queue at the same time. If more than one process can write to the queue, +a spinlock or lwlock must be used to synchronize access. The readers can +perhaps proceed without any lock, but the writers may not. + +Even very simple write operations often require additional synchronization. +For example, it's not safe for multiple writers to simultaneously execute +this code (supposing x is a pointer into shared memory): + + x->foo++; + +Although this may compile down to a single machine-language instruction, +the CPU will execute that instruction by reading the current value of foo, +adding one to it, and then storing the result back to the original address. +If two CPUs try to do this simultaneously, both may do their reads before +either one does their writes. Eventually we might be able to use an atomic +fetch-and-add instruction for this specific case on architectures that support +it, but we can't rely on that being available everywhere, and we currently +have no support for it at all. Use a lock. + +2. Eight-byte loads and stores aren't necessarily atomic. We assume in +various places in the source code that an aligned four-byte load or store is +atomic, and that other processes therefore won't see a half-set value. +Sadly, the same can't be said for eight-byte value: on some platforms, an +aligned eight-byte load or store will generate two four-byte operations. If +you need an atomic eight-byte read or write, you must make it atomic with a +lock. + +3. No ordering guarantees. While memory barriers ensure that any given +process performs loads and stores to shared memory in order, they don't +guarantee synchronization. In the queue example above, we can use memory +barriers to be sure that readers won't see garbage, but there's nothing to +say whether a given reader will run before or after a given writer. If this +matters in a given situation, some other mechanism must be used instead of +or in addition to memory barriers. + +4. Barrier proliferation. Many algorithms that at first seem appealing +require multiple barriers. If the number of barriers required is more than +one or two, you may be better off just using a lock. Keep in mind that, on +some platforms, a barrier may be implemented by acquiring and releasing a +backend-private spinlock. This may be better than a centralized lock under +contention, but it may also be slower in the uncontended case. + +Further Reading +=============== + +Much of the documentation about memory barriers appears to be quite +Linux-specific. The following papers may be helpful: + +Memory Ordering in Modern Microprocessors, by Paul E. McKenney +* http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf + +Memory Barriers: a Hardware View for Software Hackers, by Paul E. McKenney +* http://www.rdrop.com/users/paulmck/scalability/paper/whymb.2010.06.07c.pdf + +The Linux kernel also has some useful documentation on this topic. Start +with Documentation/memory-barriers.txt diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c index 1aa9912572..cd1306c182 100644 --- a/src/backend/storage/lmgr/s_lock.c +++ b/src/backend/storage/lmgr/s_lock.c @@ -20,6 +20,7 @@ #include "storage/s_lock.h" +slock_t dummy_spinlock; static int spins_per_delay = DEFAULT_SPINS_PER_DELAY; diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h new file mode 100644 index 0000000000..0286817a38 --- /dev/null +++ b/src/include/storage/barrier.h @@ -0,0 +1,171 @@ +/*------------------------------------------------------------------------- + * + * barrier.h + * Memory barrier operations. + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/barrier.h + * + *------------------------------------------------------------------------- + */ +#ifndef BARRIER_H +#define BARRIER_H + +#include "storage/s_lock.h" + +extern slock_t dummy_spinlock; + +/* + * A compiler barrier need not (and preferably should not) emit any actual + * machine code, but must act as an optimization fence: the compiler must not + * reorder loads or stores to main memory around the barrier. However, the + * CPU may still reorder loads or stores at runtime, if the architecture's + * memory model permits this. + * + * A memory barrier must act as a compiler barrier, and in addition must + * guarantee that all loads and stores issued prior to the barrier are + * completed before any loads or stores issued after the barrier. Unless + * loads and stores are totally ordered (which is not the case on most + * architectures) this requires issuing some sort of memory fencing + * instruction. + * + * A read barrier must act as a compiler barrier, and in addition must + * guarantee that any loads issued prior to the barrier are completed before + * any loads issued after the barrier. Similarly, a write barrier acts + * as a compiler barrier, and also orders stores. Read and write barriers + * are thus weaker than a full memory barrier, but stronger than a compiler + * barrier. In practice, on machines with strong memory ordering, read and + * write barriers may require nothing more than a compiler barrier. + * + * For an introduction to using memory barriers within the PostgreSQL backend, + * see src/backend/storage/lmgr/README.barrier + */ + +#if defined(DISABLE_BARRIERS) + +/* + * Fall through to the spinlock-based implementation. + */ + +#elif defined(__INTEL_COMPILER) + +/* + * icc defines __GNUC__, but doesn't support gcc's inline asm syntax + */ +#define pg_memory_barrier() _mm_mfence() +#define pg_compiler_barrier() __memory_barrier() + +#elif defined(__GNUC__) + +/* This works on any architecture, since it's only talking to GCC itself. */ +#define pg_compiler_barrier() __asm__ __volatile__("" : : : "memory") + +#if defined(__i386__) || defined(__x86_64__) /* 32 or 64 bit x86 */ + +/* + * x86 and x86_64 do not allow loads to be reorded with other loads, or + * stores to be reordered with other stores, but a load can be performed + * before a subsequent store. + * + * "lock; addl" has worked for longer than "mfence". + * + * Technically, some x86-ish chips support uncached memory access and/or + * special instructions that are weakly ordered. In those cases we'd need + * the read and write barriers to be lfence and sfence. But since we don't + * do those things, a compiler barrier should be enough. + */ +#define pg_memory_barrier() \ + __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory") +#define pg_read_barrier() pg_compiler_barrier() +#define pg_write_barrier() pg_compiler_barrier() + +#elif defined(__ia64__) || defined(__ia64) + +/* + * Itanium is weakly ordered, so read and write barriers require a full + * fence. + */ +#define pg_memory_barrier() __asm__ __volatile__ ("mf" : : : "memory") + +#elif defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__) + +/* + * lwsync orders loads with respect to each other, and similarly with stores. + * But a load can be performed before a subsequent store, so sync must be used + * for a full memory barrier. + */ +#define pg_memory_barrier() __asm__ __volatile__ ("sync" : : : "memory") +#define pg_read_barrier() __asm__ __volatile__ ("lwsync" : : : "memory") +#define pg_write_barrier() __asm__ __volatile__ ("lwsync" : : : "memory") + +#elif defined(__alpha) || defined(__alpha__) /* Alpha */ + +/* + * Unlike all other known architectures, Alpha allows dependent reads to be + * reordered, but we don't currently find it necessary to provide a conditional + * read barrier to cover that case. We might need to add that later. + */ +#define pg_memory_barrier() __asm__ __volatile__ ("mb" : : : "memory") +#define pg_read_barrier() __asm__ __volatile__ ("rmb" : : : "memory") +#define pg_write_barrier() __asm__ __volatile__ ("wmb" : : : "memory") + +#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1) + +/* + * If we're on GCC 4.1.0 or higher, we should be able to get a memory + * barrier out of this compiler built-in. But we prefer to rely on our + * own definitions where possible, and use this only as a fallback. + */ +#define pg_memory_barrier() __sync_synchronize() + +#endif + +#elif defined(__ia64__) || defined(__ia64) + +#define pg_compiler_barrier() _Asm_sched_fence() +#define pg_memory_barrier() _Asm_mf() + +#elif defined(WIN32_ONLY_COMPILER) + +/* Should work on both MSVC and Borland. */ +#include +#pragma intrinsic(_ReadWriteBarrier) +#define pg_compiler_barrier() _ReadWriteBarrier() +#define pg_memory_barrier() MemoryBarrier() + +#endif + +/* + * If we have no memory barrier implementation for this architecture, we + * fall back to acquiring and releasing a spinlock. This might, in turn, + * fall back to the semaphore-based spinlock implementation, which will be + * amazingly slow. + * + * It's not self-evident that every possible legal implementation of a + * spinlock acquire-and-release would be equivalent to a full memory barrier. + * For example, I'm not sure that Itanium's acq and rel add up to a full + * fence. But all of our actual implementations seem OK in this regard. + */ +#if !defined(pg_memory_barrier) +#define pg_memory_barrier(x) \ + do { S_LOCK(&dummy_spinlock); S_UNLOCK(&dummy_spinlock); } while (0) +#endif + +/* + * If read or write barriers are undefined, we upgrade them to full memory + * barriers. + * + * If a compiler barrier is unavailable, you probably don't want a full + * memory barrier instead, so if you have a use case for a compiler barrier, + * you'd better use #ifdef. + */ +#if !defined(pg_read_barrier) +#define pg_read_barrier() pg_memory_barrier() +#endif +#if !defined(pg_write_barrier) +#define pg_write_barrier() pg_memory_barrier() +#endif + +#endif /* BARRIER_H */ -- 2.40.0