BufferDescPadded *BufferDescriptors;
char *BufferBlocks;
+LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
+LWLockTranche BufferIOLWLockTranche;
+LWLockTranche BufferContentLWLockTranche;
/*
InitBufferPool(void)
{
bool foundBufs,
- foundDescs;
+ foundDescs,
+ foundIOLocks;
/* Align descriptors to a cacheline boundary. */
- BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN(
- ShmemInitStruct("Buffer Descriptors",
- NBuffers * sizeof(BufferDescPadded) + PG_CACHE_LINE_SIZE,
- &foundDescs));
+ BufferDescriptors = (BufferDescPadded *)
+ CACHELINEALIGN(
+ ShmemInitStruct("Buffer Descriptors",
+ NBuffers * sizeof(BufferDescPadded)
+ + PG_CACHE_LINE_SIZE,
+ &foundDescs));
BufferBlocks = (char *)
ShmemInitStruct("Buffer Blocks",
NBuffers * (Size) BLCKSZ, &foundBufs);
- if (foundDescs || foundBufs)
+ /* Align lwlocks to cacheline boundary */
+ BufferIOLWLockArray = (LWLockMinimallyPadded *)
+ CACHELINEALIGN(ShmemInitStruct("Buffer IO Locks",
+ NBuffers * (Size) sizeof(LWLockMinimallyPadded)
+ + PG_CACHE_LINE_SIZE,
+ &foundIOLocks));
+
+ BufferIOLWLockTranche.name = "Buffer IO Locks";
+ BufferIOLWLockTranche.array_base = BufferIOLWLockArray;
+ BufferIOLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
+ LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS,
+ &BufferIOLWLockTranche);
+
+ BufferContentLWLockTranche.name = "Buffer Content Locks";
+ BufferContentLWLockTranche.array_base =
+ ((char *) BufferDescriptors) + offsetof(BufferDesc, content_lock);
+ BufferContentLWLockTranche.array_stride = sizeof(BufferDescPadded);
+ LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
+ &BufferContentLWLockTranche);
+
+ if (foundDescs || foundBufs || foundIOLocks)
{
- /* both should be present or neither */
- Assert(foundDescs && foundBufs);
+ /* should find all of these, or none of them */
+ Assert(foundDescs && foundBufs && foundIOLocks);
/* note: this path is only taken in EXEC_BACKEND case */
}
else
*/
buf->freeNext = i + 1;
- buf->io_in_progress_lock = LWLockAssign();
- buf->content_lock = LWLockAssign();
+ LWLockInitialize(BufferDescriptorGetContentLock(buf),
+ LWTRANCHE_BUFFER_CONTENT);
+
+ LWLockInitialize(BufferDescriptorGetIOLock(buf),
+ LWTRANCHE_BUFFER_IO_IN_PROGRESS);
}
/* Correct last entry of linked list */
/* size of stuff controlled by freelist.c */
size = add_size(size, StrategyShmemSize());
+ /*
+ * It would be nice to include the I/O locks in the BufferDesc, but that
+ * would increase the size of a BufferDesc to more than one cache line, and
+ * benchmarking has shown that keeping every BufferDesc aligned on a cache
+ * line boundary is important for performance. So, instead, the array of
+ * I/O locks is allocated in a separate tranche. Because those locks are
+ * not highly contended, we lay out the array with minimal padding.
+ */
+ size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
+ /* to allow aligning the above */
+ size = add_size(size, PG_CACHE_LINE_SIZE);
+
return size;
}
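As a rough worked example of what the minimal padding buys (assuming the default BLCKSZ of 8 kB, LWLOCK_MINIMAL_SIZE = 32 on common platforms, and PG_CACHE_LINE_SIZE = 128 from pg_config_manual.h): with shared_buffers = 128 MB, NBuffers is 16384, so the I/O lock array requested above costs 16384 × 32 bytes = 512 kB of shared memory, versus 2 MB if each lock were padded out to a full cache line.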
if (!isLocalBuf)
{
if (mode == RBM_ZERO_AND_LOCK)
- LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+ LW_EXCLUSIVE);
else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
}
if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
!isLocalBuf)
{
- LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
}
if (isLocalBuf)
* happens to be trying to split the page the first one got from
* StrategyGetBuffer.)
*/
- if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
+ if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+ LW_SHARED))
{
/*
* If using a nondefault strategy, and writing the buffer
StrategyRejectBuffer(strategy, buf))
{
/* Drop lock/pin and loop around for another buffer */
- LWLockRelease(buf->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(buf));
UnpinBuffer(buf, true);
continue;
}
smgr->smgr_rnode.node.relNode);
FlushBuffer(buf, NULL);
- LWLockRelease(buf->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(buf));
TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
Assert(BufferIsPinned(buffer));
/* unfortunately we can't check if the lock is held exclusively */
- Assert(LWLockHeldByMe(bufHdr->content_lock));
+ Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
LockBufHdr(bufHdr);
if (ref->refcount == 0)
{
/* I'd better not still hold any locks on the buffer */
- Assert(!LWLockHeldByMe(buf->content_lock));
- Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
+ Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+ Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
LockBufHdr(buf);
* buffer is clean by the time we've locked it.)
*/
PinBuffer_Locked(bufHdr);
- LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, NULL);
- LWLockRelease(bufHdr->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr, true);
return result | BUF_WRITTEN;
(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
PinBuffer_Locked(bufHdr);
- LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, rel->rd_smgr);
- LWLockRelease(bufHdr->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr, true);
}
else
(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
PinBuffer_Locked(bufHdr);
- LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, NULL);
- LWLockRelease(bufHdr->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr, true);
}
else
bufHdr = GetBufferDescriptor(buffer - 1);
- Assert(LWLockHeldByMe(bufHdr->content_lock));
+ Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
FlushBuffer(bufHdr, NULL);
}
Assert(GetPrivateRefCount(buffer) > 0);
/* here, either share or exclusive lock is OK */
- Assert(LWLockHeldByMe(bufHdr->content_lock));
+ Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
/*
* This routine might get called many times on the same page, if we are
buf = GetBufferDescriptor(buffer - 1);
if (mode == BUFFER_LOCK_UNLOCK)
- LWLockRelease(buf->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(buf));
else if (mode == BUFFER_LOCK_SHARE)
- LWLockAcquire(buf->content_lock, LW_SHARED);
+ LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
else if (mode == BUFFER_LOCK_EXCLUSIVE)
- LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
+ LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
else
elog(ERROR, "unrecognized buffer lock mode: %d", mode);
}
buf = GetBufferDescriptor(buffer - 1);
- return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+ return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+ LW_EXCLUSIVE);
}
/*
UnlockBufHdr(buf);
if (!(sv_flags & BM_IO_IN_PROGRESS))
break;
- LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
- LWLockRelease(buf->io_in_progress_lock);
+ LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
+ LWLockRelease(BufferDescriptorGetIOLock(buf));
}
}
* Grab the io_in_progress lock so that other processes can wait for
* me to finish the I/O.
*/
- LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+ LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
LockBufHdr(buf);
* him to get unwedged.
*/
UnlockBufHdr(buf);
- LWLockRelease(buf->io_in_progress_lock);
+ LWLockRelease(BufferDescriptorGetIOLock(buf));
WaitIO(buf);
}
{
/* someone else already did the I/O */
UnlockBufHdr(buf);
- LWLockRelease(buf->io_in_progress_lock);
+ LWLockRelease(BufferDescriptorGetIOLock(buf));
return false;
}
InProgressBuf = NULL;
- LWLockRelease(buf->io_in_progress_lock);
+ LWLockRelease(BufferDescriptorGetIOLock(buf));
}
/*
* we can use TerminateBufferIO. Anyone who's executing WaitIO on the
* buffer will be in a busy spin until we succeed in doing this.
*/
- LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+ LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
LockBufHdr(buf);
Assert(buf->flags & BM_IO_IN_PROGRESS);
int numLocks;
/*
- * Possibly this logic should be spread out among the affected modules,
- * the same way that shmem space estimation is done. But for now, there
- * are few enough users of LWLocks that we can get away with just keeping
- * the knowledge here.
+ * Many users of LWLocks no longer reserve space in the main array here,
+ * but instead allocate separate tranches. The latter approach has the
+ * advantage of allowing LWLOCK_STATS and LOCK_DEBUG to produce more
+ * useful output.
*/
/* Predefined LWLocks */
numLocks = NUM_FIXED_LWLOCKS;
- /* bufmgr.c needs two for each shared buffer */
- numLocks += 2 * NBuffers;
-
/* proc.c needs one for each backend or auxiliary process */
numLocks += MaxBackends + NUM_AUXILIARY_PROCS;
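The new comment above points LWLock users toward separate tranches rather than slots in the main array. As a hedged sketch only (the module, struct, and function names below are hypothetical, and shared-memory space is assumed to have been requested elsewhere), this is roughly how a shared-memory module could set up its own tranche with the struct-based LWLockRegisterTranche() API that this patch also uses:

#include "postgres.h"

#include "storage/lwlock.h"
#include "storage/shmem.h"

/* One lockable entry; the payload here is purely hypothetical. */
typedef struct MyLockedItem
{
	LWLock		lock;			/* bare LWLock embedded in each entry */
	int			counter;		/* hypothetical data protected by lock */
} MyLockedItem;

typedef struct MySharedState
{
	int			tranche_id;		/* assigned once, shared by all backends */
	MyLockedItem items[FLEXIBLE_ARRAY_MEMBER];
} MySharedState;

static MySharedState *MyState;
static LWLockTranche MyItemLWLockTranche;

/* Called from each backend's shared-memory startup path. */
static void
my_module_shmem_init(int nitems)
{
	bool		found;

	MyState = (MySharedState *)
		ShmemInitStruct("My Module State",
						add_size(offsetof(MySharedState, items),
								 mul_size(nitems, sizeof(MyLockedItem))),
						&found);

	if (!found)
	{
		int			i;

		/* First process to get here picks a tranche id and sets up locks. */
		MyState->tranche_id = LWLockNewTrancheId();
		for (i = 0; i < nitems; i++)
			LWLockInitialize(&MyState->items[i].lock, MyState->tranche_id);
	}

	/* Every process must register the tranche in its local table. */
	MyItemLWLockTranche.name = "My Item Locks";
	MyItemLWLockTranche.array_base = MyState->items;
	MyItemLWLockTranche.array_stride = sizeof(MyLockedItem);
	LWLockRegisterTranche(MyState->tranche_id, &MyItemLWLockTranche);
}

This mirrors what InitBufferPool() does above: the locks live outside the main array, the tranche id identifies them in LWLOCK_STATS and LOCK_DEBUG output, and registration happens in every process because the tranche table is backend-local.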
StaticAssertExpr(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
"MAX_BACKENDS too big for lwlock.c");
+ StaticAssertExpr(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
+ sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
+ "Miscalculated LWLock padding");
+
if (!IsUnderPostmaster)
{
int numLocks = NumLWLocks();
* Note: buf_hdr_lock must be held to examine or change the tag, flags,
* usage_count, refcount, or wait_backend_pid fields. buf_id field never
* changes after initialization, so does not need locking. freeNext is
- * protected by the buffer_strategy_lock not buf_hdr_lock. The LWLocks can take
- * care of themselves. The buf_hdr_lock is *not* used to control access to
+ * protected by the buffer_strategy_lock, not buf_hdr_lock. The LWLock can
+ * take care of itself. The buf_hdr_lock is *not* used to control access to
* the data in the buffer!
*
* An exception is that if we have the buffer pinned, its tag can't change
*
* We use this same struct for local buffer headers, but the lock fields
* are not used and not all of the flag bits are useful either.
+ *
+ * Be careful to avoid increasing the size of the struct when adding or
+ * reordering members. Keeping it below 64 bytes (the most common CPU
+ * cache line size) is fairly important for performance.
*/
typedef struct BufferDesc
{
BufferTag tag; /* ID of page contained in buffer */
BufFlags flags; /* see bit definitions above */
- uint16 usage_count; /* usage counter for clock sweep code */
+ uint8 usage_count; /* usage counter for clock sweep code */
+ slock_t buf_hdr_lock; /* protects a subset of fields, see above */
unsigned refcount; /* # of backends holding pins on buffer */
int wait_backend_pid; /* backend PID of pin-count waiter */
- slock_t buf_hdr_lock; /* protects the above fields */
-
int buf_id; /* buffer's index number (from 0) */
int freeNext; /* link in freelist chain */
- LWLock *io_in_progress_lock; /* to wait for I/O to complete */
- LWLock *content_lock; /* to lock access to buffer contents */
+ LWLock content_lock; /* to lock access to buffer contents */
} BufferDesc;
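To make the locking rules in the comment above concrete, here is a minimal illustrative sketch (not part of the patch; the function and its usage are hypothetical): header fields are examined under buf_hdr_lock, the embedded content lock is reached through the new accessor macro, and a compile-time check guards the 64-byte budget.

static void
example_touch_buffer(BufferDesc *bufHdr)
{
	/* guard the "keep it within one 64-byte cache line" rule at compile time */
	StaticAssertStmt(sizeof(BufferDesc) <= 64,
					 "BufferDesc no longer fits in a 64-byte cache line");

	/* tag, flags, usage_count, refcount, wait_backend_pid need buf_hdr_lock */
	LockBufHdr(bufHdr);
	if (bufHdr->refcount == 0 && (bufHdr->flags & BM_DIRTY))
	{
		/* ... examine or update the spinlock-protected fields here ... */
	}
	UnlockBufHdr(bufHdr);

	/* the embedded content lock is taken via the accessor, not the spinlock */
	LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
	/* ... read page contents ... */
	LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
}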
/*
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
+#define BufferDescriptorGetIOLock(bdesc) \
+ (&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
+#define BufferDescriptorGetContentLock(bdesc) \
+ ((LWLock*) (&(bdesc)->content_lock))
+
+extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
+
/*
* The freeNext field is either the index of the next freelist entry,
* or one of these special values:
struct PGPROC;
/*
+ * Prior to PostgreSQL 9.4, every lightweight lock in the system was stored
+ * in a single array. For convenience and for compatibility with past
+ * releases, we still have a main array, but it's now also permissible to
+ * store LWLocks elsewhere in the main shared memory segment or in a dynamic
+ * shared memory segment. Each array of lwlocks forms a separate "tranche".
+ *
* It's occasionally necessary to identify a particular LWLock "by name"; e.g.
* because we wish to report the lock to dtrace. We could store a name or
* other identifying information in the lock itself, but since it's common
} LWLock;
/*
- * Prior to PostgreSQL 9.4, every lightweight lock in the system was stored
- * in a single array. For convenience and for compatibility with past
- * releases, we still have a main array, but it's now also permissible to
- * store LWLocks elsewhere in the main shared memory segment or in a dynamic
- * shared memory segment. In the main array, we force the array stride to
- * be a power of 2, which saves a few cycles in indexing, but more importantly
- * also ensures that individual LWLocks don't cross cache line boundaries.
- * This reduces cache contention problems, especially on AMD Opterons.
- * (Of course, we have to also ensure that the array start address is suitably
- * aligned.)
+ * In most cases, it's desirable to force each tranche of LWLocks to be aligned
+ * on a cache line boundary and make the array stride a power of 2. This saves
+ * a few cycles in indexing, but more importantly ensures that individual
+ * LWLocks don't cross cache line boundaries. This reduces cache contention
+ * problems, especially on AMD Opterons. In some cases, it's useful to add
+ * even more padding so that each LWLock takes up an entire cache line; this is
+ * useful, for example, in the main LWLock array, where the overall number of
+ * locks is small but some are heavily contended.
+ *
+ * When allocating a tranche that contains data other than LWLocks, it is
+ * probably best to include a bare LWLock and then pad the resulting structure
+ * as necessary for performance. For an array that contains only LWLocks,
+ * LWLockMinimallyPadded can be used for cases where we just want to ensure
+ * that we don't cross cache line boundaries within a single lock, while
+ * LWLockPadded can be used for cases where we want each lock to be an entire
+ * cache line.
*
- * On a 32-bit platforms a LWLock will these days fit into 16 bytes, but since
- * that didn't use to be the case and cramming more lwlocks into a cacheline
- * might be detrimental performancewise we still use 32 byte alignment
- * there. So, both on 32 and 64 bit platforms, it should fit into 32 bytes
- * unless slock_t is really big. We allow for that just in case.
+ * On 32-bit platforms, an LWLockMinimallyPadded might actually contain more
+ * than the absolute minimum amount of padding required to keep a lock from
+ * crossing a cache line boundary, because an unpadded LWLock might fit into
+ * 16 bytes. We ignore that possibility when determining the minimal amount
+ * of padding. Older releases had larger LWLocks, so 32 really was the
+ * minimum, and packing them in tighter might hurt performance.
+ *
+ * LWLOCK_MINIMAL_SIZE should be 32 on basically all common platforms, but
+ * because slock_t is more than 2 bytes on some obscure platforms, we allow
+ * for the possibility that it might be 64.
*/
-#define LWLOCK_PADDED_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
+#define LWLOCK_PADDED_SIZE PG_CACHE_LINE_SIZE
+#define LWLOCK_MINIMAL_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
+/* LWLock, padded to a full cache line size */
typedef union LWLockPadded
{
LWLock lock;
char pad[LWLOCK_PADDED_SIZE];
} LWLockPadded;
+
+/* LWLock, minimally padded */
+typedef union LWLockMinimallyPadded
+{
+ LWLock lock;
+ char pad[LWLOCK_MINIMAL_SIZE];
+} LWLockMinimallyPadded;
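As a concrete illustration of the "bare LWLock plus padding" advice above (the union name and payload are hypothetical, not part of this patch), a tranche entry that carries data of its own could be padded out to a full cache line like this:

typedef union PaddedCounterLock
{
	struct
	{
		LWLock		lock;		/* protects value */
		uint64		value;		/* hypothetical payload guarded by lock */
	}			data;
	char		pad[PG_CACHE_LINE_SIZE];	/* one entry per cache line */
} PaddedCounterLock;

An array of these would then be registered with array_stride = sizeof(PaddedCounterLock), just as the buffer content lock tranche in this patch uses sizeof(BufferDescPadded) as its stride.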
+
extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
extern char *MainLWLockNames[];
{
LWTRANCHE_MAIN,
LWTRANCHE_WAL_INSERT,
+ LWTRANCHE_BUFFER_CONTENT,
+ LWTRANCHE_BUFFER_IO_IN_PROGRESS,
LWTRANCHE_FIRST_USER_DEFINED
} BuiltinTrancheIds;
LVRelStats
LWLock
LWLockHandle
+LWLockMinimallyPadded
LWLockMode
LWLockPadded
LWLockTranche