-$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.6 2003/11/29 19:51:56 pgsql Exp $
+$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.7 2004/04/19 23:27:17 tgl Exp $
Notes about shared buffer access rules
--------------------------------------
single relation anyway.
-Buffer replacement strategy interface:
+Buffer replacement strategy interface
+-------------------------------------
-The two files freelist.c and buf_table.c contain the buffer cache
-replacement strategy. The interface to the strategy is:
+The file freelist.c contains the buffer cache replacement strategy.
+The interface to the strategy is:
- BufferDesc *
- StrategyBufferLookup(BufferTag *tagPtr, bool recheck)
+ BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck,
+ int *cdb_found_index)
- This is allways the first call made by the buffer manager
- to check if a disk page is in memory. If so, the function
- returns the buffer descriptor and no further action is
- required.
+This is always the first call made by the buffer manager to check if a disk
+page is in memory. If so, the function returns the buffer descriptor and no
+further action is required. If the page is not in memory,
+StrategyBufferLookup() returns NULL.
- If the page is not in memory, StrategyBufferLookup()
- returns NULL.
+The flag recheck tells the strategy that this is a second lookup after
+flushing a dirty block. If the buffer manager has to evict another buffer,
+it will release the bufmgr lock while doing the write IO. During this time,
+another backend could possibly fault in the same page this backend is after,
+so we have to check again after the IO is done if the page is in memory now.
- The flag recheck tells the strategy that this is a second
- lookup after flushing a dirty block. If the buffer manager
- has to evict another buffer, he will release the bufmgr lock
- while doing the write IO. During this time, another backend
- could possibly fault in the same page this backend is after,
- so we have to check again after the IO is done if the page
- is in memory now.
+*cdb_found_index is set to the index of the found CDB, or -1 if none.
+This is not intended to be used by the caller, except to pass to
+StrategyReplaceBuffer().
- BufferDesc *
- StrategyGetBuffer(void)
+ BufferDesc *StrategyGetBuffer(int *cdb_replace_index)
- The buffer manager calls this function to get an unpinned
- cache buffer who's content can be evicted. The returned
- buffer might be empty, clean or dirty.
+The buffer manager calls this function to get an unpinned cache buffer whose
+content can be evicted. The returned buffer might be empty, clean or dirty.
- The returned buffer is only a cadidate for replacement.
- It is possible that while the buffer is written, another
- backend finds and modifies it, so that it is dirty again.
- The buffer manager will then call StrategyGetBuffer()
- again to ask for another candidate.
+The returned buffer is only a candidate for replacement. It is possible that
+while the buffer is being written, another backend finds and modifies it, so
+that it is dirty again. The buffer manager will then have to call
+StrategyGetBuffer() again to ask for another candidate.
- void
- StrategyReplaceBuffer(BufferDesc *buf, Relation rnode,
- BlockNumber blockNum)
-
- Called by the buffer manager at the time it is about to
- change the association of a buffer with a disk page.
+*cdb_replace_index is set to the index of the candidate CDB, or -1 if none
+(meaning we are using a previously free buffer). This is not intended to be
+used by the caller, except to pass to StrategyReplaceBuffer().
- Before this call, StrategyBufferLookup() still has to find
- the buffer even if it was returned by StrategyGetBuffer()
- as a candidate for replacement.
+ void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
+ int cdb_found_index, int cdb_replace_index)
- After this call, this buffer must be returned for a
- lookup of the new page identified by rnode and blockNum.
+Called by the buffer manager at the time it is about to change the association
+of a buffer with a disk page.
- void
- StrategyInvalidateBuffer(BufferDesc *buf)
+Before this call, StrategyBufferLookup() still has to find the buffer under
+its old tag, even if it was returned by StrategyGetBuffer() as a candidate
+for replacement.
- Called from various parts to inform that the content of
- this buffer has been thrown away. This happens for example
- in the case of dropping a relation.
+After this call, this buffer must be returned for a lookup of the new page
+identified by *newTag.
- The buffer must be clean and unpinned on call.
+cdb_found_index and cdb_replace_index must be the auxiliary values
+returned by previous calls to StrategyBufferLookup and StrategyGetBuffer.
- If the buffer associated with a disk page, StrategyBufferLookup()
- must not return it for this page after the call.
+ void StrategyInvalidateBuffer(BufferDesc *buf)
- void
- StrategyHintVacuum(bool vacuum_active)
+Called by the buffer manager to inform the strategy that the content of this
+buffer is being thrown away. This happens for example in the case of dropping
+a relation. The buffer must be clean and unpinned on call.
- Because vacuum reads all relations of the entire database
- through the buffer manager, it can greatly disturb the
- buffer replacement strategy. This function is used by vacuum
- to inform that all subsequent buffer lookups are caused
- by vacuum scanning relations.
+If the buffer was associated with a disk page, StrategyBufferLookup()
+must not return it for this page after the call.
-
-Buffer replacement strategy:
+ void StrategyHintVacuum(bool vacuum_active)
-The buffer replacement strategy actually used in freelist.c is a
-version of the Adaptive Replacement Cache (ARC) special tailored for
-PostgreSQL.
+Because VACUUM reads all relations of the entire database through the buffer
+manager, it can greatly disturb the buffer replacement strategy. This function
+is used by VACUUM to inform the strategy that subsequent buffer lookups are
+(or are not) caused by VACUUM scanning relations.
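+
+As an illustration (a simplified sketch, not the exact BufferAlloc() code),
+the buffer manager uses this interface roughly as follows:
+
+	buf = StrategyBufferLookup(&tag, false, &cdb_found_index);
+	if (buf == NULL)
+	{
+		buf = StrategyGetBuffer(&cdb_replace_index);
+		/* ... write out the candidate buffer if dirty, then recheck ... */
+		buf2 = StrategyBufferLookup(&tag, true, &cdb_found_index);
+		if (buf2 == NULL)
+			StrategyReplaceBuffer(buf, &tag, cdb_found_index,
+								  cdb_replace_index);
+	}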
-The algorithm works as follows:
- C is the size of the cache in number of pages (conf: shared_buffers)
- ARC uses 2*C Cache Directory Blocks (CDB). A cache directory block
- is allwayt associated with one unique file page and "can" point to
- one shared buffer.
-
- All file pages known in by the directory are managed in 4 LRU lists
- named B1, T1, T2 and B2. The T1 and T2 lists are the "real" cache
- entries, linking a file page to a memory buffer where the page is
- currently cached. Consequently T1len+T2len <= C. B1 and B2 are
- ghost cache directories that extend T1 and T2 so that the strategy
- remembers pages longer. The strategy tries to keep B1len+T1len and
- B2len+T2len both at C. T1len and T2 len vary over the runtime
- depending on the lookup pattern and its resulting cache hits. The
- desired size of T1len is called T1target.
-
- Assuming we have a full cache, one of 5 cases happens on a lookup:
-
- MISS On a cache miss, depending on T1target and the actual T1len
- the LRU buffer of T1 or T2 is evicted. Its CDB is removed
- from the T list and added as MRU of the corresponding B list.
- The now free buffer is replaced with the requested page
- and added as MRU of T1.
-
- T1 hit The T1 CDB is moved to the MRU position of the T2 list.
-
- T2 hit The T2 CDB is moved to the MRU position of the T2 list.
-
- B1 hit This means that a buffer that was evicted from the T1
- list is now requested again, indicating that T1target is
- too small (otherwise it would still be in T1 and thus in
- memory). The strategy raises T1target, evicts a buffer
- depending on T1target and T1len and places the CDB at
- MRU of T2.
-
- B2 hit This means the opposite of B1, the T2 list is probably too
- small. So the strategy lowers T1target, evicts a buffer
- and places the CDB at MRU of T2.
-
- Thus, every page that is found on lookup in any of the four lists
- ends up as the MRU of the T2 list. The T2 list therefore is the
- "frequency" cache, holding frequently requested pages.
-
- Every page that is seen for the first time ends up as the MRU of
- the T1 list. The T1 list is the "recency" cache, holding recent
- newcomers.
-
- The tailoring done for PostgreSQL has to do with the way, the
- query executor works. A typical UPDATE or DELETE first scans the
- relation, searching for the tuples and then calls heap_update() or
- heap_delete(). This causes at least 2 lookups for the block in the
- same statement. In the case of multiple matches in one block even
- more often. As a result, every block touched in an UPDATE or DELETE
- would directly jump into the T2 cache, which is wrong. To prevent
- this the strategy remembers which transaction added a buffer to the
- T1 list and will not promote it from there into the T2 cache during
- the same transaction.
-
- Another specialty is the change of the strategy during VACUUM.
- Lookups during VACUUM do not represent application needs, so it
- would be wrong to change the cache balance T1target due to that
- or to cause massive cache evictions. Therefore, a page read in to
- satisfy vacuum (not those that actually cause a hit on any list)
- is placed at the LRU position of the T1 list, for immediate
- reuse. Since Vacuum usually requests many pages very fast, the
- natural side effect of this is that it will get back the very
- buffers it filled and possibly modified on the next call and will
- therefore do it's work in a few shared memory buffers, while using
- whatever it finds in the cache already.
+Buffer replacement strategy
+---------------------------
+
+The buffer replacement strategy actually used in freelist.c is a version of
+the Adaptive Replacement Cache (ARC) specially tailored for PostgreSQL.
+
+The algorithm works as follows:
+C is the size of the cache in number of pages (a/k/a shared_buffers or
+NBuffers). ARC uses 2*C Cache Directory Blocks (CDB). A cache directory block
+is always associated with one unique file page. It may point to one shared
+buffer, or may indicate that the file page is not in a buffer but has been
+accessed recently.
+
+All CDB entries are managed in 4 LRU lists named T1, T2, B1 and B2. The T1 and
+T2 lists are the "real" cache entries, linking a file page to a memory buffer
+where the page is currently cached. Consequently T1len+T2len <= C. B1 and B2
+are ghost cache directories that extend T1 and T2 so that the strategy
+remembers pages longer. The strategy tries to keep B1len+T1len and B2len+T2len
+both at C. T1len and T2len vary over the runtime depending on the lookup
+pattern and its resulting cache hits. The desired size of T1len is called
+T1target.
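+
+For example, with shared_buffers = 1000 there are 2000 CDBs; at most 1000 of
+them (those on T1 and T2) point at actual buffers at any instant, while the
+others (on B1 and B2, or unused) merely remember the identities of recently
+evicted pages.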
+
+Assuming we have a full cache, one of 5 cases happens on a lookup:
+
+MISS On a cache miss, depending on T1target and the actual T1len
+ the LRU buffer of either T1 or T2 is evicted. Its CDB is removed
+ from the T list and added as MRU of the corresponding B list.
+ The now free buffer is replaced with the requested page
+ and added as MRU of T1.
+
+T1 hit The T1 CDB is moved to the MRU position of the T2 list.
+
+T2 hit The T2 CDB is moved to the MRU position of the T2 list.
+
+B1 hit This means that a buffer that was evicted from the T1
+ list is now requested again, indicating that T1target is
+ too small (otherwise it would still be in T1 and thus in
+ memory). The strategy raises T1target, evicts a buffer
+ depending on T1target and T1len and places the CDB at
+ MRU of T2.
+
+B2 hit This means the opposite of B1, the T2 list is probably too
+ small. So the strategy lowers T1target, evicts a buffer
+ and places the CDB at MRU of T2.
+
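+The target adjustments for the B1 and B2 cases follow the arithmetic in
+freelist.c (where C is NBuffers):
+
+	/* B1 hit: T1 is too small */
+	T1target = Min(T1target + Max(B2len / B1len, 1), C);
+	/* B2 hit: T2 is too small */
+	T1target = Max(T1target - Max(B1len / B2len, 1), 0);
+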
+Thus, every page that is found on lookup in any of the four lists
+ends up as the MRU of the T2 list. The T2 list therefore is the
+"frequency" cache, holding frequently requested pages.
+
+Every page that is seen for the first time ends up as the MRU of the T1
+list. The T1 list is the "recency" cache, holding recent newcomers.
+
+The tailoring done for PostgreSQL has to do with the way the query executor
+works. A typical UPDATE or DELETE first scans the relation, searching for the
+tuples and then calls heap_update() or heap_delete(). This causes at least 2
+lookups for the block in the same statement; with multiple matches in one
+block there are even more. As a result, every block touched in an UPDATE or
+DELETE would directly jump into the T2 cache, which is wrong. To prevent this
+the strategy remembers which transaction added a buffer to the T1 list and
+will not promote it from there into the T2 cache during the same transaction.
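+
+In code terms, a T1 hit is promoted only under a test like the following
+(a simplified sketch of the logic in StrategyBufferLookup):
+
+	if (!cdb->t1_vacuum &&
+		!TransactionIdIsCurrentTransactionId(cdb->t1_xid))
+	{
+		/* some other transaction loaded it: promote to T2 MRU */
+		STRAT_LIST_REMOVE(cdb);
+		STRAT_MRU_INSERT(cdb, STRAT_LIST_T2);
+	}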
+
+Another specialty is the change of the strategy during VACUUM. Lookups during
+VACUUM do not represent application needs, and do not suggest that the page
+will be hit again soon, so it would be wrong to change the cache balance
+T1target due to that or to cause massive cache evictions. Therefore, a page
+read in to satisfy vacuum is placed at the LRU position of the T1 list, for
+immediate reuse. Also, if we happen to get a hit on a CDB entry during
+VACUUM, we do not promote the page above its current position in the list.
+Since VACUUM usually requests many pages very fast, the effect of this is that
+it will get back the very buffers it filled and possibly modified on the next
+call and will therefore do its work in a few shared memory buffers, while
+being able to use whatever it finds in the cache already. This also implies
+that most of the write traffic caused by a VACUUM will be done by the VACUUM
+itself and not pushed off onto other processes.
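+
+VACUUM enables this mode through StrategyHintVacuum; a sketch of the
+intended usage (the actual call sites are in the vacuum code, outside this
+patch):
+
+	StrategyHintVacuum(true);
+	/* ... scan relations through ReadBuffer() ... */
+	StrategyHintVacuum(false);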
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.62 2004/02/12 15:06:56 wieck Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.63 2004/04/19 23:27:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include <sys/file.h>
-#include <math.h>
-#include <signal.h>
-
-#include "catalog/catalog.h"
-#include "executor/execdebug.h"
-#include "miscadmin.h"
-#include "storage/buf.h"
-#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
-#include "storage/fd.h"
-#include "storage/ipc.h"
-#include "storage/lmgr.h"
-#include "storage/shmem.h"
-#include "storage/smgr.h"
-#include "storage/lwlock.h"
-#include "utils/builtins.h"
-#include "utils/hsearch.h"
-#include "utils/memutils.h"
-
-int ShowPinTrace = 0;
+#include "storage/buf_internals.h"
-int Data_Descriptors;
BufferDesc *BufferDescriptors;
Block *BufferBlockPointers;
long *PrivateRefCount; /* also used in freelist.c */
bits8 *BufferLocks; /* flag bits showing locks I have set */
+/* statistics counters */
+long int ReadBufferCount;
+long int ReadLocalBufferCount;
+long int BufferHitCount;
+long int LocalBufferHitCount;
+long int BufferFlushCount;
+long int LocalBufferFlushCount;
+
/*
* Data Structures:
* see freelist.c. A buffer cannot be replaced while in
* use either by data manager or during IO.
*
- * WriteBufferBack:
- * currently, a buffer is only written back at the time
- * it is selected for replacement. It should
- * be done sooner if possible to reduce latency of
- * BufferAlloc(). Maybe there should be a daemon process.
*
* Synchronization/Locking:
*
* BufMgrLock lock -- must be acquired before manipulating the
- * buffer queues (lookup/freelist). Must be released
+ * buffer search datastructures (lookup/freelist, as well as the
+ * flag bits of any buffer). Must be released
* before exit and before doing any IO.
*
* IO_IN_PROGRESS -- this is a flag in the buffer descriptor.
* It must be set when an IO is initiated and cleared at
- * the end of the IO. It is there to make sure that one
+ * the end of the IO. It is there to make sure that one
* process doesn't start to use a buffer while another is
* faulting it in. see IOWait/IOSignal.
*
- * refcount -- A buffer is pinned during IO and immediately
- * after a BufferAlloc(). A buffer is always either pinned
- * or on the freelist but never both. The buffer must be
- * released, written, or flushed before the end of
- * transaction.
+ * refcount -- Counts the number of processes holding pins on a buffer.
+ * A buffer is pinned during IO and immediately after a BufferAlloc().
+ * Pins must be released before end of transaction.
*
- * PrivateRefCount -- Each buffer also has a private refcount the keeps
+ * PrivateRefCount -- Each buffer also has a private refcount that keeps
* track of the number of times the buffer is pinned in the current
- * processes. This is used for two purposes, first, if we pin a
+ * process. This is used for two purposes: first, if we pin a
* a buffer more than once, we only need to change the shared refcount
- * once, thus only lock the buffer pool once, second, when a transaction
+ * once, thus only lock the shared state once; second, when a transaction
* aborts, it should only unpin the buffers exactly the number of times it
* has pinned them, so that it will not blow away buffers of another
* backend.
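+ *
+ * For example, if one backend pins the same buffer three times, the shared
+ * refcount rises only on the first pin, while its PrivateRefCount entry
+ * goes 1, 2, 3; abort processing then unpins exactly three times.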
*
*/
-long int ReadBufferCount;
-long int ReadLocalBufferCount;
-long int BufferHitCount;
-long int LocalBufferHitCount;
-long int BufferFlushCount;
-long int LocalBufferFlushCount;
-
/*
* Initialize shared buffer pool
foundDescs;
int i;
- Data_Descriptors = NBuffers;
-
/*
* It's probably not really necessary to grab the lock --- if there's
* anyone else attached to the shmem at this point, we've got
BufferDescriptors = (BufferDesc *)
ShmemInitStruct("Buffer Descriptors",
- Data_Descriptors * sizeof(BufferDesc), &foundDescs);
+ NBuffers * sizeof(BufferDesc), &foundDescs);
BufferBlocks = (char *)
ShmemInitStruct("Buffer Blocks",
/*
* link the buffers into a single linked list. This will become the
- * LiFo list of unused buffers returned by StragegyGetBuffer().
+ * LIFO list of unused buffers returned by StrategyGetBuffer().
*/
- for (i = 0; i < Data_Descriptors; block += BLCKSZ, buf++, i++)
+ for (i = 0; i < NBuffers; block += BLCKSZ, buf++, i++)
{
Assert(ShmemIsValid((unsigned long) block));
}
/* Correct last entry */
- BufferDescriptors[Data_Descriptors - 1].bufNext = -1;
+ BufferDescriptors[NBuffers - 1].bufNext = -1;
}
/* Init other shared buffer-management stuff */
BufferBlockPointers[i] = (Block) MAKE_PTR(BufferDescriptors[i].data);
}
-/* -----------------------------------------------------
+/*
* BufferShmemSize
*
* compute the size of shared memory for the buffer pool including
* data pages, buffer descriptors, hash tables, etc.
- * ----------------------------------------------------
*/
int
BufferShmemSize(void)
{
int size = 0;
- /* size of shmem index hash table */
- size += hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt));
-
/* size of buffer descriptors */
size += MAXALIGN(NBuffers * sizeof(BufferDesc));
- /* size of the shared replacement strategy control block */
- size += MAXALIGN(sizeof(BufferStrategyControl));
-
- /* size of the ARC directory blocks */
- size += MAXALIGN(NBuffers * 2 * sizeof(BufferStrategyCDB));
-
/* size of data pages */
size += NBuffers * MAXALIGN(BLCKSZ);
/* size of buffer hash table */
size += hash_estimate_size(NBuffers * 2, sizeof(BufferLookupEnt));
+ /* size of the shared replacement strategy control block */
+ size += MAXALIGN(sizeof(BufferStrategyControl));
+
+ /* size of the ARC directory blocks */
+ size += MAXALIGN(NBuffers * 2 * sizeof(BufferStrategyCDB));
+
return size;
}
* buf_table.c
* routines for finding buffers in the buffer pool.
*
+ * NOTE: these days, what this table actually provides is a mapping from
+ * BufferTags to CDB indexes, not directly to buffers. The function names
+ * are thus slight misnomers.
+ *
+ * Note: all routines in this file assume that the BufMgrLock is held
+ * by the caller, so no synchronization is needed.
+ *
+ *
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.34 2003/12/14 00:34:47 neilc Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.35 2004/04/19 23:27:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
-/*
- * OLD COMMENTS
- *
- * Data Structures:
- *
- * Buffers are identified by their BufferTag (buf.h). This
- * file contains routines for allocating a shmem hash table to
- * map buffer tags to buffer descriptors.
- *
- * Synchronization:
- *
- * All routines in this file assume BufMgrLock is held by their caller.
- */
-
#include "postgres.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
+
static HTAB *SharedBufHash;
/*
* Initialize shmem hash table for mapping buffers
+ * size is the desired hash table size (2*NBuffers for the ARC algorithm)
*/
void
InitBufTable(int size)
{
HASHCTL info;
- /* assume lock is held */
+ /* assume no locking is needed yet */
/* BufferTag maps to Buffer */
info.keysize = sizeof(BufferTag);
/*
* BufTableLookup
+ * Lookup the given BufferTag; return CDB index, or -1 if not found
*/
int
BufTableLookup(BufferTag *tagPtr)
}
/*
- * BufTableDelete
+ * BufTableInsert
+ * Insert a hashtable entry for given tag and CDB index
*/
-bool
-BufTableInsert(BufferTag *tagPtr, Buffer buf_id)
+void
+BufTableInsert(BufferTag *tagPtr, int cdb_id)
{
BufferLookupEnt *result;
bool found;
if (found) /* found something else in the table? */
elog(ERROR, "shared buffer hash table corrupted");
- result->id = buf_id;
- return TRUE;
+ result->id = cdb_id;
}
/*
* BufTableDelete
+ * Delete the hashtable entry for given tag
*/
-bool
+void
BufTableDelete(BufferTag *tagPtr)
{
BufferLookupEnt *result;
if (!result) /* shouldn't happen */
elog(ERROR, "shared buffer hash table corrupted");
-
- return TRUE;
}
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.160 2004/02/12 20:07:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.161 2004/04/19 23:27:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "storage/proc.h"
#include "storage/smgr.h"
#include "utils/relcache.h"
-
#include "pgstat.h"
+
#define BufferGetLSN(bufHdr) \
(*((XLogRecPtr*) MAKE_PTR((bufHdr)->data)))
/* GUC variable */
bool zero_damaged_pages = false;
+#ifdef NOT_USED
+int ShowPinTrace = 0;
+#endif
+
int BgWriterDelay = 200;
int BgWriterPercent = 1;
int BgWriterMaxpages = 100;
-static void WaitIO(BufferDesc *buf);
-static void StartBufferIO(BufferDesc *buf, bool forInput);
-static void TerminateBufferIO(BufferDesc *buf);
-static void ContinueBufferIO(BufferDesc *buf, bool forInput);
-static void buffer_write_error_callback(void *arg);
+long NDirectFileRead; /* some I/O's are direct file access.
+ * bypass bufmgr */
+long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */
/*
* Macro : BUFFER_IS_BROKEN
*/
#define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY))
+
+static void PinBuffer(BufferDesc *buf);
+static void UnpinBuffer(BufferDesc *buf);
+static void WaitIO(BufferDesc *buf);
+static void StartBufferIO(BufferDesc *buf, bool forInput);
+static void TerminateBufferIO(BufferDesc *buf);
+static void ContinueBufferIO(BufferDesc *buf, bool forInput);
+static void buffer_write_error_callback(void *arg);
static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum,
bool bufferLockHeld);
static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
bool *foundPtr);
static void BufferReplace(BufferDesc *bufHdr);
-
-#ifdef NOT_USED
-void PrintBufferDescs(void);
-#endif
-
static void write_buffer(Buffer buffer, bool unpin);
+
/*
* ReadBuffer -- returns a buffer containing the requested
* block of the requested relation. If the blknum
BufferDesc *buf,
*buf2;
BufferTag newTag; /* identity of requested block */
+ int cdb_found_index,
+ cdb_replace_index;
bool inProgress; /* buffer undergoing IO */
- /* create a new tag so we can lookup the buffer */
- /* assume that the relation is already open */
+ /* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(&newTag, reln, blockNum);
/* see if the block is in the buffer pool already */
- buf = StrategyBufferLookup(&newTag, false);
+ buf = StrategyBufferLookup(&newTag, false, &cdb_found_index);
if (buf != NULL)
{
/*
}
LWLockRelease(BufMgrLock);
+
+ /*
+ * Do the cost accounting for vacuum
+ */
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageHit;
+
return buf;
}
inProgress = FALSE;
for (buf = NULL; buf == NULL;)
{
- buf = StrategyGetBuffer();
+ buf = StrategyGetBuffer(&cdb_replace_index);
- /* GetFreeBuffer will abort if it can't find a free buffer */
+ /* StrategyGetBuffer will elog if it can't find a free buffer */
Assert(buf);
/*
* There should be exactly one pin on the buffer after it is
* allocated -- ours. If it had a pin it wouldn't have been on
* the free list. No one else could have pinned it between
- * GetFreeBuffer and here because we have the BufMgrLock.
+ * StrategyGetBuffer and here because we have the BufMgrLock.
*/
Assert(buf->refcount == 0);
buf->refcount = 1;
* we haven't gotten around to insert the new tag into the
* buffer table. So we need to check here. -ay 3/95
*/
- buf2 = StrategyBufferLookup(&newTag, true);
+ buf2 = StrategyBufferLookup(&newTag, true, &cdb_found_index);
if (buf2 != NULL)
{
/*
}
LWLockRelease(BufMgrLock);
+
+ /*
+ * Do the cost accounting for vacuum. (XXX perhaps better
+ * to consider this a miss? We didn't have to do the read,
+ * but we did have to write ...)
+ */
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageHit;
+
return buf2;
}
}
* Tell the buffer replacement strategy that we are replacing the
* buffer content. Then rename the buffer.
*/
- StrategyReplaceBuffer(buf, reln, blockNum);
- INIT_BUFFERTAG(&(buf->tag), reln, blockNum);
+ StrategyReplaceBuffer(buf, &newTag, cdb_found_index, cdb_replace_index);
+ buf->tag = newTag;
/*
* Buffer contents are currently invalid. Have to mark IO IN PROGRESS
LWLockRelease(BufMgrLock);
+ /*
+ * Do the cost accounting for vacuum
+ */
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageMiss;
+
return buf;
}
}
/*
- * BufferSync -- Write all dirty buffers in the pool.
+ * PinBuffer -- make buffer unavailable for replacement.
*
- * This is called at checkpoint time and writes out all dirty shared buffers,
+ * This should be applied only to shared buffers, never local ones.
+ * Bufmgr lock must be held by caller.
+ */
+static void
+PinBuffer(BufferDesc *buf)
+{
+ int b = BufferDescriptorGetBuffer(buf) - 1;
+
+ if (PrivateRefCount[b] == 0)
+ buf->refcount++;
+ PrivateRefCount[b]++;
+ Assert(PrivateRefCount[b] > 0);
+}
+
+/*
+ * UnpinBuffer -- make buffer available for replacement.
+ *
+ * This should be applied only to shared buffers, never local ones.
+ * Bufmgr lock must be held by caller.
+ */
+static void
+UnpinBuffer(BufferDesc *buf)
+{
+ int b = BufferDescriptorGetBuffer(buf) - 1;
+
+ Assert(buf->refcount > 0);
+ Assert(PrivateRefCount[b] > 0);
+ PrivateRefCount[b]--;
+ if (PrivateRefCount[b] == 0)
+ buf->refcount--;
+
+ if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
+ buf->refcount == 1)
+ {
+ /* we just released the last pin other than the waiter's */
+ buf->flags &= ~BM_PIN_COUNT_WAITER;
+ ProcSendSignal(buf->wait_backend_id);
+ }
+}
+
+/*
+ * BufferSync -- Write out dirty buffers in the pool.
+ *
+ * This is called at checkpoint time to write out all dirty shared buffers,
* and by the background writer process to write out some of the dirty blocks.
+ * percent/maxpages should be zero in the former case, and nonzero limit
+ * values in the latter.
*/
int
BufferSync(int percent, int maxpages)
{
+ BufferDesc **dirty_buffers;
+ BufferTag *buftags;
+ int num_buffer_dirty;
int i;
- BufferDesc *bufHdr;
ErrorContextCallback errcontext;
- int num_buffer_dirty;
- int *buffer_dirty;
-
- /* Setup error traceback support for ereport() */
- errcontext.callback = buffer_write_error_callback;
- errcontext.arg = NULL;
- errcontext.previous = error_context_stack;
- error_context_stack = &errcontext;
-
/*
* Get a list of all currently dirty buffers and how many there are.
* We do not flush buffers that get dirtied after we started. They
* have to wait until the next checkpoint.
*/
- buffer_dirty = (int *)palloc(NBuffers * sizeof(int));
+ dirty_buffers = (BufferDesc **) palloc(NBuffers * sizeof(BufferDesc *));
+ buftags = (BufferTag *) palloc(NBuffers * sizeof(BufferTag));
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- num_buffer_dirty = StrategyDirtyBufferList(buffer_dirty, NBuffers);
- LWLockRelease(BufMgrLock);
+ num_buffer_dirty = StrategyDirtyBufferList(dirty_buffers, buftags,
+ NBuffers);
/*
* If called by the background writer, we are usually asked to
- * only write out some percentage of dirty buffers now, to prevent
+ * only write out some portion of dirty buffers now, to prevent
* the IO storm at checkpoint time.
*/
- if (percent > 0 && num_buffer_dirty > 10)
+ if (percent > 0)
{
Assert(percent <= 100);
- num_buffer_dirty = (num_buffer_dirty * percent) / 100;
- if (maxpages > 0 && num_buffer_dirty > maxpages)
- num_buffer_dirty = maxpages;
+ num_buffer_dirty = (num_buffer_dirty * percent + 99) / 100;
}
+ if (maxpages > 0 && num_buffer_dirty > maxpages)
+ num_buffer_dirty = maxpages;
+
+ /* Setup error traceback support for ereport() */
+ errcontext.callback = buffer_write_error_callback;
+ errcontext.arg = NULL;
+ errcontext.previous = error_context_stack;
+ error_context_stack = &errcontext;
+ /*
+ * Loop over buffers to be written. Note the BufMgrLock is held at
+ * loop top, but is released and reacquired intraloop, so we aren't
+ * holding it long.
+ */
for (i = 0; i < num_buffer_dirty; i++)
{
+ BufferDesc *bufHdr = dirty_buffers[i];
Buffer buffer;
XLogRecPtr recptr;
SMgrRelation reln;
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
- bufHdr = &BufferDescriptors[buffer_dirty[i]];
errcontext.arg = bufHdr;
- if (!(bufHdr->flags & BM_VALID))
- {
- LWLockRelease(BufMgrLock);
- continue;
- }
-
/*
+ * Check it is still the same page and still needs writing.
+ *
* We can check bufHdr->cntxDirty here *without* holding any lock
* on buffer context as long as we set this flag in access methods
* *before* logging changes with XLogInsert(): if someone will set
* checkpoint.redo points before log record for upcoming changes
* and so we are not required to write such dirty buffer.
*/
+ if (!(bufHdr->flags & BM_VALID))
+ continue;
+ if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i]))
+ continue;
if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty))
- {
- LWLockRelease(BufMgrLock);
continue;
- }
/*
* IO synchronization. Note that we do it with unpinned buffer to
if (bufHdr->flags & BM_IO_IN_PROGRESS)
{
WaitIO(bufHdr);
- if (!(bufHdr->flags & BM_VALID) ||
- (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)))
- {
- LWLockRelease(BufMgrLock);
+ /* Still need writing? */
+ if (!(bufHdr->flags & BM_VALID))
+ continue;
+ if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i]))
+ continue;
+ if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty))
continue;
- }
}
/*
PinBuffer(bufHdr);
StartBufferIO(bufHdr, false); /* output IO start */
- buffer = BufferDescriptorGetBuffer(bufHdr);
-
+ /* Release BufMgrLock while doing xlog work */
LWLockRelease(BufMgrLock);
+ buffer = BufferDescriptorGetBuffer(bufHdr);
+
/*
* Protect buffer content against concurrent update
*/
/*
* Now it's safe to write buffer to disk. Note that no one else
- * should not be able to write it while we were busy with locking
- * and log flushing because of we setted IO flag.
+ * should have been able to write it while we were busy with
+ * locking and log flushing because we set the IO flag.
+ *
+ * Before we issue the actual write command, clear the just-dirtied
+ * flag. This lets us recognize concurrent changes (note that only
+ * hint-bit changes are possible since we hold the buffer shlock).
*/
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
Assert(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty);
* Release the per-buffer readlock, reacquire BufMgrLock.
*/
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
- BufferFlushCount++;
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */
TerminateBufferIO(bufHdr); /* Sync IO finished */
+ BufferFlushCount++;
/*
* If this buffer was marked by someone as DIRTY while we were
if (!(bufHdr->flags & BM_JUST_DIRTIED))
bufHdr->flags &= ~BM_DIRTY;
UnpinBuffer(bufHdr);
- LWLockRelease(BufMgrLock);
}
- pfree(buffer_dirty);
+ LWLockRelease(BufMgrLock);
/* Pop the error context stack */
error_context_stack = errcontext.previous;
+ pfree(dirty_buffers);
+ pfree(buftags);
+
return num_buffer_dirty;
}
}
-long NDirectFileRead; /* some I/O's are direct file access.
- * bypass bufmgr */
-long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */
-
-
/*
* Return a palloc'd string containing buffer usage statistics.
*/
if (isCommit)
elog(WARNING,
- "buffer refcount leak: [%03d] (bufNext=%d, "
- "rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)",
- i, buf->bufNext,
+ "buffer refcount leak: [%03d] "
+ "(rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)",
+ i,
buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]);
return BufferDescriptors[buffer - 1].tag.blockNum;
}
+/*
+ * BufferGetFileNode
+ * Returns the relation ID (RelFileNode) associated with a buffer.
+ *
+ * This should make the same checks as BufferGetBlockNumber, but since the
+ * two are generally called together, we don't bother.
+ */
+RelFileNode
+BufferGetFileNode(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ if (BufferIsLocal(buffer))
+ bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
+ else
+ bufHdr = &BufferDescriptors[buffer - 1];
+
+ return (bufHdr->tag.rnode);
+}
+
/*
* BufferReplace
*
*
* This routine might get called many times on the same page, if we are making
* the first scan after commit of an xact that added/deleted many tuples.
- * So, be as quick as we can if the buffer is already dirty.
+ * So, be as quick as we can if the buffer is already dirty. We do this by
+ * not acquiring BufMgrLock if it looks like the status bits are already OK.
+ * (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after
+ * we look, because the buffer content update is already done and will be
+ * reflected in the I/O.)
*/
void
SetBufferCommitInfoNeedsSave(Buffer buffer)
}
}
-RelFileNode
-BufferGetFileNode(Buffer buffer)
-{
- BufferDesc *bufHdr;
-
- if (BufferIsLocal(buffer))
- bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
- else
- bufHdr = &BufferDescriptors[buffer - 1];
-
- return (bufHdr->tag.rnode);
-}
-
/*
* Error context callback for errors occurring during buffer writes.
*/
* freelist.c
* routines for manipulating the buffer pool's replacement strategy.
*
+ * Note: all routines in this file assume that the BufMgrLock is held
+ * by the caller, so no synchronization is needed.
+ *
+ *
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.41 2004/02/12 15:06:56 wieck Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.42 2004/04/19 23:27:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
-/*
- * OLD COMMENTS
- *
- * Data Structures:
- * SharedFreeList is a circular queue. Notice that this
- * is a shared memory queue so the next/prev "ptrs" are
- * buffer ids, not addresses.
- *
- * Sync: all routines in this file assume that the buffer
- * semaphore has been acquired by the caller.
- */
-
#include "postgres.h"
+#include "access/xact.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
-#include "storage/ipc.h"
-#include "storage/proc.h"
-#include "access/xact.h"
-#include "miscadmin.h"
-#ifndef MAX
-#define MAX(a,b) (((a) > (b)) ? (a) : (b))
-#endif
-#ifndef MIN
-#define MIN(a,b) (((a) < (b)) ? (a) : (b))
-#endif
+/* GUC variable: time in seconds between statistics reports */
+int DebugSharedBuffers = 0;
+
+/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;
static BufferStrategyCDB *StrategyCDB = NULL;
-static int strategy_cdb_found;
-static int strategy_cdb_replace;
-static int strategy_get_from;
-
-int DebugSharedBuffers = 0;
-
-static bool strategy_hint_vacuum;
+/* Backend-local state about whether currently vacuuming */
+static bool strategy_hint_vacuum = false;
static TransactionId strategy_vacuum_xid;
-#define T1_TARGET StrategyControl->target_T1_size
-#define B1_LENGTH StrategyControl->listSize[STRAT_LIST_B1]
-#define T1_LENGTH StrategyControl->listSize[STRAT_LIST_T1]
-#define T2_LENGTH StrategyControl->listSize[STRAT_LIST_T2]
-#define B2_LENGTH StrategyControl->listSize[STRAT_LIST_B2]
+#define T1_TARGET (StrategyControl->target_T1_size)
+#define B1_LENGTH (StrategyControl->listSize[STRAT_LIST_B1])
+#define T1_LENGTH (StrategyControl->listSize[STRAT_LIST_T1])
+#define T2_LENGTH (StrategyControl->listSize[STRAT_LIST_T2])
+#define B2_LENGTH (StrategyControl->listSize[STRAT_LIST_B2])
/*
* Macro to remove a CDB from whichever list it currently is on
*/
#define STRAT_LIST_REMOVE(cdb) \
-{ \
- AssertMacro((cdb)->list >= 0 && (cdb)->list < STRAT_NUM_LISTS); \
- if ((cdb)->prev < 0) \
- StrategyControl->listHead[(cdb)->list] = (cdb)->next; \
- else \
- StrategyCDB[(cdb)->prev].next = (cdb)->next; \
- if ((cdb)->next < 0) \
- StrategyControl->listTail[(cdb)->list] = (cdb)->prev; \
- else \
- StrategyCDB[(cdb)->next].prev = (cdb)->prev; \
- StrategyControl->listSize[(cdb)->list]--; \
- (cdb)->list = STRAT_LIST_UNUSED; \
-}
+do { \
+ Assert((cdb)->list >= 0 && (cdb)->list < STRAT_NUM_LISTS); \
+ if ((cdb)->prev < 0) \
+ StrategyControl->listHead[(cdb)->list] = (cdb)->next; \
+ else \
+ StrategyCDB[(cdb)->prev].next = (cdb)->next; \
+ if ((cdb)->next < 0) \
+ StrategyControl->listTail[(cdb)->list] = (cdb)->prev; \
+ else \
+ StrategyCDB[(cdb)->next].prev = (cdb)->prev; \
+ StrategyControl->listSize[(cdb)->list]--; \
+ (cdb)->list = STRAT_LIST_UNUSED; \
+} while(0)
/*
* Macro to add a CDB to the tail of a list (MRU position)
*/
#define STRAT_MRU_INSERT(cdb,l) \
-{ \
- AssertMacro((cdb)->list == STRAT_LIST_UNUSED); \
- if (StrategyControl->listTail[(l)] < 0) \
- { \
- (cdb)->prev = (cdb)->next = -1; \
- StrategyControl->listHead[(l)] = \
- StrategyControl->listTail[(l)] = \
- ((cdb) - StrategyCDB); \
- } \
- else \
- { \
- (cdb)->next = -1; \
- (cdb)->prev = StrategyControl->listTail[(l)]; \
- StrategyCDB[StrategyControl->listTail[(l)]].next = \
- ((cdb) - StrategyCDB); \
- StrategyControl->listTail[(l)] = \
- ((cdb) - StrategyCDB); \
- } \
- StrategyControl->listSize[(l)]++; \
- (cdb)->list = (l); \
-}
+do { \
+ Assert((cdb)->list == STRAT_LIST_UNUSED); \
+ if (StrategyControl->listTail[(l)] < 0) \
+ { \
+ (cdb)->prev = (cdb)->next = -1; \
+ StrategyControl->listHead[(l)] = \
+ StrategyControl->listTail[(l)] = \
+ ((cdb) - StrategyCDB); \
+ } \
+ else \
+ { \
+ (cdb)->next = -1; \
+ (cdb)->prev = StrategyControl->listTail[(l)]; \
+ StrategyCDB[StrategyControl->listTail[(l)]].next = \
+ ((cdb) - StrategyCDB); \
+ StrategyControl->listTail[(l)] = \
+ ((cdb) - StrategyCDB); \
+ } \
+ StrategyControl->listSize[(l)]++; \
+ (cdb)->list = (l); \
+} while(0)
/*
* Macro to add a CDB to the head of a list (LRU position)
*/
#define STRAT_LRU_INSERT(cdb,l) \
-{ \
- AssertMacro((cdb)->list == STRAT_LIST_UNUSED); \
- if (StrategyControl->listHead[(l)] < 0) \
- { \
- (cdb)->prev = (cdb)->next = -1; \
- StrategyControl->listHead[(l)] = \
- StrategyControl->listTail[(l)] = \
- ((cdb) - StrategyCDB); \
- } \
- else \
- { \
- (cdb)->prev = -1; \
- (cdb)->next = StrategyControl->listHead[(l)]; \
- StrategyCDB[StrategyControl->listHead[(l)]].prev = \
- ((cdb) - StrategyCDB); \
- StrategyControl->listHead[(l)] = \
- ((cdb) - StrategyCDB); \
- } \
- StrategyControl->listSize[(l)]++; \
- (cdb)->list = (l); \
-}
+do { \
+ Assert((cdb)->list == STRAT_LIST_UNUSED); \
+ if (StrategyControl->listHead[(l)] < 0) \
+ { \
+ (cdb)->prev = (cdb)->next = -1; \
+ StrategyControl->listHead[(l)] = \
+ StrategyControl->listTail[(l)] = \
+ ((cdb) - StrategyCDB); \
+ } \
+ else \
+ { \
+ (cdb)->prev = -1; \
+ (cdb)->next = StrategyControl->listHead[(l)]; \
+ StrategyCDB[StrategyControl->listHead[(l)]].prev = \
+ ((cdb) - StrategyCDB); \
+ StrategyControl->listHead[(l)] = \
+ ((cdb) - StrategyCDB); \
+ } \
+ StrategyControl->listSize[(l)]++; \
+ (cdb)->list = (l); \
+} while(0)
+
+
+/*
+ * Printout for use when DebugSharedBuffers is enabled
+ */
+static void
+StrategyStatsDump(void)
+{
+ time_t now = time(NULL);
+
+ if (StrategyControl->stat_report + DebugSharedBuffers < now)
+ {
+ long all_hit, b1_hit, t1_hit, t2_hit, b2_hit;
+ int id, t1_clean, t2_clean;
+ ErrorContextCallback *errcxtold;
+ id = StrategyControl->listHead[STRAT_LIST_T1];
+ t1_clean = 0;
+ while (id >= 0)
+ {
+ if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY)
+ break;
+ t1_clean++;
+ id = StrategyCDB[id].next;
+ }
+ id = StrategyControl->listHead[STRAT_LIST_T2];
+ t2_clean = 0;
+ while (id >= 0)
+ {
+ if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY)
+ break;
+ t2_clean++;
+ id = StrategyCDB[id].next;
+ }
+
+ if (StrategyControl->num_lookup == 0)
+ {
+ all_hit = b1_hit = t1_hit = t2_hit = b2_hit = 0;
+ }
+ else
+ {
+ b1_hit = (StrategyControl->num_hit[STRAT_LIST_B1] * 100 /
+ StrategyControl->num_lookup);
+ t1_hit = (StrategyControl->num_hit[STRAT_LIST_T1] * 100 /
+ StrategyControl->num_lookup);
+ t2_hit = (StrategyControl->num_hit[STRAT_LIST_T2] * 100 /
+ StrategyControl->num_lookup);
+ b2_hit = (StrategyControl->num_hit[STRAT_LIST_B2] * 100 /
+ StrategyControl->num_lookup);
+ all_hit = b1_hit + t1_hit + t2_hit + b2_hit;
+ }
+
+ errcxtold = error_context_stack;
+ error_context_stack = NULL;
+ elog(DEBUG1, "ARC T1target=%5d B1len=%5d T1len=%5d T2len=%5d B2len=%5d",
+ T1_TARGET, B1_LENGTH, T1_LENGTH, T2_LENGTH, B2_LENGTH);
+ elog(DEBUG1, "ARC total =%4ld%% B1hit=%4ld%% T1hit=%4ld%% T2hit=%4ld%% B2hit=%4ld%%",
+ all_hit, b1_hit, t1_hit, t2_hit, b2_hit);
+ elog(DEBUG1, "ARC clean buffers at LRU T1= %5d T2= %5d",
+ t1_clean, t2_clean);
+ error_context_stack = errcxtold;
+
+ StrategyControl->num_lookup = 0;
+ StrategyControl->num_hit[STRAT_LIST_B1] = 0;
+ StrategyControl->num_hit[STRAT_LIST_T1] = 0;
+ StrategyControl->num_hit[STRAT_LIST_T2] = 0;
+ StrategyControl->num_hit[STRAT_LIST_B2] = 0;
+ StrategyControl->stat_report = now;
+ }
+}
/*
* StrategyBufferLookup
*
* Lookup a page request in the cache directory. A buffer is only
- * returned for a T1 or T2 cache hit. B1 and B2 hits are only
- * remembered here to later affect the behaviour.
+ * returned for a T1 or T2 cache hit. B1 and B2 hits are just
+ * remembered here, to possibly affect the behaviour later.
+ *
+ * recheck indicates we are rechecking after I/O wait; do not change
+ * internal status in this case.
+ *
+ * *cdb_found_index is set to the index of the found CDB, or -1 if none.
+ * This is not intended to be used by the caller, except to pass to
+ * StrategyReplaceBuffer().
*/
BufferDesc *
-StrategyBufferLookup(BufferTag *tagPtr, bool recheck)
+StrategyBufferLookup(BufferTag *tagPtr, bool recheck,
+ int *cdb_found_index)
{
BufferStrategyCDB *cdb;
- time_t now;
+ /* Optional stats printout */
if (DebugSharedBuffers > 0)
- {
- time(&now);
- if (StrategyControl->stat_report + DebugSharedBuffers < now)
- {
- long all_hit, b1_hit, t1_hit, t2_hit, b2_hit;
- int id, t1_clean, t2_clean;
- ErrorContextCallback *errcxtold;
-
- id = StrategyControl->listHead[STRAT_LIST_T1];
- t1_clean = 0;
- while (id >= 0)
- {
- if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY)
- break;
- t1_clean++;
- id = StrategyCDB[id].next;
- }
- id = StrategyControl->listHead[STRAT_LIST_T2];
- t2_clean = 0;
- while (id >= 0)
- {
- if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY)
- break;
- t2_clean++;
- id = StrategyCDB[id].next;
- }
-
- if (StrategyControl->num_lookup == 0)
- {
- all_hit = b1_hit = t1_hit = t2_hit = b2_hit = 0;
- }
- else
- {
- b1_hit = (StrategyControl->num_hit[STRAT_LIST_B1] * 100 /
- StrategyControl->num_lookup);
- t1_hit = (StrategyControl->num_hit[STRAT_LIST_T1] * 100 /
- StrategyControl->num_lookup);
- t2_hit = (StrategyControl->num_hit[STRAT_LIST_T2] * 100 /
- StrategyControl->num_lookup);
- b2_hit = (StrategyControl->num_hit[STRAT_LIST_B2] * 100 /
- StrategyControl->num_lookup);
- all_hit = b1_hit + t1_hit + t2_hit + b2_hit;
- }
-
- errcxtold = error_context_stack;
- error_context_stack = NULL;
- elog(DEBUG1, "ARC T1target=%5d B1len=%5d T1len=%5d T2len=%5d B2len=%5d",
- T1_TARGET, B1_LENGTH, T1_LENGTH, T2_LENGTH, B2_LENGTH);
- elog(DEBUG1, "ARC total =%4ld%% B1hit=%4ld%% T1hit=%4ld%% T2hit=%4ld%% B2hit=%4ld%%",
- all_hit, b1_hit, t1_hit, t2_hit, b2_hit);
- elog(DEBUG1, "ARC clean buffers at LRU T1= %5d T2= %5d",
- t1_clean, t2_clean);
- error_context_stack = errcxtold;
-
- StrategyControl->num_lookup = 0;
- StrategyControl->num_hit[STRAT_LIST_B1] = 0;
- StrategyControl->num_hit[STRAT_LIST_T1] = 0;
- StrategyControl->num_hit[STRAT_LIST_T2] = 0;
- StrategyControl->num_hit[STRAT_LIST_B2] = 0;
- StrategyControl->stat_report = now;
- }
- }
+ StrategyStatsDump();
/*
* Count lookups
/*
* Lookup the block in the shared hash table
*/
- strategy_cdb_found = BufTableLookup(tagPtr);
+ *cdb_found_index = BufTableLookup(tagPtr);
/*
- * Handle CDB lookup miss
+ * Done if complete CDB lookup miss
*/
- if (strategy_cdb_found < 0)
- {
- if (!recheck)
- {
- /*
- * This is an initial lookup and we have a complete
- * cache miss (block found nowhere). This means we
- * remember according to the current T1 size and the
- * target T1 size from where we take a block if we
- * need one later.
- */
- if (T1_LENGTH >= MAX(1, T1_TARGET))
- strategy_get_from = STRAT_LIST_T1;
- else
- strategy_get_from = STRAT_LIST_T2;
- }
-
- /*
- * Do the cost accounting for vacuum
- */
- if (VacuumCostActive)
- VacuumCostBalance += VacuumCostPageMiss;
-
- /* report cache miss */
+ if (*cdb_found_index < 0)
return NULL;
- }
/*
* We found a CDB
*/
- cdb = &StrategyCDB[strategy_cdb_found];
+ cdb = &StrategyCDB[*cdb_found_index];
/*
* Count hits
*/
StrategyControl->num_hit[cdb->list]++;
- if (VacuumCostActive)
- VacuumCostBalance += VacuumCostPageHit;
/*
* If this is a T2 hit, we simply move the CDB to the
* T2 MRU position and return the found buffer.
+ *
+ * A CDB in T2 cannot have t1_vacuum set, so we needn't check. However,
+ * if the current process is VACUUM then it doesn't promote to MRU.
*/
if (cdb->list == STRAT_LIST_T2)
{
- STRAT_LIST_REMOVE(cdb);
- STRAT_MRU_INSERT(cdb, STRAT_LIST_T2);
+ if (!strategy_hint_vacuum)
+ {
+ STRAT_LIST_REMOVE(cdb);
+ STRAT_MRU_INSERT(cdb, STRAT_LIST_T2);
+ }
return &BufferDescriptors[cdb->buf_id];
}
/*
- * If this is a T1 hit, we move the buffer to the T2 MRU
- * only if another transaction had read it into T1. This is
- * required because any UPDATE or DELETE in PostgreSQL does
- * multiple ReadBuffer(), first during the scan, later during
- * the heap_update() or heap_delete().
+ * If this is a T1 hit, we move the buffer to the T2 MRU only if another
+ * transaction had read it into T1, *and* neither transaction is a VACUUM.
+ * This is required because any UPDATE or DELETE in PostgreSQL does
+ * multiple ReadBuffer(), first during the scan, later during the
+ * heap_update() or heap_delete(). Otherwise move to T1 MRU. VACUUM
+ * doesn't even get to make that happen.
*/
if (cdb->list == STRAT_LIST_T1)
{
- if (!TransactionIdIsCurrentTransactionId(cdb->t1_xid))
+ if (!strategy_hint_vacuum)
{
- STRAT_LIST_REMOVE(cdb);
- STRAT_MRU_INSERT(cdb, STRAT_LIST_T2);
+ if (!cdb->t1_vacuum &&
+ !TransactionIdIsCurrentTransactionId(cdb->t1_xid))
+ {
+ STRAT_LIST_REMOVE(cdb);
+ STRAT_MRU_INSERT(cdb, STRAT_LIST_T2);
+ }
+ else
+ {
+ STRAT_LIST_REMOVE(cdb);
+ STRAT_MRU_INSERT(cdb, STRAT_LIST_T1);
+ /*
+ * If a non-VACUUM process references a page recently loaded
+ * by VACUUM, clear the stigma; the state will now be the
+ * same as if this process loaded it originally.
+ */
+ if (cdb->t1_vacuum)
+ {
+ cdb->t1_xid = GetCurrentTransactionId();
+ cdb->t1_vacuum = false;
+ }
+ }
}
return &BufferDescriptors[cdb->buf_id];
/*
* In the case of a recheck we don't care about B1 or B2 hits here.
- * The bufmgr does this call only to make sure noone faulted in the
- * block while we where busy flushing another. Now for this really
- * to end up as a B1 or B2 cache hit, we must have been flushing for
- * quite some time as the block not only must have been read, but
- * also traveled through the queue and evicted from the T cache again
- * already.
+ * The bufmgr does this call only to make sure no-one faulted in the
+ * block while we were busy flushing another; we don't want to doubly
+ * adjust the T1target.
+ *
+ * Now for this really to end up as a B1 or B2 cache hit, we must have
+ * been flushing for quite some time as the block not only must have been
+ * read, but also traveled through the queue and evicted from the T cache
+ * again already.
+ *
+ * VACUUM re-reads shouldn't adjust the target either.
*/
- if (recheck)
- {
+ if (recheck || strategy_hint_vacuum)
return NULL;
- }
/*
* Adjust the target size of the T1 cache depending on if this is
* small. Adjust the T1 target size and continue
* below.
*/
- T1_TARGET = MIN(T1_TARGET + MAX(B2_LENGTH / B1_LENGTH, 1),
- Data_Descriptors);
+ T1_TARGET = Min(T1_TARGET + Max(B2_LENGTH / B1_LENGTH, 1),
+ NBuffers);
break;
case STRAT_LIST_B2:
* B2 hit means that the T2 cache is probably too
* small. Adjust the T1 target size and continue
* below.
- */
- T1_TARGET = MAX(T1_TARGET - MAX(B1_LENGTH / B2_LENGTH, 1), 0);
+ */
+ T1_TARGET = Max(T1_TARGET - Max(B1_LENGTH / B2_LENGTH, 1), 0);
break;
default:
- elog(ERROR, "Buffer hash table corrupted - CDB on list %d found",
- cdb->list);
+ elog(ERROR, "buffer hash table corrupted: CDB->list = %d",
+ cdb->list);
}
/*
- * Decide where to take from if we will be out of
- * free blocks later in StrategyGetBuffer().
- */
- if (T1_LENGTH >= MAX(1, T1_TARGET))
- strategy_get_from = STRAT_LIST_T1;
- else
- strategy_get_from = STRAT_LIST_T2;
-
- /*
- * Even if we had seen the block in the past, it's data is
+ * Even though we had seen the block in the past, its data is
* not currently in memory ... cache miss to the bufmgr.
*/
return NULL;
* Called by the bufmgr to get the next candidate buffer to use in
* BufferAlloc(). The only hard requirement BufferAlloc() has is that
* this buffer must not currently be pinned.
+ *
+ * *cdb_replace_index is set to the index of the candidate CDB, or -1 if
+ * none (meaning we are using a previously free buffer). This is not
+ * intended to be used by the caller, except to pass to
+ * StrategyReplaceBuffer().
*/
BufferDesc *
-StrategyGetBuffer(void)
+StrategyGetBuffer(int *cdb_replace_index)
{
int cdb_id;
BufferDesc *buf;
if (StrategyControl->listFreeBuffers < 0)
{
- /* We don't have a free buffer, must take one from T1 or T2 */
-
- if (strategy_get_from == STRAT_LIST_T1)
+ /*
+ * We don't have a free buffer, must take one from T1 or T2.
+ * Choose based on trying to converge T1len to T1target.
+ */
+ if (T1_LENGTH >= Max(1, T1_TARGET))
{
/*
* We should take the first unpinned buffer from T1.
buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id];
if (buf->refcount == 0)
{
- strategy_cdb_replace = cdb_id;
+ *cdb_replace_index = cdb_id;
Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T1);
return buf;
}
}
/*
- * No unpinned T1 buffer found - pardon T2 cache.
+ * No unpinned T1 buffer found - try T2 cache.
*/
cdb_id = StrategyControl->listHead[STRAT_LIST_T2];
while (cdb_id >= 0)
buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id];
if (buf->refcount == 0)
{
- strategy_cdb_replace = cdb_id;
+ *cdb_replace_index = cdb_id;
Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T2);
return buf;
}
/*
* No unpinned buffers at all!!!
*/
- elog(ERROR, "StrategyGetBuffer(): Out of unpinned buffers");
+ elog(ERROR, "no unpinned buffers available");
}
else
{
buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id];
if (buf->refcount == 0)
{
- strategy_cdb_replace = cdb_id;
+ *cdb_replace_index = cdb_id;
Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T2);
return buf;
}
}
/*
- * No unpinned T2 buffer found - pardon T1 cache.
+ * No unpinned T2 buffer found - try T1 cache.
*/
cdb_id = StrategyControl->listHead[STRAT_LIST_T1];
while (cdb_id >= 0)
buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id];
if (buf->refcount == 0)
{
- strategy_cdb_replace = cdb_id;
+ *cdb_replace_index = cdb_id;
Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T1);
return buf;
}
/*
* No unpinned buffers at all!!!
*/
- elog(ERROR, "StrategyGetBuffer(): Out of unpinned buffers");
+ elog(ERROR, "no unpinned buffers available");
}
}
else
* that there will never be any reason to recheck. Otherwise
* we would leak shared buffers here!
*/
- strategy_cdb_replace = -1;
+ *cdb_replace_index = -1;
buf = &BufferDescriptors[StrategyControl->listFreeBuffers];
StrategyControl->listFreeBuffers = buf->bufNext;
buf->bufNext = -1;
- /* Buffer of freelist cannot be pinned */
+ /* Buffer in freelist cannot be pinned */
Assert(buf->refcount == 0);
Assert(!(buf->flags & BM_DIRTY));
/*
* StrategyReplaceBuffer
*
- * Called by the buffer manager to inform us that he possibly flushed
- * a buffer and is now about to replace the content. Prior to this call,
+ * Called by the buffer manager to inform us that it flushed a buffer
+ * and is now about to replace the content. Prior to this call,
* the cache algorithm still reports the buffer as in the cache. After
* this call we report the new block, even if IO might still need to
- * start.
+ * be done to bring in the new content.
+ *
+ * cdb_found_index and cdb_replace_index must be the auxiliary values
+ * returned by previous calls to StrategyBufferLookup and StrategyGetBuffer.
*/
void
-StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
+StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
+ int cdb_found_index, int cdb_replace_index)
{
BufferStrategyCDB *cdb_found;
BufferStrategyCDB *cdb_replace;
- if (strategy_cdb_found >= 0)
+ if (cdb_found_index >= 0)
{
- /* This was a ghost buffer cache hit (B1 or B2) */
- cdb_found = &StrategyCDB[strategy_cdb_found];
+ /* This must have been a ghost buffer cache hit (B1 or B2) */
+ cdb_found = &StrategyCDB[cdb_found_index];
/* Assert that the buffer remembered in cdb_found is the one */
/* the buffer manager is currently faulting in */
- Assert(BUFFERTAG_EQUALS(&(cdb_found->buf_tag), rnode, blockNum));
+ Assert(BUFFERTAGS_EQUAL(&(cdb_found->buf_tag), newTag));
- if (strategy_cdb_replace >= 0)
+ if (cdb_replace_index >= 0)
{
/* We are satisfying it with an evicted T buffer */
- cdb_replace = &StrategyCDB[strategy_cdb_replace];
+ cdb_replace = &StrategyCDB[cdb_replace_index];
/* Assert that the buffer remembered in cdb_replace is */
/* the one the buffer manager has just evicted */
Assert(cdb_replace->list == STRAT_LIST_T1 ||
- cdb_replace->list == STRAT_LIST_T2);
+ cdb_replace->list == STRAT_LIST_T2);
Assert(cdb_replace->buf_id == buf->buf_id);
Assert(BUFFERTAGS_EQUAL(&(cdb_replace->buf_tag), &(buf->tag)));
- /* If this was a T1 buffer faulted in by vacuum, just */
- /* do not cause the CDB end up in the B1 list, so that */
- /* the vacuum scan does not affect T1_target adjusting */
- if (strategy_hint_vacuum)
+ /*
+ * Under normal circumstances we move the evicted T list entry to
+ * the corresponding B list. However, T1 entries that exist only
+ * because of VACUUM are just thrown into the unused list instead.
+ * We don't expect them to be touched again by the VACUUM, and if
+ * we put them into B1 then VACUUM would skew T1_target adjusting.
+ */
+ if (cdb_replace->t1_vacuum)
{
BufTableDelete(&(cdb_replace->buf_tag));
STRAT_LIST_REMOVE(cdb_replace);
- cdb_replace->buf_id = -1;
cdb_replace->next = StrategyControl->listUnusedCDB;
- StrategyControl->listUnusedCDB = strategy_cdb_replace;
+ StrategyControl->listUnusedCDB = cdb_replace_index;
}
else
{
- /* Under normal circumstances move the evicted */
- /* T list entry to it's corresponding B list */
if (cdb_replace->list == STRAT_LIST_T1)
{
STRAT_LIST_REMOVE(cdb_replace);
STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
}
}
- /* And clear it's block reference */
+ /* And clear its block reference */
cdb_replace->buf_id = -1;
}
else
{
- /* or we satisfy it with an unused buffer */
+ /* We are satisfying it with an unused buffer */
}
- /* Now the found B CDB get's the buffer and is moved to T2 */
+ /* Now the found B CDB gets the buffer and is moved to T2 */
cdb_found->buf_id = buf->buf_id;
STRAT_LIST_REMOVE(cdb_found);
STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T2);
}
else
{
- /* This was a complete cache miss, so we need to create */
- /* a new CDB. The goal is to keep T1len+B1len <= c */
-
- if (B1_LENGTH > 0 && (T1_LENGTH + B1_LENGTH) >= Data_Descriptors)
+ /*
+ * This was a complete cache miss, so we need to create
+ * a new CDB. The goal is to keep T1len+B1len <= c.
+ */
+ if (B1_LENGTH > 0 && (T1_LENGTH + B1_LENGTH) >= NBuffers)
{
/* So if B1 isn't empty and T1len+B1len >= c we take B1-LRU */
cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];
}
}
- /* Set the CDB's buf_tag and insert the hash key */
- INIT_BUFFERTAG(&(cdb_found->buf_tag), rnode, blockNum);
+ /* Set the CDB's buf_tag and insert it into the hash table */
+ cdb_found->buf_tag = *newTag;
BufTableInsert(&(cdb_found->buf_tag), (cdb_found - StrategyCDB));
- if (strategy_cdb_replace >= 0)
+ if (cdb_replace_index >= 0)
{
- /* The buffer was formerly in a T list, move it's CDB
- * to the corresponding B list */
- cdb_replace = &StrategyCDB[strategy_cdb_replace];
+ /*
+ * The buffer was formerly in a T list, move its CDB
+ * to the corresponding B list
+ */
+ cdb_replace = &StrategyCDB[cdb_replace_index];
Assert(cdb_replace->list == STRAT_LIST_T1 ||
- cdb_replace->list == STRAT_LIST_T2);
+ cdb_replace->list == STRAT_LIST_T2);
Assert(cdb_replace->buf_id == buf->buf_id);
Assert(BUFFERTAGS_EQUAL(&(cdb_replace->buf_tag), &(buf->tag)));
			if (cdb_replace->list == STRAT_LIST_T1)
			{
				STRAT_LIST_REMOVE(cdb_replace);
				STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B1);
			}
			else
			{
				STRAT_LIST_REMOVE(cdb_replace);
				STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
			}
- /* And clear it's block reference */
+ /* And clear its block reference */
cdb_replace->buf_id = -1;
}
else
{
- /* or we satisfy it with an unused buffer */
+ /* We are satisfying it with an unused buffer */
}
/* Assign the buffer id to the new CDB */
cdb_found->buf_id = buf->buf_id;
/*
- * Specialized VACUUM optimization. If this "complete cache miss"
- * happened because vacuum needed the page, we want it later on
- * to be placed at the LRU instead of the MRU position of T1.
+ * Specialized VACUUM optimization. If this complete cache miss
+ * happened because vacuum needed the page, we place it at the LRU
+ * position of T1; normally it goes at the MRU position.
*/
if (strategy_hint_vacuum)
{
- if (strategy_vacuum_xid != GetCurrentTransactionId())
+ if (TransactionIdIsCurrentTransactionId(strategy_vacuum_xid))
+ STRAT_LRU_INSERT(cdb_found, STRAT_LIST_T1);
+ else
{
+ /* VACUUM must have been aborted by error, reset flag */
strategy_hint_vacuum = false;
STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
}
- else
- STRAT_LRU_INSERT(cdb_found, STRAT_LIST_T1);
-
}
else
STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
/*
* Remember the Xid when this buffer went onto T1 to avoid
* a single UPDATE promoting a newcomer straight into T2.
+ * Also remember if it was loaded for VACUUM.
*/
cdb_found->t1_xid = GetCurrentTransactionId();
+ cdb_found->t1_vacuum = strategy_hint_vacuum;
}
}
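
Taken together, the strategy entry points imply a fixed calling cycle in the
buffer manager: look the page up, and on a miss obtain a victim, flush it if
dirty, and then report the replacement. A minimal sketch of that cycle
follows (this is not the actual bufmgr.c code; locking, IO, and error
handling are elided, and fault_in_page_sketch is an illustrative name):

	/* Sketch: the lookup/evict/replace cycle implied by the interface */
	static BufferDesc *
	fault_in_page_sketch(BufferTag *newTag)
	{
		int			cdb_found_index;
		int			cdb_replace_index;
		BufferDesc *buf;

		buf = StrategyBufferLookup(newTag, false, &cdb_found_index);
		if (buf != NULL)
			return buf;			/* cache hit, no replacement needed */

		buf = StrategyGetBuffer(&cdb_replace_index);
		/* ... if buf is dirty, write it out and then repeat the
		 * lookup with recheck = true before continuing ... */
		StrategyReplaceBuffer(buf, newTag, cdb_found_index,
							  cdb_replace_index);
		/* ... read the new page into buf ... */
		return buf;
	}
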
*/
cdb_id = BufTableLookup(&(buf->tag));
if (cdb_id < 0)
- elog(ERROR, "StrategyInvalidateBuffer() buffer %d not in directory",
- buf->buf_id);
+ elog(ERROR, "buffer %d not in buffer hash table", buf->buf_id);
cdb = &StrategyCDB[cdb_id];
/*
StrategyControl->listUnusedCDB = cdb_id;
/*
- * Clear out the buffers tag and add it to the list of
+ * Clear out the buffer's tag and add it to the list of
* currently unused buffers.
*/
CLEAR_BUFFERTAG(&(buf->tag));
StrategyControl->listFreeBuffers = buf->buf_id;
}
-
+/*
+ * StrategyHintVacuum -- tell us whether VACUUM is active
+ */
void
StrategyHintVacuum(bool vacuum_active)
{
	strategy_hint_vacuum = vacuum_active;
	strategy_vacuum_xid = GetCurrentTransactionId();
}
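
For illustration, the expected usage is that VACUUM brackets its buffer
traffic with this hint, so that the pages it faults in go to the LRU end of
T1 (a sketch only; the actual call sites in the VACUUM code are not part of
this hunk):

	/* Sketch: bracketing a VACUUM scan with the strategy hint */
	StrategyHintVacuum(true);	/* faulted-in pages go to T1's LRU end */
	/* ... scan and clean the relation ... */
	StrategyHintVacuum(false);	/* restore normal MRU insertion */
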
-
+/*
+ * StrategyDirtyBufferList
+ *
+ * Returns a list of dirty buffers, in priority order for writing.
+ * Note that the caller may choose not to write them all.
+ *
+ * The caller must beware of the possibility that a buffer is no longer dirty,
+ * or even contains a different page, by the time the caller reaches it. If
+ * the buffer no longer contains the same page it need not be written, even
+ * if it is (again) dirty.
+ *
+ * Buffer pointers are stored into buffers[], and corresponding tags into
+ * buftags[], both of size max_buffers. The function returns the number of
+ * buffer IDs stored.
+ */
int
-StrategyDirtyBufferList(int *buffer_list, int max_buffers)
+StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags,
+ int max_buffers)
{
int num_buffer_dirty = 0;
int cdb_id_t1;
* Traverse the T1 and T2 list LRU to MRU in "parallel"
* and add all dirty buffers found in that order to the list.
* The ARC strategy keeps all used buffers including pinned ones
- * in the T1 or T2 list. So we cannot loose any dirty buffers.
+ * in the T1 or T2 list. So we cannot miss any dirty buffers.
*/
cdb_id_t1 = StrategyControl->listHead[STRAT_LIST_T1];
cdb_id_t2 = StrategyControl->listHead[STRAT_LIST_T2];
while ((cdb_id_t1 >= 0 || cdb_id_t2 >= 0) &&
- num_buffer_dirty < max_buffers)
+ num_buffer_dirty < max_buffers)
{
if (cdb_id_t1 >= 0)
{
{
if ((buf->flags & BM_DIRTY) || (buf->cntxDirty))
{
- buffer_list[num_buffer_dirty++] = buf_id;
+ buffers[num_buffer_dirty] = buf;
+ buftags[num_buffer_dirty] = buf->tag;
+ num_buffer_dirty++;
}
}
{
if ((buf->flags & BM_DIRTY) || (buf->cntxDirty))
{
- buffer_list[num_buffer_dirty++] = buf_id;
+ buffers[num_buffer_dirty] = buf;
+ buftags[num_buffer_dirty] = buf->tag;
+ num_buffer_dirty++;
}
}
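
To make the recheck contract above concrete, a caller might consume the
returned arrays roughly as follows (a hedged sketch, not the actual
checkpoint/background-writer code; MAX_WRITES is an illustrative constant,
and pinning and IO are elided):

	#define MAX_WRITES 32			/* illustrative batch size */

	/* Sketch: write dirty buffers, skipping any that moved on */
	static void
	write_dirty_sketch(void)
	{
		BufferDesc *dirty_bufs[MAX_WRITES];
		BufferTag	dirty_tags[MAX_WRITES];
		int			n,
					i;

		n = StrategyDirtyBufferList(dirty_bufs, dirty_tags, MAX_WRITES);
		for (i = 0; i < n; i++)
		{
			BufferDesc *buf = dirty_bufs[i];

			if (!BUFFERTAGS_EQUAL(&(buf->tag), &(dirty_tags[i])))
				continue;		/* buffer holds a different page now */
			if (!(buf->flags & BM_DIRTY) && !(buf->cntxDirty))
				continue;		/* someone already cleaned it */
			/* ... pin, flush, and unpin the buffer here ... */
		}
	}
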
/*
* Initialize the shared CDB lookup hashtable
*/
- InitBufTable(Data_Descriptors * 2);
+ InitBufTable(NBuffers * 2);
/*
* Get or create the shared strategy control block and the CDB's
*/
StrategyControl = (BufferStrategyControl *)
- ShmemInitStruct("Buffer Strategy Status",
- sizeof(BufferStrategyControl) +
- sizeof(BufferStrategyCDB) * (Data_Descriptors * 2 - 1),
- &found);
+ ShmemInitStruct("Buffer Strategy Status",
+ sizeof(BufferStrategyControl) +
+ sizeof(BufferStrategyCDB) * (NBuffers * 2 - 1),
+ &found);
StrategyCDB = &(StrategyControl->cdb[0]);
if (!found)
Assert(init);
/*
- * Grab the whole linked list of free buffers for our
- * strategy
+ * Grab the whole linked list of free buffers for our strategy.
+ * We assume it was previously set up by InitBufferPool().
*/
StrategyControl->listFreeBuffers = 0;
* We start off with a target T1 list size of
* half the available cache blocks.
*/
- StrategyControl->target_T1_size = Data_Descriptors / 2;
+ StrategyControl->target_T1_size = NBuffers / 2;
/*
* Initialize B1, T1, T2 and B2 lists to be empty
/*
* All CDB's are linked as the listUnusedCDB
*/
- for (i = 0; i < Data_Descriptors * 2; i++)
+ for (i = 0; i < NBuffers * 2; i++)
{
StrategyCDB[i].next = i + 1;
StrategyCDB[i].list = STRAT_LIST_UNUSED;
CLEAR_BUFFERTAG(&(StrategyCDB[i].buf_tag));
StrategyCDB[i].buf_id = -1;
}
- StrategyCDB[Data_Descriptors * 2 - 1].next = -1;
+ StrategyCDB[NBuffers * 2 - 1].next = -1;
StrategyControl->listUnusedCDB = 0;
}
else
Assert(!init);
}
}
-
-
-#undef PinBuffer
-
-/*
- * PinBuffer -- make buffer unavailable for replacement.
- *
- * This should be applied only to shared buffers, never local ones.
- * Bufmgr lock must be held by caller.
- */
-void
-PinBuffer(BufferDesc *buf)
-{
- int b = BufferDescriptorGetBuffer(buf) - 1;
-
- if (PrivateRefCount[b] == 0)
- buf->refcount++;
- PrivateRefCount[b]++;
- Assert(PrivateRefCount[b] > 0);
-}
-
-#ifdef NOT_USED
-void
-PinBuffer_Debug(char *file, int line, BufferDesc *buf)
-{
- PinBuffer(buf);
- if (ShowPinTrace)
- {
- Buffer buffer = BufferDescriptorGetBuffer(buf);
-
- fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \
-refcount = %ld, file: %s, line: %d\n",
- buffer, buf->blind.relname, buf->tag.blockNum,
- PrivateRefCount[buffer - 1], file, line);
- }
-}
-#endif
-
-#undef UnpinBuffer
-
-/*
- * UnpinBuffer -- make buffer available for replacement.
- *
- * This should be applied only to shared buffers, never local ones.
- * Bufmgr lock must be held by caller.
- */
-void
-UnpinBuffer(BufferDesc *buf)
-{
- int b = BufferDescriptorGetBuffer(buf) - 1;
-
- Assert(buf->refcount > 0);
- Assert(PrivateRefCount[b] > 0);
- PrivateRefCount[b]--;
- if (PrivateRefCount[b] == 0)
- buf->refcount--;
-
- if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
- buf->refcount == 1)
- {
- /* we just released the last pin other than the waiter's */
- buf->flags &= ~BM_PIN_COUNT_WAITER;
- ProcSendSignal(buf->wait_backend_id);
- }
- else
- {
- /* do nothing */
- }
-}
-
-#ifdef NOT_USED
-void
-UnpinBuffer_Debug(char *file, int line, BufferDesc *buf)
-{
- UnpinBuffer(buf);
- if (ShowPinTrace)
- {
- Buffer buffer = BufferDescriptorGetBuffer(buf);
-
- fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \
-refcount = %ld, file: %s, line: %d\n",
- buffer, buf->blind.relname, buf->tag.blockNum,
- PrivateRefCount[buffer - 1], file, line);
- }
-}
-#endif
-
-
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.65 2004/02/25 19:41:22 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.66 2004/04/19 23:27:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
* moderately-accurate estimates for the big hogs, plus 100K for the
* stuff that's too small to bother with estimating.
*/
- size = BufferShmemSize();
+ size = hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt));
+ size += BufferShmemSize();
size += LockShmemSize(maxBackends);
size += XLOGShmemSize();
size += CLOGShmemSize();
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.68 2004/02/12 15:06:56 wieck Exp $
+ * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.69 2004/04/19 23:27:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "storage/lwlock.h"
-/* Buf Mgr constants */
-/* in bufmgr.c */
-extern int Data_Descriptors;
-extern int Free_List_Descriptor;
-extern int Lookup_List_Descriptor;
-extern int Num_Descriptors;
-
-extern int ShowPinTrace;
-
/*
* Flags for buffer descriptors
*/
* that the backend flushing the buffer doesn't even believe the relation is
* visible yet (its xact may have started before the xact that created the
* rel). The storage manager must be able to cope anyway.
+ *
+ * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
+ * to be fixed to zero them, since this struct is used as a hash key.
*/
typedef struct buftag
{
- RelFileNode rnode;
+ RelFileNode rnode; /* physical relation identifier */
BlockNumber blockNum; /* blknum relative to begin of reln */
} BufferTag;
(a)->rnode = (xx_reln)->rd_node \
)
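
To illustrate the pad-byte warning above: since BufferTag is used directly
as a hash key, uninitialized padding would make hash lookups unreliable. A
padding-safe initialization could zero the whole struct first (a sketch
only; reln and blockNum are illustrative variables assumed in scope):

	/* Sketch: zero the whole tag so pad bytes cannot differ */
	BufferTag	tag;

	MemSet(&tag, 0, sizeof(tag));		/* clears any pad bytes */
	tag.rnode = reln->rd_node;			/* reln: assumed Relation */
	tag.blockNum = blockNum;
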
-#define BUFFERTAG_EQUALS(a,xx_reln,xx_blockNum) \
-( \
- (a)->rnode.tblNode == (xx_reln)->rd_node.tblNode && \
- (a)->rnode.relNode == (xx_reln)->rd_node.relNode && \
- (a)->blockNum == (xx_blockNum) \
-)
#define BUFFERTAGS_EQUAL(a,b) \
( \
(a)->rnode.tblNode == (b)->rnode.tblNode && \
Buffer bufNext; /* link in freelist chain */
SHMEM_OFFSET data; /* pointer to data in buf pool */
- /* tag and id must be together for table lookup */
+ /* tag and id must be together for table lookup (still true?) */
BufferTag tag; /* file/block identifier */
int buf_id; /* buffer's index number (from 0) */
/*
* We can't physically remove items from a disk page if another
* backend has the buffer pinned. Hence, a backend may need to wait
- * for all other pins to go away. This is signaled by setting its own
+ * for all other pins to go away. This is signaled by storing its own
* backend ID into wait_backend_id and setting flag bit
* BM_PIN_COUNT_WAITER. At present, there can be only one such waiter
* per buffer.
#define BL_IO_IN_PROGRESS (1 << 0) /* unimplemented */
#define BL_PIN_COUNT_LOCK (1 << 1)
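
The pin-count-waiter protocol described above has two sides: UnpinBuffer
(shown removed from freelist.c earlier in this patch) signals the waiter
once the pin count drops to one, and the would-be waiter announces itself
roughly like this (a sketch of the bufmgr.c side, assuming the usual
ProcWaitForSignal() counterpart of the ProcSendSignal() call seen in
UnpinBuffer; lock handling and the retry loop are elided):

	/* Sketch: announcing a pin-count wait on buf */
	if (buf->refcount > 1)
	{
		buf->wait_backend_id = MyBackendId;
		buf->flags |= BM_PIN_COUNT_WAITER;
		/* ... release the bufmgr lock, then sleep ... */
		ProcWaitForSignal();
		/* ... on wakeup, recheck the pin count ... */
	}
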
-/* entry for buffer hashtable */
+/* entry for buffer lookup hashtable */
typedef struct
{
- BufferTag key;
- Buffer id;
+ BufferTag key; /* Tag of a disk page */
+ int id; /* CDB id of associated CDB */
} BufferLookupEnt;
/*
* Definitions for the buffer replacement strategy
*/
-#define STRAT_LIST_UNUSED -1
+#define STRAT_LIST_UNUSED (-1)
#define STRAT_LIST_B1 0
#define STRAT_LIST_T1 1
#define STRAT_LIST_T2 2
*/
typedef struct
{
- int prev; /* links in the queue */
+ int prev; /* list links */
int next;
- int list; /* current list */
- BufferTag buf_tag; /* buffer key */
- Buffer buf_id; /* currently assigned data buffer */
+ short list; /* ID of list it is currently in */
+ bool t1_vacuum; /* t => present only because of VACUUM */
TransactionId t1_xid; /* the xid this entry went onto T1 */
+ BufferTag buf_tag; /* page identifier */
+ int buf_id; /* currently assigned data buffer, or -1 */
} BufferStrategyCDB;
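
Because the cache directory lives in shared memory, the CDB lists are
linked by array indices rather than pointers. A rough sketch of what an
LRU-end insertion does (the real STRAT_LRU_INSERT macro in freelist.c also
maintains the list tail and length, omitted here):

	/* Sketch: put CDB number cdb_id at the LRU end (head) of list l */
	static void
	strat_lru_insert_sketch(int cdb_id, int l)
	{
		BufferStrategyCDB *cdb = &StrategyCDB[cdb_id];

		cdb->prev = -1;					/* head entry has no predecessor */
		cdb->next = StrategyControl->listHead[l];
		if (cdb->next >= 0)
			StrategyCDB[cdb->next].prev = cdb_id;
		StrategyControl->listHead[l] = cdb_id;
		cdb->list = (short) l;
	}
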
/*
*/
typedef struct
{
-
int target_T1_size; /* What T1 size are we aiming for */
int listUnusedCDB; /* All unused StrategyCDB */
int listHead[STRAT_NUM_LISTS]; /* ARC lists B1, T1, T2 and B2 */
long num_hit[STRAT_NUM_LISTS];
time_t stat_report;
- BufferStrategyCDB cdb[1]; /* The cache directory */
+ /* Array of CDB's starts here */
+ BufferStrategyCDB cdb[1]; /* VARIABLE SIZE ARRAY */
} BufferStrategyControl;
+
/* counters in buf_init.c */
extern long int ReadBufferCount;
* Bufmgr Interface:
*/
-/* Internal routines: only called by buf.c */
+/* Internal routines: only called by bufmgr */
-/*freelist.c*/
-extern void PinBuffer(BufferDesc *buf);
-extern void UnpinBuffer(BufferDesc *buf);
-extern BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck);
-extern BufferDesc *StrategyGetBuffer(void);
-extern void StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum);
+/* freelist.c */
+extern BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck,
+ int *cdb_found_index);
+extern BufferDesc *StrategyGetBuffer(int *cdb_replace_index);
+extern void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
+ int cdb_found_index, int cdb_replace_index);
extern void StrategyInvalidateBuffer(BufferDesc *buf);
extern void StrategyHintVacuum(bool vacuum_active);
-extern int StrategyDirtyBufferList(int *buffer_dirty, int max_buffers);
+extern int StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags,
+ int max_buffers);
extern void StrategyInitialize(bool init);
/* buf_table.c */
extern void InitBufTable(int size);
extern int BufTableLookup(BufferTag *tagPtr);
-extern bool BufTableInsert(BufferTag *tagPtr, Buffer buf_id);
-extern bool BufTableDelete(BufferTag *tagPtr);
+extern void BufTableInsert(BufferTag *tagPtr, int cdb_id);
+extern void BufTableDelete(BufferTag *tagPtr);
/* bufmgr.c */
extern BufferDesc *BufferDescriptors;