granicus.if.org Git - postgresql/commitdiff
Support unlogged GiST index.
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 11 Feb 2013 20:50:15 +0000 (22:50 +0200)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 11 Feb 2013 21:07:09 +0000 (23:07 +0200)
The reason this wasn't supported before was that GiST indexes need an
increasing sequence to detect concurrent page-splits. In a regular WAL-
logged GiST index, the LSN of the page-split record is used for that
purpose, and in a temporary index, we can get away with a backend-local
counter. Neither of those methods works for an unlogged relation.

To provide such an increasing sequence of numbers, create a "fake LSN"
counter that is saved and restored across clean shutdowns. After a crash,
recovery blows away all unlogged relations, so the counter doesn't need to
survive a crash either.

Jeevan Chalke, based on discussions with Robert Haas, Tom Lane and me.

12 files changed:
doc/src/sgml/ref/create_table.sgml
src/backend/access/gist/gist.c
src/backend/access/gist/gistbuild.c
src/backend/access/gist/gistutil.c
src/backend/access/gist/gistvacuum.c
src/backend/access/transam/xlog.c
src/backend/storage/buffer/bufmgr.c
src/bin/pg_controldata/pg_controldata.c
src/bin/pg_resetxlog/pg_resetxlog.c
src/include/access/gist_private.h
src/include/access/xlog.h
src/include/catalog/pg_control.h

index 8872920446aca9c466a1b0f356605137ef345789..af11eb05a652df7fdec10fa88f3d40442ee040e2 100644 (file)
@@ -182,8 +182,7 @@ CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXI
       automatically truncated after a crash or unclean shutdown.  The contents
       of an unlogged table are also not replicated to standby servers.
       Any indexes created on an unlogged table are automatically unlogged as
-      well; however, unlogged <link linkend="GiST">GiST indexes</link> are
-      currently not supported and cannot be created on an unlogged table.
+      well.
      </para>
     </listitem>
    </varlistentry>
index e2d3390300ee05c5e825ddc36a90c8ce0d736547..eba95f1866451cf4aac5df2e7aeb0fc57b6f902d 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "access/genam.h"
 #include "access/gist_private.h"
+#include "access/heapam_xlog.h"
 #include "catalog/index.h"
 #include "catalog/pg_collation.h"
 #include "miscadmin.h"
@@ -71,9 +72,22 @@ createTempGistContext(void)
 Datum
 gistbuildempty(PG_FUNCTION_ARGS)
 {
-       ereport(ERROR,
-                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                        errmsg("unlogged GiST indexes are not supported")));
+       Relation        index = (Relation) PG_GETARG_POINTER(0);
+       Buffer          buffer;
+
+       /* Initialize the root page */
+       buffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+       LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+       /* Initialize and xlog buffer */
+       START_CRIT_SECTION();
+       GISTInitBuffer(buffer, F_LEAF);
+       MarkBufferDirty(buffer);
+       log_newpage_buffer(buffer);
+       END_CRIT_SECTION();
+
+       /* Unlock and release the buffer */
+       UnlockReleaseBuffer(buffer);
 
        PG_RETURN_VOID();
 }
@@ -391,7 +405,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
                                                                   dist, oldrlink, oldnsn, leftchildbuf,
                                                                   markfollowright);
                else
-                       recptr = GetXLogRecPtrForTemp();
+                       recptr = gistGetFakeLSN(rel);
 
                for (ptr = dist; ptr; ptr = ptr->next)
                {
@@ -448,7 +462,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
                }
                else
                {
-                       recptr = GetXLogRecPtrForTemp();
+                       recptr = gistGetFakeLSN(rel);
                        PageSetLSN(page, recptr);
                }
 
index aec5b5248095904f167d8790d04497f0682a9b3d..0cf22cdf3a88be2e23585f7274c7b817f0d467f4 100644 (file)
@@ -158,16 +158,6 @@ gistbuild(PG_FUNCTION_ARGS)
                elog(ERROR, "index \"%s\" already contains data",
                         RelationGetRelationName(index));
 
-       /*
-        * We can't yet handle unlogged GiST indexes, because we depend on LSNs.
-        * This is duplicative of an error in gistbuildempty, but we want to check
-        * here so as to throw error before doing all the index-build work.
-        */
-       if (heap->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
-               ereport(ERROR,
-                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                errmsg("unlogged GiST indexes are not supported")));
-
        /* no locking is needed */
        buildstate.giststate = initGISTstate(index);
 
@@ -204,7 +194,7 @@ gistbuild(PG_FUNCTION_ARGS)
                PageSetTLI(page, ThisTimeLineID);
        }
        else
-               PageSetLSN(page, GetXLogRecPtrForTemp());
+               PageSetLSN(page, gistGetFakeLSN(heap));
 
        UnlockReleaseBuffer(buffer);
 
index e5c3d69fca7a13ab41feaf1b6ad2c49a28f8cfcb..f7d50ddb7120c83c240a06f71338822e34825dd2 100644 (file)
@@ -798,16 +798,30 @@ gistoptions(PG_FUNCTION_ARGS)
 }
 
 /*
- * Temporary GiST indexes are not WAL-logged, but we need LSNs to detect
- * concurrent page splits anyway. GetXLogRecPtrForTemp() provides a fake
- * sequence of LSNs for that purpose. Each call generates an LSN that is
- * greater than any previous value returned by this function in the same
- * session.
+ * Temporary and unlogged GiST indexes are not WAL-logged, but we need LSNs
+ * to detect concurrent page splits anyway. This function provides a fake
+ * sequence of LSNs for that purpose.
  */
 XLogRecPtr
-GetXLogRecPtrForTemp(void)
+gistGetFakeLSN(Relation rel)
 {
        static XLogRecPtr counter = 1;
-       counter++;
-       return counter;
+
+       if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
+       {
+               /*
+                * Temporary relations are only accessible in our session, so a
+                * simple backend-local counter will do.
+                */
+               return counter++;
+       }
+       else
+       {
+               /*
+                * Unlogged relations are accessible from other backends, and survive
+                * (clean) restarts. GetFakeLSNForUnloggedRel() handles that for us.
+                */
+               Assert(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED);
+               return GetFakeLSNForUnloggedRel();
+       }
 }
index b5be6765d4ed77d8c0ae86980b8e78565ecfe5ee..1d9f83201661a91fd413e5a1b89188d20ef23595 100644 (file)
@@ -238,7 +238,7 @@ gistbulkdelete(PG_FUNCTION_ARGS)
                                        PageSetTLI(page, ThisTimeLineID);
                                }
                                else
-                                       PageSetLSN(page, GetXLogRecPtrForTemp());
+                                       PageSetLSN(page, gistGetFakeLSN(rel));
 
                                END_CRIT_SECTION();
                        }
index 140f9109a6f92e39658d76c8f35a0e32ee269de3..479c14da902a02f609f4f60ebfd657a2512e87f7 100644 (file)
@@ -391,6 +391,10 @@ typedef struct XLogCtlData
        XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
        XLogSegNo       lastRemovedSegNo; /* latest removed/recycled XLOG segment */
 
+       /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck */
+       XLogRecPtr  unloggedLSN;
+       slock_t         ulsn_lck;
+
        /* Protected by WALWriteLock: */
        XLogCtlWrite Write;
 
@@ -3696,6 +3700,31 @@ GetSystemIdentifier(void)
        return ControlFile->system_identifier;
 }
 
+/*
+ * Returns a fake LSN for unlogged relations.
+ *
+ * Each call generates an LSN that is greater than any previous value
+ * returned. The current counter value is saved and restored across clean
+ * shutdowns, but like unlogged relations, does not survive a crash. This can
+ * be used in lieu of real LSN values returned by XLogInsert, if you need an
+ * LSN-like increasing sequence of numbers without writing any WAL.
+ */
+XLogRecPtr
+GetFakeLSNForUnloggedRel(void)
+{
+       XLogRecPtr nextUnloggedLSN;
+
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+
+       /* increment the unloggedLSN counter, need SpinLock */
+       SpinLockAcquire(&xlogctl->ulsn_lck);
+       nextUnloggedLSN = xlogctl->unloggedLSN++;
+       SpinLockRelease(&xlogctl->ulsn_lck);
+
+       return nextUnloggedLSN;
+}
+
 /*
  * Auto-tune the number of XLOG buffers.
  *
@@ -3844,6 +3873,7 @@ XLOGShmemInit(void)
        XLogCtl->WalWriterSleeping = false;
        XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
        SpinLockInit(&XLogCtl->info_lck);
+       SpinLockInit(&XLogCtl->ulsn_lck);
        InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
 
        /*
@@ -3989,6 +4019,7 @@ BootStrapXLOG(void)
        ControlFile->time = checkPoint.time;
        ControlFile->checkPoint = checkPoint.redo;
        ControlFile->checkPointCopy = checkPoint;
+       ControlFile->unloggedLSN = 1;
 
        /* Set important parameter values for use when replaying WAL */
        ControlFile->MaxConnections = MaxConnections;
@@ -5032,6 +5063,16 @@ StartupXLOG(void)
        XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
        XLogCtl->ckptXid = checkPoint.nextXid;
 
+       /*
+        * Initialize unlogged LSN. On a clean shutdown, it's restored from the
+        * control file. On recovery, all unlogged relations are blown away, so
+        * the unlogged LSN counter can be reset too.
+        */
+       if (ControlFile->state == DB_SHUTDOWNED)
+               XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
+       else
+               XLogCtl->unloggedLSN = 1;
+
        /*
         * We must replay WAL entries using the same TimeLineID they were created
         * under, so temporarily adopt the TLI indicated by the checkpoint (see
@@ -6916,6 +6957,16 @@ CreateCheckPoint(int flags)
        /* crash recovery should always recover to the end of WAL */
        ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
        ControlFile->minRecoveryPointTLI = 0;
+
+       /*
+        * Persist unloggedLSN value. It's reset on crash recovery, so this goes
+        * unused on non-shutdown checkpoints, but seems useful to store it always
+        * for debugging purposes.
+        */
+       SpinLockAcquire(&XLogCtl->ulsn_lck);
+       ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
+       SpinLockRelease(&XLogCtl->ulsn_lck);
+
        UpdateControlFile();
        LWLockRelease(ControlFileLock);
 
index 13b80aefc5bfade510b546d38928f4f2ea64d2a5..405ff61130e81319745001e66a2f20f8c3e1bd46 100644 (file)
@@ -1922,9 +1922,24 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
         * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
         * rule that log updates must hit disk before any of the data-file changes
         * they describe do.
+        *
+        * However, this rule does not apply to unlogged relations, which will be
+        * lost after a crash anyway.  Most unlogged relation pages do not bear
+        * LSNs since we never emit WAL records for them, and therefore flushing
+        * up through the buffer LSN would be useless, but harmless.  However, GiST
+        * indexes use LSNs internally to track page-splits, and therefore unlogged
+        * GiST pages bear "fake" LSNs generated by GetFakeLSNForUnloggedRel.  It
+        * is unlikely but possible that the fake LSN counter could advance past
+        * the WAL insertion point; and if it did happen, attempting to flush WAL
+        * through that location would fail, with disastrous system-wide
+        * consequences.  To make sure that can't happen, skip the flush if the
+        * buffer isn't permanent.
         */
-       recptr = BufferGetLSN(buf);
-       XLogFlush(recptr);
+       if (buf->flags & BM_PERMANENT)
+       {
+               recptr = BufferGetLSN(buf);
+               XLogFlush(recptr);
+       }
 
        /*
         * Now it's safe to write buffer to disk. Note that no one else should
index 33725154fd7aacf58bc821b6c591d4f0a413dd08..cab25684d963fbf3b24e5f2f54ad8f7381244a9f 100644 (file)
@@ -240,6 +240,9 @@ main(int argc, char *argv[])
                   ControlFile.checkPointCopy.oldestMultiDB);
        printf(_("Time of latest checkpoint:            %s\n"),
                   ckpttime_str);
+       printf(_("Fake LSN counter for unlogged rels:   %X/%X\n"),
+                  (uint32) (ControlFile.unloggedLSN >> 32),
+                  (uint32) ControlFile.unloggedLSN);
        printf(_("Min recovery ending location:         %X/%X\n"),
                   (uint32) (ControlFile.minRecoveryPoint >> 32),
                   (uint32) ControlFile.minRecoveryPoint);
index 272813eaabffc35da14ce0f5b1f94115e02e2059..317d8606a0bb176f71a807e8bc0effe6e1ce14a5 100644 (file)
@@ -510,6 +510,7 @@ GuessControlValues(void)
        ControlFile.state = DB_SHUTDOWNED;
        ControlFile.time = (pg_time_t) time(NULL);
        ControlFile.checkPoint = ControlFile.checkPointCopy.redo;
+       ControlFile.unloggedLSN = 1;
 
        /* minRecoveryPoint, backupStartPoint and backupEndPoint can be left zero */
 
index c2f9031b4fee40c860f84faf575f3fa488d52903..cae6dbc225cdcad10bb314b79d553dbfbeefda25 100644 (file)
@@ -512,7 +512,7 @@ extern void gistMakeUnionKey(GISTSTATE *giststate, int attno,
                                 GISTENTRY *entry2, bool isnull2,
                                 Datum *dst, bool *dstisnull);
 
-extern XLogRecPtr GetXLogRecPtrForTemp(void);
+extern XLogRecPtr gistGetFakeLSN(Relation rel);
 
 /* gistvacuum.c */
 extern Datum gistbulkdelete(PG_FUNCTION_ARGS);
index 72e324259645774004478bdf8759cd8236fbee6e..8a65492a3464440d42308c461d600305ab23715f 100644 (file)
@@ -294,6 +294,7 @@ extern char *XLogFileNameP(TimeLineID tli, XLogSegNo segno);
 
 extern void UpdateControlFile(void);
 extern uint64 GetSystemIdentifier(void);
+extern XLogRecPtr GetFakeLSNForUnloggedRel(void);
 extern Size XLOGShmemSize(void);
 extern void XLOGShmemInit(void);
 extern void BootStrapXLOG(void);
index 0c647e77ad76aa20e693991ecb1bd79e5c869517..306d18885400f13108302b0921077316c4a59ad8 100644 (file)
@@ -21,7 +21,7 @@
 
 
 /* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION     934
+#define PG_CONTROL_VERSION     935
 
 /*
  * Body of CheckPoint XLOG records.  This is declared here because we keep
@@ -126,6 +126,8 @@ typedef struct ControlFileData
 
        CheckPoint      checkPointCopy; /* copy of last check point record */
 
+       XLogRecPtr  unloggedLSN;        /* current fake LSN value, for unlogged rels */
+
        /*
         * These two values determine the minimum point we must recover up to
         * before starting up: