granicus.if.org Git - postgresql/blobdiff - src/backend/port/sysv_shmem.c
pgindent run for 9.4
[postgresql] / src / backend / port / sysv_shmem.c
index 8d6a1814ecb982d4c3b9237aa66af3bcf226b249..7430757c7533fffddd9973654975df6a03bf1b81 100644 (file)
@@ -6,47 +6,50 @@
  * These routines represent a fairly thin layer on top of SysV shared
  * memory functionality.
  *
- * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/port/sysv_shmem.c,v 1.11 2003/07/14 20:00:22 tgl Exp $
+ *       src/backend/port/sysv_shmem.c
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
-#include <errno.h>
 #include <signal.h>
 #include <unistd.h>
 #include <sys/file.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
 #ifdef HAVE_SYS_IPC_H
 #include <sys/ipc.h>
 #endif
 #ifdef HAVE_SYS_SHM_H
 #include <sys/shm.h>
 #endif
-#ifdef HAVE_KERNEL_OS_H
-#include <kernel/OS.h>
-#endif
 
 #include "miscadmin.h"
+#include "portability/mem.h"
+#include "storage/dsm.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
+#include "utils/guc.h"
 
-typedef int IpcMemoryId;               /* shared memory ID returned by shmget(2) */
 
-#define IPCProtection  (0600)  /* access/modify by user only */
+typedef key_t IpcMemoryKey;            /* shared memory key passed to shmget(2) */
+typedef int IpcMemoryId;               /* shared memory ID returned by shmget(2) */
 
 
-IpcMemoryKey UsedShmemSegID = 0;
-void *UsedShmemSegAddr = NULL;
+unsigned long UsedShmemSegID = 0;
+void      *UsedShmemSegAddr = NULL;
+static Size AnonymousShmemSize;
+static void *AnonymousShmem = NULL;
 
-static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, uint32 size);
+static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
 static void IpcMemoryDetach(int status, Datum shmaddr);
 static void IpcMemoryDelete(int status, Datum shmId);
 static PGShmemHeader *PGSharedMemoryAttach(IpcMemoryKey key,
-                                                                                  IpcMemoryId *shmid);
+                                        IpcMemoryId *shmid);
 
 
 /*
@@ -62,7 +65,7 @@ static PGShmemHeader *PGSharedMemoryAttach(IpcMemoryKey key,
  * print out an error and abort.  Other types of errors are not recoverable.
  */
 static void *
-InternalIpcMemoryCreate(IpcMemoryKey memKey, uint32 size)
+InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
 {
        IpcMemoryId shmid;
        void       *memAddress;
@@ -71,99 +74,121 @@ InternalIpcMemoryCreate(IpcMemoryKey memKey, uint32 size)
 
        if (shmid < 0)
        {
+               int                     shmget_errno = errno;
+
                /*
-                * Fail quietly if error indicates a collision with existing
-                * segment. One would expect EEXIST, given that we said IPC_EXCL,
-                * but perhaps we could get a permission violation instead?  Also,
-                * EIDRM might occur if an old seg is slated for destruction but
-                * not gone yet.
+                * Fail quietly if error indicates a collision with existing segment.
+                * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
+                * we could get a permission violation instead?  Also, EIDRM might
+                * occur if an old seg is slated for destruction but not gone yet.
                 */
-               if (errno == EEXIST || errno == EACCES
+               if (shmget_errno == EEXIST || shmget_errno == EACCES
 #ifdef EIDRM
-                       || errno == EIDRM
+                       || shmget_errno == EIDRM
 #endif
                        )
                        return NULL;
 
                /*
-                * Else complain and abort
+                * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
+                * there is an existing segment but it's smaller than "size" (this is
+                * a result of poorly-thought-out ordering of error tests). To
+                * distinguish between collision and invalid size in such cases, we
+                * make a second try with size = 0.  These kernels do not test size
+                * against SHMMIN in the preexisting-segment case, so we will not get
+                * EINVAL a second time if there is such a segment.
                 */
-               fprintf(stderr, "IpcMemoryCreate: shmget(key=%d, size=%u, 0%o) failed: %s\n",
-                         (int) memKey, size, (IPC_CREAT | IPC_EXCL | IPCProtection),
-                               strerror(errno));
+               if (shmget_errno == EINVAL)
+               {
+                       shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
 
-               if (errno == EINVAL)
-                       fprintf(stderr,
-                                       "\nThis error usually means that PostgreSQL's request for a shared memory\n"
-                                       "segment exceeded your kernel's SHMMAX parameter.  You can either\n"
-                                       "reduce the request size or reconfigure the kernel with larger SHMMAX.\n"
-                                       "To reduce the request size (currently %u bytes), reduce\n"
-                                       "PostgreSQL's shared_buffers parameter (currently %d) and/or\n"
-                                       "its max_connections parameter (currently %d).\n"
-                                       "\n"
-                                       "If the request size is already small, it's possible that it is less than\n"
-                                       "your kernel's SHMMIN parameter, in which case raising the request size or\n"
-                                       "reconfiguring SHMMIN is called for.\n"
-                                       "\n"
-                                       "The PostgreSQL documentation contains more information about shared\n"
-                                       "memory configuration.\n\n",
-                                       size, NBuffers, MaxBackends);
-
-               else if (errno == ENOMEM)
-                       fprintf(stderr,
-                                       "\nThis error usually means that PostgreSQL's request for a shared\n"
-                                       "memory segment exceeded available memory or swap space.\n"
-                                       "To reduce the request size (currently %u bytes), reduce\n"
-                                       "PostgreSQL's shared_buffers parameter (currently %d) and/or\n"
-                                       "its max_connections parameter (currently %d).\n"
-                                       "\n"
-                                       "The PostgreSQL documentation contains more information about shared\n"
-                                       "memory configuration.\n\n",
-                                       size, NBuffers, MaxBackends);
-
-               else if (errno == ENOSPC)
-                       fprintf(stderr,
-                                       "\nThis error does *not* mean that you have run out of disk space.\n"
-                                       "\n"
-                                       "It occurs either if all available shared memory IDs have been taken,\n"
-                                       "in which case you need to raise the SHMMNI parameter in your kernel,\n"
-                                       "or because the system's overall limit for shared memory has been\n"
-                       "reached.  If you cannot increase the shared memory limit,\n"
-                                       "reduce PostgreSQL's shared memory request (currently %u bytes),\n"
-                                       "by reducing its shared_buffers parameter (currently %d) and/or\n"
-                                       "its max_connections parameter (currently %d).\n"
-                                       "\n"
-                                       "The PostgreSQL documentation contains more information about shared\n"
-                                       "memory configuration.\n\n",
-                                       size, NBuffers, MaxBackends);
-
-               proc_exit(1);
+                       if (shmid < 0)
+                       {
+                               /* As above, fail quietly if we verify a collision */
+                               if (errno == EEXIST || errno == EACCES
+#ifdef EIDRM
+                                       || errno == EIDRM
+#endif
+                                       )
+                                       return NULL;
+                               /* Otherwise, fall through to report the original error */
+                       }
+                       else
+                       {
+                               /*
+                                * On most platforms we cannot get here because SHMMIN is
+                                * greater than zero.  However, if we do succeed in creating a
+                                * zero-size segment, free it and then fall through to report
+                                * the original error.
+                                */
+                               if (shmctl(shmid, IPC_RMID, NULL) < 0)
+                                       elog(LOG, "shmctl(%d, %d, 0) failed: %m",
+                                                (int) shmid, IPC_RMID);
+                       }
+               }
+
+               /*
+                * Else complain and abort.
+                *
+                * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
+                * is violated.  SHMALL violation might be reported as either ENOMEM
+                * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
+                * it should be.  SHMMNI violation is ENOSPC, per spec.  Just plain
+                * not-enough-RAM is ENOMEM.
+                */
+               errno = shmget_errno;
+               ereport(FATAL,
+                               (errmsg("could not create shared memory segment: %m"),
+                 errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
+                                       (unsigned long) memKey, size,
+                                       IPC_CREAT | IPC_EXCL | IPCProtection),
+                                (shmget_errno == EINVAL) ?
+                                errhint("This error usually means that PostgreSQL's request for a shared memory "
+                "segment exceeded your kernel's SHMMAX parameter, or possibly that "
+                                                "it is less than "
+                                                "your kernel's SHMMIN parameter.\n"
+               "The PostgreSQL documentation contains more information about shared "
+                                                "memory configuration.") : 0,
+                                (shmget_errno == ENOMEM) ?
+                                errhint("This error usually means that PostgreSQL's request for a shared "
+                                                "memory segment exceeded your kernel's SHMALL parameter.  You might need "
+                                                "to reconfigure the kernel with larger SHMALL.\n"
+               "The PostgreSQL documentation contains more information about shared "
+                                                "memory configuration.") : 0,
+                                (shmget_errno == ENOSPC) ?
+                                errhint("This error does *not* mean that you have run out of disk space.  "
+                                                "It occurs either if all available shared memory IDs have been taken, "
+                                                "in which case you need to raise the SHMMNI parameter in your kernel, "
+                 "or because the system's overall limit for shared memory has been "
+                                                "reached.\n"
+               "The PostgreSQL documentation contains more information about shared "
+                                                "memory configuration.") : 0));
        }
 
        /* Register on-exit routine to delete the new segment */
        on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
 
        /* OK, should be able to attach to the segment */
-#if defined(solaris) && defined(__sparc__)
-       /* use intimate shared memory on SPARC Solaris */
-       memAddress = shmat(shmid, 0, SHM_SHARE_MMU);
-#else
-       memAddress = shmat(shmid, 0, 0);
-#endif
+       memAddress = shmat(shmid, NULL, PG_SHMAT_FLAGS);
 
        if (memAddress == (void *) -1)
-       {
-               fprintf(stderr, "IpcMemoryCreate: shmat(id=%d) failed: %s\n",
-                               shmid, strerror(errno));
-               proc_exit(1);
-       }
+               elog(FATAL, "shmat(id=%d) failed: %m", shmid);
 
        /* Register on-exit routine to detach new segment before deleting */
        on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
 
-       /* Record key and ID in lockfile for data directory. */
-       RecordSharedMemoryInLockFile((unsigned long) memKey,
-                                                                (unsigned long) shmid);
+       /*
+        * Store shmem key and ID in data directory lockfile.  Format to try to
+        * keep it the same length always (trailing junk in the lockfile won't
+        * hurt, but might confuse humans).
+        */
+       {
+               char            line[64];
+
+               sprintf(line, "%9lu %9lu",
+                               (unsigned long) memKey, (unsigned long) shmid);
+               AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
+       }
 
        return memAddress;
 }
@@ -176,14 +201,13 @@ InternalIpcMemoryCreate(IpcMemoryKey memKey, uint32 size)
 static void
 IpcMemoryDetach(int status, Datum shmaddr)
 {
+       /* Detach System V shared memory block. */
        if (shmdt(DatumGetPointer(shmaddr)) < 0)
-               fprintf(stderr, "IpcMemoryDetach: shmdt(%p) failed: %s\n",
-                               DatumGetPointer(shmaddr), strerror(errno));
-
-       /*
-        * We used to report a failure via elog(WARNING), but that's pretty
-        * pointless considering any client has long since disconnected ...
-        */
+               elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
+       /* Release anonymous shared memory block, if any. */
+       if (AnonymousShmem != NULL
+               && munmap(AnonymousShmem, AnonymousShmemSize) < 0)
+               elog(LOG, "munmap(%p) failed: %m", AnonymousShmem);
 }
 
 /****************************************************************************/
@@ -193,94 +217,265 @@ IpcMemoryDetach(int status, Datum shmaddr)
 static void
 IpcMemoryDelete(int status, Datum shmId)
 {
-       if (shmctl(DatumGetInt32(shmId), IPC_RMID, (struct shmid_ds *) NULL) < 0)
-               fprintf(stderr, "IpcMemoryDelete: shmctl(%d, %d, 0) failed: %s\n",
-                               DatumGetInt32(shmId), IPC_RMID, strerror(errno));
-
-       /*
-        * We used to report a failure via elog(WARNING), but that's pretty
-        * pointless considering any client has long since disconnected ...
-        */
+       if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
+               elog(LOG, "shmctl(%d, %d, 0) failed: %m",
+                        DatumGetInt32(shmId), IPC_RMID);
 }
 
 /*
  * PGSharedMemoryIsInUse
  *
  * Is a previously-existing shmem segment still existing and in use?
+ *
+ * The point of this exercise is to detect the case where a prior postmaster
+ * crashed, but it left child backends that are still running.  Therefore
+ * we only care about shmem segments that are associated with the intended
+ * DataDir.  This is an important consideration since accidental matches of
+ * shmem segment IDs are reasonably common.
  */
 bool
 PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 {
        IpcMemoryId shmId = (IpcMemoryId) id2;
        struct shmid_ds shmStat;
+       struct stat statbuf;
+       PGShmemHeader *hdr;
 
        /*
-        * We detect whether a shared memory segment is in use by seeing
-        * whether it (a) exists and (b) has any processes are attached to it.
-        *
-        * If we are unable to perform the stat operation for a reason other than
-        * nonexistence of the segment (most likely, because it doesn't belong
-        * to our userid), assume it is in use.
+        * We detect whether a shared memory segment is in use by seeing whether
+        * it (a) exists and (b) has any processes attached to it.
         */
        if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
        {
                /*
                 * EINVAL actually has multiple possible causes documented in the
-                * shmctl man page, but we assume it must mean the segment no
-                * longer exists.
+                * shmctl man page, but we assume it must mean the segment no longer
+                * exists.
                 */
                if (errno == EINVAL)
                        return false;
-               /* Else assume segment is in use */
+
+               /*
+                * EACCES implies that the segment belongs to some other userid, which
+                * means it is not a Postgres shmem segment (or at least, not one that
+                * is relevant to our data directory).
+                */
+               if (errno == EACCES)
+                       return false;
+
+               /*
+                * Some Linux kernel versions (in fact, all of them as of July 2007)
+                * sometimes return EIDRM when EINVAL is correct.  The Linux kernel
+                * actually does not have any internal state that would justify
+                * returning EIDRM, so we can get away with assuming that EIDRM is
+                * equivalent to EINVAL on that platform.
+                */
+#ifdef HAVE_LINUX_EIDRM_BUG
+               if (errno == EIDRM)
+                       return false;
+#endif
+
+               /*
+                * Otherwise, we had better assume that the segment is in use. The
+                * only likely case is EIDRM, which implies that the segment has been
+                * IPC_RMID'd but there are still processes attached to it.
+                */
                return true;
        }
-       /* If it has attached processes, it's in use */
-       if (shmStat.shm_nattch != 0)
-               return true;
-       return false;
+
+       /* If it has no attached processes, it's not in use */
+       if (shmStat.shm_nattch == 0)
+               return false;
+
+       /*
+        * Try to attach to the segment and see if it matches our data directory.
+        * This avoids shmid-conflict problems on machines that are running
+        * several postmasters under the same userid.
+        */
+       if (stat(DataDir, &statbuf) < 0)
+               return true;                    /* if can't stat, be conservative */
+
+       hdr = (PGShmemHeader *) shmat(shmId, NULL, PG_SHMAT_FLAGS);
+
+       if (hdr == (PGShmemHeader *) -1)
+               return true;                    /* if can't attach, be conservative */
+
+       if (hdr->magic != PGShmemMagic ||
+               hdr->device != statbuf.st_dev ||
+               hdr->inode != statbuf.st_ino)
+       {
+               /*
+                * It's either not a Postgres segment, or not one for my data
+                * directory.  In either case it poses no threat.
+                */
+               shmdt((void *) hdr);
+               return false;
+       }
+
+       /* Trouble --- looks a lot like there are still live backends */
+       shmdt((void *) hdr);
+
+       return true;
 }
 
+/*
+ * Creates an anonymous mmap()ed shared memory segment.
+ *
+ * Pass the requested size in *size.  This function will modify *size to the
+ * actual size of the allocation, if it ends up allocating a segment that is
+ * larger than requested.
+ */
+#ifndef EXEC_BACKEND
+static void *
+CreateAnonymousSegment(Size *size)
+{
+       Size            allocsize = *size;
+       void       *ptr = MAP_FAILED;
+       int                     mmap_errno = 0;
+
+#ifndef MAP_HUGETLB
+       if (huge_pages == HUGE_PAGES_ON)
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("huge TLB pages not supported on this platform")));
+#else
+       if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
+       {
+               /*
+                * Round up the request size to a suitable large value.
+                *
+                * Some Linux kernel versions are known to have a bug, which causes
+                * mmap() with MAP_HUGETLB to fail if the request size is not a
+                * multiple of any supported huge page size. To work around that, we
+                * round up the request size to nearest 2MB. 2MB is the most common
+                * huge page size on affected systems.
+                *
+                * Aside from that bug, even with a kernel that does the allocation
+                * correctly, rounding it up ourselves avoids wasting memory. Without
+                * it, if we for example make an allocation of 2MB + 1 bytes, the
+                * kernel might decide to use two 2MB huge pages for that, and waste
+                * 2MB - 1 bytes of memory. When we do the rounding ourselves, we can use
+                * that space for allocations.
+                */
+               int                     hugepagesize = 2 * 1024 * 1024;
+
+               if (allocsize % hugepagesize != 0)
+                       allocsize += hugepagesize - (allocsize % hugepagesize);
+
+               ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
+                                  PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
+               mmap_errno = errno;
+               if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
+                       elog(DEBUG1, "mmap with MAP_HUGETLB failed, huge pages disabled: %m");
+       }
+#endif
+
+       if (huge_pages == HUGE_PAGES_OFF ||
+               (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED))
+       {
+               /*
+                * use the original size, not the rounded up value, when falling back
+                * to non-huge pages.
+                */
+               allocsize = *size;
+               ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
+                                  PG_MMAP_FLAGS, -1, 0);
+               mmap_errno = errno;
+       }
+
+       if (ptr == MAP_FAILED)
+       {
+               errno = mmap_errno;
+               ereport(FATAL,
+                               (errmsg("could not map anonymous shared memory: %m"),
+                                (mmap_errno == ENOMEM) ?
+                                errhint("This error usually means that PostgreSQL's request "
+                                       "for a shared memory segment exceeded available memory, "
+                                         "swap space or huge pages. To reduce the request size "
+                                                "(currently  %zu bytes), reduce PostgreSQL's shared "
+                                          "memory usage, perhaps by reducing shared_buffers or "
+                                                "max_connections.",
+                                                *size) : 0));
+       }
+
+       *size = allocsize;
+       return ptr;
+}
+#endif
 
 /*
  * PGSharedMemoryCreate
  *
  * Create a shared memory segment of the given size and initialize its
  * standard header.  Also, register an on_shmem_exit callback to release
- * the storage.  For an exec'ed backend, it just attaches.
+ * the storage.
  *
  * Dead Postgres segments are recycled if found, but we do not fail upon
- * collision with non-Postgres shmem segments. The idea here is to detect and
+ * collision with non-Postgres shmem segments.  The idea here is to detect and
  * re-use keys that may have been assigned by a crashed postmaster or backend.
  *
  * makePrivate means to always create a new segment, rather than attach to
  * or recycle any existing segment.
  *
  * The port number is passed for possible use as a key (for SysV, we use
- * it to generate the starting shmem key).     In a standalone backend,
+ * it to generate the starting shmem key).  In a standalone backend,
  * zero will be passed.
  */
 PGShmemHeader *
-PGSharedMemoryCreate(uint32 size, bool makePrivate, int port)
+PGSharedMemoryCreate(Size size, bool makePrivate, int port,
+                                        PGShmemHeader **shim)
 {
        IpcMemoryKey NextShmemSegID;
        void       *memAddress;
        PGShmemHeader *hdr;
        IpcMemoryId shmid;
+       struct stat statbuf;
+       Size            sysvsize;
+
+#if defined(EXEC_BACKEND) || !defined(MAP_HUGETLB)
+       if (huge_pages == HUGE_PAGES_ON)
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("huge pages not supported on this platform")));
+#endif
 
        /* Room for a header? */
        Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
 
-       /* Just attach and return the pointer */
-       if (ExecBackend && UsedShmemSegAddr != NULL && !makePrivate)
-       {
-               if ((hdr = PGSharedMemoryAttach(UsedShmemSegID, &shmid)) == NULL)
-               {
-                       fprintf(stderr, "Unable to attach to proper memory at fixed address: shmget(key=%d, addr=%p) failed: %s\n",
-                               (int) UsedShmemSegID, UsedShmemSegAddr, strerror(errno));
-                       proc_exit(1);
-               }
-               return hdr;
-       }
+       /*
+        * As of PostgreSQL 9.3, we normally allocate only a very small amount of
+        * System V shared memory, and only for the purposes of providing an
+        * interlock to protect the data directory.  The real shared memory block
+        * is allocated using mmap().  This works around the problem that many
+        * systems have very low limits on the amount of System V shared memory
+        * that can be allocated.  Even a limit of a few megabytes will be enough
+        * to run many copies of PostgreSQL without needing to adjust system
+        * settings.
+        *
+        * We assume that no one will attempt to run PostgreSQL 9.3 or later on
+        * systems that are ancient enough that anonymous shared memory is not
+        * supported, such as pre-2.4 versions of Linux.  If that turns out to be
+        * false, we might need to add a run-time test here and do this only if
+        * the running kernel supports it.
+        *
+        * However, we disable this logic in the EXEC_BACKEND case, and fall back
+        * to the old method of allocating the entire segment using System V
+        * shared memory, because there's no way to attach an mmap'd segment to a
+        * process after exec().  Since EXEC_BACKEND is intended only for
+        * developer use, this shouldn't be a big problem.
+        */
+#ifndef EXEC_BACKEND
+       AnonymousShmem = CreateAnonymousSegment(&size);
+       AnonymousShmemSize = size;
+
+       /* Now we need only allocate a minimal-sized SysV shmem block. */
+       sysvsize = sizeof(PGShmemHeader);
+#else
+       sysvsize = size;
+#endif
+
+       /* Make sure PGSharedMemoryAttach doesn't fail without need */
+       UsedShmemSegAddr = NULL;
 
        /* Loop till we find a free IPC key */
        NextShmemSegID = port * 1000;
@@ -288,7 +483,7 @@ PGSharedMemoryCreate(uint32 size, bool makePrivate, int port)
        for (NextShmemSegID++;; NextShmemSegID++)
        {
                /* Try to create new segment */
-               memAddress = InternalIpcMemoryCreate(NextShmemSegID, size);
+               memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
                if (memAddress)
                        break;                          /* successful create and attach */
 
@@ -315,54 +510,145 @@ PGSharedMemoryCreate(uint32 size, bool makePrivate, int port)
                }
 
                /*
-                * The segment appears to be from a dead Postgres process, or from
-                * a previous cycle of life in this same process.  Zap it, if
-                * possible.  This probably shouldn't fail, but if it does, assume
-                * the segment belongs to someone else after all, and continue
-                * quietly.
+                * The segment appears to be from a dead Postgres process, or from a
+                * previous cycle of life in this same process.  Zap it, if possible,
+                * and any associated dynamic shared memory segments, as well. This
+                * probably shouldn't fail, but if it does, assume the segment belongs
+                * to someone else after all, and continue quietly.
                 */
+               if (hdr->dsm_control != 0)
+                       dsm_cleanup_using_control_segment(hdr->dsm_control);
                shmdt(memAddress);
-               if (shmctl(shmid, IPC_RMID, (struct shmid_ds *) NULL) < 0)
+               if (shmctl(shmid, IPC_RMID, NULL) < 0)
                        continue;
 
                /*
                 * Now try again to create the segment.
                 */
-               memAddress = InternalIpcMemoryCreate(NextShmemSegID, size);
+               memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
                if (memAddress)
                        break;                          /* successful create and attach */
 
                /*
-                * Can only get here if some other process managed to create the
-                * same shmem key before we did.  Let him have that one, loop
-                * around to try next key.
+                * Can only get here if some other process managed to create the same
+                * shmem key before we did.  Let him have that one, loop around to try
+                * next key.
                 */
        }
 
        /*
-        * OK, we created a new segment.  Mark it as created by this process.
-        * The order of assignments here is critical so that another Postgres
-        * process can't see the header as valid but belonging to an invalid
-        * PID!
+        * OK, we created a new segment.  Mark it as created by this process. The
+        * order of assignments here is critical so that another Postgres process
+        * can't see the header as valid but belonging to an invalid PID!
         */
        hdr = (PGShmemHeader *) memAddress;
        hdr->creatorPID = getpid();
        hdr->magic = PGShmemMagic;
+       hdr->dsm_control = 0;
+
+       /* Fill in the data directory ID info, too */
+       if (stat(DataDir, &statbuf) < 0)
+               ereport(FATAL,
+                               (errcode_for_file_access(),
+                                errmsg("could not stat data directory \"%s\": %m",
+                                               DataDir)));
+       hdr->device = statbuf.st_dev;
+       hdr->inode = statbuf.st_ino;
 
        /*
         * Initialize space allocation status for segment.
         */
        hdr->totalsize = size;
        hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+       *shim = hdr;
+
+       /* Save info for possible future use */
+       UsedShmemSegAddr = memAddress;
+       UsedShmemSegID = (unsigned long) NextShmemSegID;
+
+       /*
+        * If AnonymousShmem is NULL here, then we're not using anonymous shared
+        * memory, and should return a pointer to the System V shared memory
+        * block. Otherwise, the System V shared memory block is only a shim, and
+        * we must return a pointer to the real block.
+        */
+       if (AnonymousShmem == NULL)
+               return hdr;
+       memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
+       return (PGShmemHeader *) AnonymousShmem;
+}
+
+#ifdef EXEC_BACKEND
+
+/*
+ * PGSharedMemoryReAttach
+ *
+ * Re-attach to an already existing shared memory segment.  This is not
+ * used in the non-EXEC_BACKEND case, because postmaster children inherit
+ * the shared memory segment attachment via fork().
+ *
+ * UsedShmemSegID and UsedShmemSegAddr are implicit parameters; the caller
+ * must already have restored them to the postmaster's values.  Reports
+ * FATAL if the attach fails or lands at a different address than expected.
+ */
+void
+PGSharedMemoryReAttach(void)
+{
+       IpcMemoryId shmid;
+       void       *hdr;
+       void       *origUsedShmemSegAddr = UsedShmemSegAddr;
+
+       Assert(UsedShmemSegAddr != NULL);
+       Assert(IsUnderPostmaster);
+
+#ifdef __CYGWIN__
+       /* cygipc (currently) appears not to detach on exec; detach by hand, then restore the address that PGSharedMemoryDetach cleared. */
+       PGSharedMemoryDetach();
+       UsedShmemSegAddr = origUsedShmemSegAddr;
+#endif
 
-       
-       if (ExecBackend && UsedShmemSegAddr == NULL && !makePrivate)
+       elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
+       hdr = (void *) PGSharedMemoryAttach((IpcMemoryKey) UsedShmemSegID, &shmid);
+       if (hdr == NULL)
+               elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
+                        (int) UsedShmemSegID, UsedShmemSegAddr);
+       if (hdr != origUsedShmemSegAddr)
+               elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
+                        hdr, origUsedShmemSegAddr);
+       dsm_set_control_handle(((PGShmemHeader *) hdr)->dsm_control);
+
+       UsedShmemSegAddr = hdr;         /* probably redundant: hdr was just checked against the original address */
+}
+#endif   /* EXEC_BACKEND */
+
+/*
+ * PGSharedMemoryDetach
+ *
+ * Detach from the System V shared memory segment, if still attached, and
+ * unmap the anonymous shared memory block, if any.  This is not intended
+ * for use by the process that originally created the segment (it will have
+ * an on_shmem_exit callback registered to do that).  Rather, this is for
+ * subprocesses that have inherited an attachment and want to get rid of it.
+ */
+void
+PGSharedMemoryDetach(void)
+{
+       if (UsedShmemSegAddr != NULL)
        {
-               UsedShmemSegAddr = memAddress;
-               UsedShmemSegID = NextShmemSegID;
+               if ((shmdt(UsedShmemSegAddr) < 0)
+#if defined(EXEC_BACKEND) && defined(__CYGWIN__)
+               /* Work-around for cygipc exec bug: retry with shmdt(NULL) before complaining */
+                       && shmdt(NULL) < 0
+#endif
+                       )
+                       elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
+               UsedShmemSegAddr = NULL;
        }
-       
-       return hdr;
+
+       /* Release anonymous shared memory block, if any; failure is only logged.  NOTE(review): AnonymousShmem is not reset to NULL afterwards, unlike UsedShmemSegAddr -- confirm a repeated call cannot munmap() it twice. */
+       if (AnonymousShmem != NULL
+               && munmap(AnonymousShmem, AnonymousShmemSize) < 0)
+               elog(LOG, "munmap(%p) failed: %m", AnonymousShmem);
 }
 
 
@@ -379,22 +665,14 @@ PGSharedMemoryAttach(IpcMemoryKey key, IpcMemoryId *shmid)
        if ((*shmid = shmget(key, sizeof(PGShmemHeader), 0)) < 0)
                return NULL;
 
-       hdr = (PGShmemHeader *) shmat(*shmid,
-                                                                 UsedShmemSegAddr,
-#if defined(solaris) && defined(__sparc__)
-                                                                 /* use intimate shared memory on Solaris */
-                                                                 SHM_SHARE_MMU
-#else
-                                                                 0
-#endif
-               );
+       hdr = (PGShmemHeader *) shmat(*shmid, UsedShmemSegAddr, PG_SHMAT_FLAGS);
 
        if (hdr == (PGShmemHeader *) -1)
                return NULL;                    /* failed: must be some other app's */
 
        if (hdr->magic != PGShmemMagic)
        {
-               shmdt(hdr);
+               shmdt((void *) hdr);
                return NULL;                    /* segment belongs to a non-Postgres app */
        }