1 /*-------------------------------------------------------------------------
4 * Implement shared memory using SysV facilities
6 * These routines represent a fairly thin layer on top of SysV shared
7 * memory functionality.
9 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
13 * src/backend/port/sysv_shmem.c
15 *-------------------------------------------------------------------------
31 #include "miscadmin.h"
32 #include "portability/mem.h"
33 #include "storage/dsm.h"
34 #include "storage/ipc.h"
35 #include "storage/pg_shmem.h"
36 #include "utils/guc.h"
/*
 * File-level types, state and forward declarations for the SysV shared
 * memory layer.
 */
39 typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
40 typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
/*
 * Key of the SysV segment this process is using; assigned by
 * PGSharedMemoryCreate after a segment has been created.
 */
43 unsigned long UsedShmemSegID = 0;
/*
 * Address at which that segment is attached.  NULL means "not attached";
 * PGSharedMemoryDetach and PGSharedMemoryNoReAttach reset it to NULL.
 */
44 void *UsedShmemSegAddr = NULL;
/*
 * Size and address of the anonymous mmap()ed block obtained from
 * CreateAnonymousSegment.  AnonymousShmem stays NULL when no such block is
 * in use, which is how the rest of the file tests for it.
 */
45 static Size AnonymousShmemSize;
46 static void *AnonymousShmem = NULL;
/* Forward declarations of file-local helpers. */
48 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
49 static void IpcMemoryDetach(int status, Datum shmaddr);
50 static void IpcMemoryDelete(int status, Datum shmId);
51 static PGShmemHeader *PGSharedMemoryAttach(IpcMemoryKey key,
56 * InternalIpcMemoryCreate(memKey, size)
58 * Attempt to create a new shared memory segment with the specified key.
59 * Will fail (return NULL) if such a segment already exists. If successful,
60 * attach the segment to the current process and return its attached address.
61 * On success, callbacks are registered with on_shmem_exit to detach and
62 * delete the segment when on_shmem_exit is called.
64 * If we fail with a failure code other than collision-with-existing-segment,
65 * print out an error and abort. Other types of errors are not recoverable.
/*
 * Reviewer summary of InternalIpcMemoryCreate (visible control flow):
 *
 *	memKey	key passed to shmget()
 *	size	requested segment size in bytes
 *
 * 1. shmget() is called with IPC_CREAT | IPC_EXCL | IPCProtection.
 * 2. On failure the errno value is saved in shmget_errno.  EEXIST, EACCES
 *	  and EIDRM are treated as "collision with an existing segment".
 * 3. For EINVAL a second shmget() with size 0 is made to tell a collision
 *	  from a bad size; if that unexpectedly succeeds, the zero-size segment
 *	  is removed again with shmctl(IPC_RMID).
 * 4. Any other outcome restores errno from shmget_errno and reports
 *	  "could not create shared memory segment", adding a hint for EINVAL,
 *	  ENOMEM or ENOSPC.
 * 5. On success the IpcMemoryDelete callback is registered first, then the
 *	  segment is attached with shmat() (failure is FATAL), then the
 *	  IpcMemoryDetach callback is registered.
 * 6. The key and segment ID are formatted as "%9lu %9lu" and stored in the
 *	  data directory lock file via AddToDataDirLockFile().
 *
 * NOTE(review): several short lines of this function (braces, return
 * statements, preprocessor guards, local declarations) are not visible in
 * this copy of the file, so the exact return paths cannot be confirmed
 * here; the header comment states NULL on collision and the attached
 * address on success.
 * NOTE(review): sprintf() into "line" is unbounded and the buffer's
 * declaration is not visible here -- confirm its size or use snprintf().
 */
68 InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
/* IPC_EXCL: per the header comment, an existing segment makes this fail */
73 shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
/*
 * Capture errno at once: the retry shmget() and the shmctl() below can
 * overwrite it, and it is restored just before the error is reported.
 */
77 int shmget_errno = errno;
80 * Fail quietly if error indicates a collision with existing segment.
81 * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
82 * we could get a permission violation instead? Also, EIDRM might
83 * occur if an old seg is slated for destruction but not gone yet.
85 if (shmget_errno == EEXIST || shmget_errno == EACCES
87 || shmget_errno == EIDRM
93 * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
94 * there is an existing segment but it's smaller than "size" (this is
95 * a result of poorly-thought-out ordering of error tests). To
96 * distinguish between collision and invalid size in such cases, we
97 * make a second try with size = 0. These kernels do not test size
98 * against SHMMIN in the preexisting-segment case, so we will not get
99 * EINVAL a second time if there is such a segment.
101 if (shmget_errno == EINVAL)
103 shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
107 /* As above, fail quietly if we verify a collision */
108 if (errno == EEXIST || errno == EACCES
114 /* Otherwise, fall through to report the original error */
119 * On most platforms we cannot get here because SHMMIN is
120 * greater than zero. However, if we do succeed in creating a
121 * zero-size segment, free it and then fall through to report
122 * the original error.
124 if (shmctl(shmid, IPC_RMID, NULL) < 0)
125 elog(LOG, "shmctl(%d, %d, 0) failed: %m",
126 (int) shmid, IPC_RMID);
131 * Else complain and abort.
133 * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
134 * is violated. SHMALL violation might be reported as either ENOMEM
135 * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
136 * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
137 * not-enough-RAM is ENOMEM.
139 errno = shmget_errno;
141 (errmsg("could not create shared memory segment: %m"),
142 errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
143 (unsigned long) memKey, size,
144 IPC_CREAT | IPC_EXCL | IPCProtection),
145 (shmget_errno == EINVAL) ?
146 errhint("This error usually means that PostgreSQL's request for a shared memory "
147 "segment exceeded your kernel's SHMMAX parameter, or possibly that "
149 "your kernel's SHMMIN parameter.\n"
150 "The PostgreSQL documentation contains more information about shared "
151 "memory configuration.") : 0,
152 (shmget_errno == ENOMEM) ?
153 errhint("This error usually means that PostgreSQL's request for a shared "
154 "memory segment exceeded your kernel's SHMALL parameter. You might need "
155 "to reconfigure the kernel with larger SHMALL.\n"
156 "The PostgreSQL documentation contains more information about shared "
157 "memory configuration.") : 0,
158 (shmget_errno == ENOSPC) ?
159 errhint("This error does *not* mean that you have run out of disk space. "
160 "It occurs either if all available shared memory IDs have been taken, "
161 "in which case you need to raise the SHMMNI parameter in your kernel, "
162 "or because the system's overall limit for shared memory has been "
164 "The PostgreSQL documentation contains more information about shared "
165 "memory configuration.") : 0));
168 /* Register on-exit routine to delete the new segment */
169 on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
171 /* OK, should be able to attach to the segment */
172 memAddress = shmat(shmid, NULL, PG_SHMAT_FLAGS);
174 if (memAddress == (void *) -1)
175 elog(FATAL, "shmat(id=%d) failed: %m", shmid);
177 /* Register on-exit routine to detach new segment before deleting */
178 on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
181 * Store shmem key and ID in data directory lockfile. Format to try to
182 * keep it the same length always (trailing junk in the lockfile won't
183 * hurt, but might confuse humans).
188 sprintf(line, "%9lu %9lu",
189 (unsigned long) memKey, (unsigned long) shmid);
190 AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
196 /****************************************************************************/
197 /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
198 /* from process' address space */
199 /* (called as an on_shmem_exit callback, hence funny argument list) */
200 /****************************************************************************/
202 IpcMemoryDetach(int status, Datum shmaddr)
204 /* Detach System V shared memory block. */
205 if (shmdt(DatumGetPointer(shmaddr)) < 0)
206 elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
207 /* Release anonymous shared memory block, if any. */
208 if (AnonymousShmem != NULL
209 && munmap(AnonymousShmem, AnonymousShmemSize) < 0)
210 elog(LOG, "munmap(%p) failed: %m", AnonymousShmem);
213 /****************************************************************************/
214 /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
215 /* (called as an on_shmem_exit callback, hence funny argument list) */
216 /****************************************************************************/
218 IpcMemoryDelete(int status, Datum shmId)
220 if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
221 elog(LOG, "shmctl(%d, %d, 0) failed: %m",
222 DatumGetInt32(shmId), IPC_RMID);
226 * PGSharedMemoryIsInUse
228 * Is a previously-existing shmem segment still existing and in use?
230 * The point of this exercise is to detect the case where a prior postmaster
231 * crashed, but it left child backends that are still running. Therefore
232 * we only care about shmem segments that are associated with the intended
233 * DataDir. This is an important consideration since accidental matches of
234 * shmem segment IDs are reasonably common.
/*
 * Reviewer summary of PGSharedMemoryIsInUse (visible control flow):
 *
 *	id1		not referenced in the lines visible here; presumably the shmem
 *			key recorded in the lock file -- TODO confirm against the caller
 *	id2		segment ID; cast to IpcMemoryId and used for all system calls
 *
 * The checks run from cheapest to most conclusive:
 * 1. shmctl(IPC_STAT) -- if it fails, the decision depends on errno as
 *	  described in the comments below (the errno tests themselves are not
 *	  visible in this copy).
 * 2. shm_nattch == 0 means nobody is attached, so the segment is not in use.
 * 3. stat(DataDir) and shmat() are needed to inspect the header; if either
 *	  fails the function answers true, i.e. errs on the side of "in use".
 * 4. The segment only counts when its header carries PGShmemMagic and the
 *	  device/inode recorded in it match the data directory.
 *
 * NOTE(review): no shmdt() is visible on either path after the shmat()
 * below -- verify that the segment is detached again before returning.
 */
237 PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
239 IpcMemoryId shmId = (IpcMemoryId) id2;
240 struct shmid_ds shmStat;
245 * We detect whether a shared memory segment is in use by seeing whether
246 * it (a) exists and (b) has any processes attached to it.
248 if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
251 * EINVAL actually has multiple possible causes documented in the
252 * shmctl man page, but we assume it must mean the segment no longer
259 * EACCES implies that the segment belongs to some other userid, which
260 * means it is not a Postgres shmem segment (or at least, not one that
261 * is relevant to our data directory).
267 * Some Linux kernel versions (in fact, all of them as of July 2007)
268 * sometimes return EIDRM when EINVAL is correct. The Linux kernel
269 * actually does not have any internal state that would justify
270 * returning EIDRM, so we can get away with assuming that EIDRM is
271 * equivalent to EINVAL on that platform.
273 #ifdef HAVE_LINUX_EIDRM_BUG
279 * Otherwise, we had better assume that the segment is in use. The
280 * only likely case is EIDRM, which implies that the segment has been
281 * IPC_RMID'd but there are still processes attached to it.
286 /* If it has no attached processes, it's not in use */
287 if (shmStat.shm_nattch == 0)
291 * Try to attach to the segment and see if it matches our data directory.
292 * This avoids shmid-conflict problems on machines that are running
293 * several postmasters under the same userid.
295 if (stat(DataDir, &statbuf) < 0)
296 return true; /* if can't stat, be conservative */
298 hdr = (PGShmemHeader *) shmat(shmId, NULL, PG_SHMAT_FLAGS);
300 if (hdr == (PGShmemHeader *) -1)
301 return true; /* if can't attach, be conservative */
303 if (hdr->magic != PGShmemMagic ||
304 hdr->device != statbuf.st_dev ||
305 hdr->inode != statbuf.st_ino)
308 * It's either not a Postgres segment, or not one for my data
309 * directory. In either case it poses no threat.
315 /* Trouble --- looks a lot like there's still live backends */
322 * Creates an anonymous mmap()ed shared memory segment.
324 * Pass the requested size in *size. This function will modify *size to the
325 * actual size of the allocation, if it ends up allocating a segment that is
326 * larger than requested.
/*
 * Reviewer summary of CreateAnonymousSegment (visible control flow):
 *
 *	size	in/out: requested size on entry; per the header comment it is
 *			updated to the size actually allocated
 *
 * 1. With huge_pages set to HUGE_PAGES_ON or HUGE_PAGES_TRY the request is
 *	  rounded up to a multiple of 2MB and mapped with MAP_HUGETLB.  In
 *	  HUGE_PAGES_TRY mode a failure is only logged at DEBUG1.
 * 2. With HUGE_PAGES_OFF, or after a failed HUGE_PAGES_TRY attempt, a plain
 *	  mmap() with PG_MMAP_FLAGS is made.
 * 3. If the pointer is still MAP_FAILED, "could not map anonymous shared
 *	  memory" is reported, with a sizing hint when the errno was ENOMEM.
 *
 * NOTE(review): the two huge_pages tests at the top look like alternative
 * preprocessor branches (platforms without / with MAP_HUGETLB); the
 * directive lines are not visible in this copy -- confirm.
 * NOTE(review): the comment before the fallback mmap() says the original,
 * un-rounded size is used, but the statement resetting allocsize and the
 * one saving mmap_errno are not visible here -- confirm they are present.
 */
330 CreateAnonymousSegment(Size *size)
332 Size allocsize = *size;
333 void *ptr = MAP_FAILED;
337 if (huge_pages == HUGE_PAGES_ON)
339 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
340 errmsg("huge TLB pages not supported on this platform")));
342 if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
345 * Round up the request size to a suitable large value.
347 * Some Linux kernel versions are known to have a bug, which causes
348 * mmap() with MAP_HUGETLB to fail if the request size is not a
349 * multiple of any supported huge page size. To work around that, we
350 * round up the request size to nearest 2MB. 2MB is the most common
351 * huge page page size on affected systems.
353 * Aside from that bug, even with a kernel that does the allocation
354 * correctly, rounding it up ourselves avoids wasting memory. Without
355 * it, if we for example make an allocation of 2MB + 1 bytes, the
356 * kernel might decide to use two 2MB huge pages for that, and waste 2
357 * MB - 1 of memory. When we do the rounding ourselves, we can use
358 * that space for allocations.
360 int hugepagesize = 2 * 1024 * 1024;
362 if (allocsize % hugepagesize != 0)
363 allocsize += hugepagesize - (allocsize % hugepagesize);
365 ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
366 PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
368 if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
369 elog(DEBUG1, "mmap with MAP_HUGETLB failed, huge pages disabled: %m");
373 if (huge_pages == HUGE_PAGES_OFF ||
374 (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED))
377 * use the original size, not the rounded up value, when falling back
381 ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
382 PG_MMAP_FLAGS, -1, 0);
386 if (ptr == MAP_FAILED)
390 (errmsg("could not map anonymous shared memory: %m"),
391 (mmap_errno == ENOMEM) ?
392 errhint("This error usually means that PostgreSQL's request "
393 "for a shared memory segment exceeded available memory, "
394 "swap space, or huge pages. To reduce the request size "
395 "(currently %zu bytes), reduce PostgreSQL's shared "
396 "memory usage, perhaps by reducing shared_buffers or "
407 * PGSharedMemoryCreate
409 * Create a shared memory segment of the given size and initialize its
410 * standard header. Also, register an on_shmem_exit callback to release
413 * Dead Postgres segments are recycled if found, but we do not fail upon
414 * collision with non-Postgres shmem segments. The idea here is to detect and
415 * re-use keys that may have been assigned by a crashed postmaster or backend.
417 * makePrivate means to always create a new segment, rather than attach to
418 * or recycle any existing segment.
420 * The port number is passed for possible use as a key (for SysV, we use
421 * it to generate the starting shmem key). In a standalone backend,
422 * zero will be passed.
/*
 * Reviewer summary of PGSharedMemoryCreate (visible control flow):
 *
 *	size		 total amount of shared memory wanted
 *	makePrivate	 never attach to or recycle an existing segment
 *	port		 seeds the key search: the first key tried is port * 1000 + 1
 *	shim		 not referenced in the lines visible here; presumably
 *				 receives the SysV header pointer -- TODO confirm
 *
 * 1. huge_pages == HUGE_PAGES_ON is rejected with
 *	  ERRCODE_FEATURE_NOT_SUPPORTED when EXEC_BACKEND is defined or
 *	  MAP_HUGETLB is not.
 * 2. The real memory comes from CreateAnonymousSegment(&size), and the SysV
 *	  request shrinks to sizeof(PGShmemHeader).  NOTE(review): the guard
 *	  that disables this step under EXEC_BACKEND is described in the
 *	  comment below but is not visible in this copy.
 * 3. UsedShmemSegAddr is cleared first because PGSharedMemoryAttach passes
 *	  it to shmat() as the requested attach address.
 * 4. Keys are tried in ascending order.  When creation collides with an
 *	  existing segment, that segment is attached and inspected: it is left
 *	  alone if its creatorPID is another process that still exists
 *	  (kill(pid, 0) succeeds, or fails with anything but ESRCH).  Otherwise
 *	  its dynamic shared memory is cleaned up when dsm_control is nonzero,
 *	  the segment is removed with shmctl(IPC_RMID), and creation is retried
 *	  with the same key.
 * 5. The new header is filled in: creatorPID, then magic, then dsm_control,
 *	  the device/inode of DataDir, totalsize and freeoffset.
 * 6. UsedShmemSegAddr and UsedShmemSegID are recorded.
 * 7. When an anonymous block is in use, the header is copied to its start
 *	  and the anonymous block is what gets returned.
 *
 * NOTE(review): short lines (braces, "continue"/"return" statements, local
 * declarations, shmdt() calls) are missing from this copy of the function;
 * the steps above describe only what the visible lines establish.
 */
425 PGSharedMemoryCreate(Size size, bool makePrivate, int port,
426 PGShmemHeader **shim)
428 IpcMemoryKey NextShmemSegID;
435 #if defined(EXEC_BACKEND) || !defined(MAP_HUGETLB)
436 if (huge_pages == HUGE_PAGES_ON)
438 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
439 errmsg("huge pages not supported on this platform")));
442 /* Room for a header? */
443 Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
446 * As of PostgreSQL 9.3, we normally allocate only a very small amount of
447 * System V shared memory, and only for the purposes of providing an
448 * interlock to protect the data directory. The real shared memory block
449 * is allocated using mmap(). This works around the problem that many
450 * systems have very low limits on the amount of System V shared memory
451 * that can be allocated. Even a limit of a few megabytes will be enough
452 * to run many copies of PostgreSQL without needing to adjust system
455 * We assume that no one will attempt to run PostgreSQL 9.3 or later on
456 * systems that are ancient enough that anonymous shared memory is not
457 * supported, such as pre-2.4 versions of Linux. If that turns out to be
458 * false, we might need to add a run-time test here and do this only if
459 * the running kernel supports it.
461 * However, we disable this logic in the EXEC_BACKEND case, and fall back
462 * to the old method of allocating the entire segment using System V
463 * shared memory, because there's no way to attach an mmap'd segment to a
464 * process after exec(). Since EXEC_BACKEND is intended only for
465 * developer use, this shouldn't be a big problem.
468 AnonymousShmem = CreateAnonymousSegment(&size);
469 AnonymousShmemSize = size;
471 /* Now we need only allocate a minimal-sized SysV shmem block. */
472 sysvsize = sizeof(PGShmemHeader);
477 /* Make sure PGSharedMemoryAttach doesn't fail without need */
478 UsedShmemSegAddr = NULL;
480 /* Loop till we find a free IPC key */
481 NextShmemSegID = port * 1000;
483 for (NextShmemSegID++;; NextShmemSegID++)
485 /* Try to create new segment */
486 memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
488 break; /* successful create and attach */
490 /* Check shared memory and possibly remove and recreate */
492 if (makePrivate) /* a standalone backend shouldn't do this */
495 if ((memAddress = PGSharedMemoryAttach(NextShmemSegID, &shmid)) == NULL)
496 continue; /* can't attach, not one of mine */
499 * If I am not the creator and it belongs to an extant process,
502 hdr = (PGShmemHeader *) memAddress;
503 if (hdr->creatorPID != getpid())
505 if (kill(hdr->creatorPID, 0) == 0 || errno != ESRCH)
508 continue; /* segment belongs to a live process */
513 * The segment appears to be from a dead Postgres process, or from a
514 * previous cycle of life in this same process. Zap it, if possible,
515 * and any associated dynamic shared memory segments, as well. This
516 * probably shouldn't fail, but if it does, assume the segment belongs
517 * to someone else after all, and continue quietly.
519 if (hdr->dsm_control != 0)
520 dsm_cleanup_using_control_segment(hdr->dsm_control);
522 if (shmctl(shmid, IPC_RMID, NULL) < 0)
526 * Now try again to create the segment.
528 memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
530 break; /* successful create and attach */
533 * Can only get here if some other process managed to create the same
534 * shmem key before we did. Let him have that one, loop around to try
540 * OK, we created a new segment. Mark it as created by this process. The
541 * order of assignments here is critical so that another Postgres process
542 * can't see the header as valid but belonging to an invalid PID!
544 hdr = (PGShmemHeader *) memAddress;
545 hdr->creatorPID = getpid();
546 hdr->magic = PGShmemMagic;
547 hdr->dsm_control = 0;
549 /* Fill in the data directory ID info, too */
550 if (stat(DataDir, &statbuf) < 0)
552 (errcode_for_file_access(),
553 errmsg("could not stat data directory \"%s\": %m",
555 hdr->device = statbuf.st_dev;
556 hdr->inode = statbuf.st_ino;
559 * Initialize space allocation status for segment.
561 hdr->totalsize = size;
562 hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
565 /* Save info for possible future use */
566 UsedShmemSegAddr = memAddress;
567 UsedShmemSegID = (unsigned long) NextShmemSegID;
570 * If AnonymousShmem is NULL here, then we're not using anonymous shared
571 * memory, and should return a pointer to the System V shared memory
572 * block. Otherwise, the System V shared memory block is only a shim, and
573 * we must return a pointer to the real block.
575 if (AnonymousShmem == NULL)
577 memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
578 return (PGShmemHeader *) AnonymousShmem;
584 * PGSharedMemoryReAttach
586 * This is called during startup of a postmaster child process to re-attach to
587 * an already existing shared memory segment. This is needed only in the
588 * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
589 * segment attachment via fork().
591 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
592 * routine. The caller must have already restored them to the postmaster's
/*
 * Reviewer summary of PGSharedMemoryReAttach (visible control flow):
 *
 * Takes no arguments; UsedShmemSegID and UsedShmemSegAddr must already hold
 * the postmaster's values (both Asserts below check the preconditions).
 *
 * 1. The expected address is remembered in origUsedShmemSegAddr, because
 *	  PGSharedMemoryDetach() resets UsedShmemSegAddr to NULL and it has to
 *	  be put back before attaching.  NOTE(review): the detach-and-restore
 *	  pair is presumably compiled only for Cygwin (see its comment); the
 *	  guarding directive is not visible in this copy -- confirm.
 * 2. PGSharedMemoryAttach() attaches the segment identified by
 *	  UsedShmemSegID, asking shmat() for the address in UsedShmemSegAddr.
 * 3. Failure to attach, or attaching at any address other than the
 *	  remembered one, is FATAL.
 * 4. The dsm control handle stored in the segment header is passed to
 *	  dsm_set_control_handle().
 *
 * NOTE(review): the test guarding the first FATAL report is not visible in
 * this copy; presumably it checks for a NULL result -- confirm.
 */
596 PGSharedMemoryReAttach(void)
600 void *origUsedShmemSegAddr = UsedShmemSegAddr;
602 Assert(UsedShmemSegAddr != NULL);
603 Assert(IsUnderPostmaster);
606 /* cygipc (currently) appears to not detach on exec. */
607 PGSharedMemoryDetach();
608 UsedShmemSegAddr = origUsedShmemSegAddr;
611 elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
612 hdr = (void *) PGSharedMemoryAttach((IpcMemoryKey) UsedShmemSegID, &shmid);
614 elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
615 (int) UsedShmemSegID, UsedShmemSegAddr);
616 if (hdr != origUsedShmemSegAddr)
617 elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
618 hdr, origUsedShmemSegAddr);
619 dsm_set_control_handle(((PGShmemHeader *) hdr)->dsm_control);
621 UsedShmemSegAddr = hdr; /* probably redundant */
625 * PGSharedMemoryNoReAttach
627 * This is called during startup of a postmaster child process when we choose
628 * *not* to re-attach to the existing shared memory segment. We must clean up
629 * to leave things in the appropriate state. This is not used in the non
630 * EXEC_BACKEND case, either.
632 * The child process startup logic might or might not call PGSharedMemoryDetach
633 * after this; make sure that it will be a no-op if called.
635 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
636 * routine. The caller must have already restored them to the postmaster's
640 PGSharedMemoryNoReAttach(void)
642 Assert(UsedShmemSegAddr != NULL);
643 Assert(IsUnderPostmaster);
646 /* cygipc (currently) appears to not detach on exec. */
647 PGSharedMemoryDetach();
650 /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
651 UsedShmemSegAddr = NULL;
652 /* And the same for UsedShmemSegID. */
656 #endif /* EXEC_BACKEND */
659 * PGSharedMemoryDetach
661 * Detach from the shared memory segment, if still attached. This is not
662 * intended to be called explicitly by the process that originally created the
663 * segment (it will have an on_shmem_exit callback registered to do that).
664 * Rather, this is for subprocesses that have inherited an attachment and want
667 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
671 PGSharedMemoryDetach(void)
673 if (UsedShmemSegAddr != NULL)
675 if ((shmdt(UsedShmemSegAddr) < 0)
676 #if defined(EXEC_BACKEND) && defined(__CYGWIN__)
677 /* Work-around for cygipc exec bug */
681 elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
682 UsedShmemSegAddr = NULL;
685 /* Release anonymous shared memory block, if any. */
686 if (AnonymousShmem != NULL
687 && munmap(AnonymousShmem, AnonymousShmemSize) < 0)
688 elog(LOG, "munmap(%p) failed: %m", AnonymousShmem);
693 * Attach to shared memory and make sure it has a Postgres header
695 * Returns attach address if OK, else NULL
697 static PGShmemHeader *
698 PGSharedMemoryAttach(IpcMemoryKey key, IpcMemoryId *shmid)
702 if ((*shmid = shmget(key, sizeof(PGShmemHeader), 0)) < 0)
705 hdr = (PGShmemHeader *) shmat(*shmid, UsedShmemSegAddr, PG_SHMAT_FLAGS);
707 if (hdr == (PGShmemHeader *) -1)
708 return NULL; /* failed: must be some other app's */
710 if (hdr->magic != PGShmemMagic)
713 return NULL; /* segment belongs to a non-Postgres app */