1 /*-------------------------------------------------------------------------
4 * Implement shared memory using SysV facilities
6 * These routines represent a fairly thin layer on top of SysV shared
7 * memory functionality.
9 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
13 * src/backend/port/sysv_shmem.c
15 *-------------------------------------------------------------------------
31 #include "miscadmin.h"
32 #include "storage/ipc.h"
33 #include "storage/pg_shmem.h"
36 typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
37 typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
39 #define IPCProtection (0600) /* access/modify by user only */
41 #ifdef SHM_SHARE_MMU /* use intimate shared memory on Solaris */
42 #define PG_SHMAT_FLAGS SHM_SHARE_MMU
44 #define PG_SHMAT_FLAGS 0
47 /* Linux prefers MAP_ANONYMOUS, but the flag is called MAP_ANON on other systems. */
49 #define MAP_ANONYMOUS MAP_ANON
52 /* BSD-derived systems have MAP_HASSEMAPHORE, but it's not present (or needed) on Linux. */
53 #ifndef MAP_HASSEMAPHORE
54 #define MAP_HASSEMAPHORE 0
57 #define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE)
59 /* Some really old systems don't define MAP_FAILED. */
61 #define MAP_FAILED ((void *) -1)
65 unsigned long UsedShmemSegID = 0;
66 void *UsedShmemSegAddr = NULL;
67 static Size AnonymousShmemSize;
68 static PGShmemHeader *AnonymousShmem;
70 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
71 static void IpcMemoryDetach(int status, Datum shmaddr);
72 static void IpcMemoryDelete(int status, Datum shmId);
73 static PGShmemHeader *PGSharedMemoryAttach(IpcMemoryKey key,
78 * InternalIpcMemoryCreate(memKey, size)
80 * Attempt to create a new shared memory segment with the specified key.
81 * Will fail (return NULL) if such a segment already exists. If successful,
82 * attach the segment to the current process and return its attached address.
83 * On success, callbacks are registered with on_shmem_exit to detach and
84 * delete the segment when on_shmem_exit is called.
86 * If we fail with a failure code other than collision-with-existing-segment,
87 * print out an error and abort. Other types of errors are not recoverable.
90 InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
95 shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
100 * Fail quietly if error indicates a collision with existing segment.
101 * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
102 * we could get a permission violation instead? Also, EIDRM might
103 * occur if an old seg is slated for destruction but not gone yet.
105 if (errno == EEXIST || errno == EACCES
113 * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
114 * there is an existing segment but it's smaller than "size" (this is
115 * a result of poorly-thought-out ordering of error tests). To
116 * distinguish between collision and invalid size in such cases, we
117 * make a second try with size = 0. These kernels do not test size
118 * against SHMMIN in the preexisting-segment case, so we will not get
119 * EINVAL a second time if there is such a segment.
123 int save_errno = errno;
125 shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
129 /* As above, fail quietly if we verify a collision */
130 if (errno == EEXIST || errno == EACCES
136 /* Otherwise, fall through to report the original error */
141 * On most platforms we cannot get here because SHMMIN is
142 * greater than zero. However, if we do succeed in creating a
143 * zero-size segment, free it and then fall through to report
144 * the original error.
146 if (shmctl(shmid, IPC_RMID, NULL) < 0)
147 elog(LOG, "shmctl(%d, %d, 0) failed: %m",
148 (int) shmid, IPC_RMID);
155 * Else complain and abort.
157 * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
158 * is violated. SHMALL violation might be reported as either ENOMEM
159 * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
160 * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
161 * not-enough-RAM is ENOMEM.
164 (errmsg("could not create shared memory segment: %m"),
165 errdetail("Failed system call was shmget(key=%lu, size=%lu, 0%o).",
166 (unsigned long) memKey, (unsigned long) size,
167 IPC_CREAT | IPC_EXCL | IPCProtection),
169 errhint("This error usually means that PostgreSQL's request for a shared memory "
170 "segment exceeded your kernel's SHMMAX parameter. You can either "
171 "reduce the request size or reconfigure the kernel with larger SHMMAX. "
172 "To reduce the request size (currently %lu bytes), reduce "
173 "PostgreSQL's shared memory usage, perhaps by reducing shared_buffers "
174 "or max_connections.\n"
175 "If the request size is already small, it's possible that it is less than "
176 "your kernel's SHMMIN parameter, in which case raising the request size or "
177 "reconfiguring SHMMIN is called for.\n"
178 "The PostgreSQL documentation contains more information about shared "
179 "memory configuration.",
180 (unsigned long) size) : 0,
182 errhint("This error usually means that PostgreSQL's request for a shared "
183 "memory segment exceeded available memory or swap space, "
184 "or exceeded your kernel's SHMALL parameter. You can either "
185 "reduce the request size or reconfigure the kernel with larger SHMALL. "
186 "To reduce the request size (currently %lu bytes), reduce "
187 "PostgreSQL's shared memory usage, perhaps by reducing shared_buffers "
188 "or max_connections.\n"
189 "The PostgreSQL documentation contains more information about shared "
190 "memory configuration.",
191 (unsigned long) size) : 0,
193 errhint("This error does *not* mean that you have run out of disk space. "
194 "It occurs either if all available shared memory IDs have been taken, "
195 "in which case you need to raise the SHMMNI parameter in your kernel, "
196 "or because the system's overall limit for shared memory has been "
197 "reached. If you cannot increase the shared memory limit, "
198 "reduce PostgreSQL's shared memory request (currently %lu bytes), "
199 "perhaps by reducing shared_buffers or max_connections.\n"
200 "The PostgreSQL documentation contains more information about shared "
201 "memory configuration.",
202 (unsigned long) size) : 0));
205 /* Register on-exit routine to delete the new segment */
206 on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
208 /* OK, should be able to attach to the segment */
209 memAddress = shmat(shmid, NULL, PG_SHMAT_FLAGS);
211 if (memAddress == (void *) -1)
212 elog(FATAL, "shmat(id=%d) failed: %m", shmid);
214 /* Register on-exit routine to detach new segment before deleting */
215 on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
218 * Store shmem key and ID in data directory lockfile. Format to try to
219 * keep it the same length always (trailing junk in the lockfile won't
220 * hurt, but might confuse humans).
225 sprintf(line, "%9lu %9lu",
226 (unsigned long) memKey, (unsigned long) shmid);
227 AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
233 /****************************************************************************/
234 /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
235 /* from process' address space */
236 /* (called as an on_shmem_exit callback, hence funny argument list) */
237 /****************************************************************************/
239 IpcMemoryDetach(int status, Datum shmaddr)
241 /* Detach System V shared memory block. */
242 if (shmdt(DatumGetPointer(shmaddr)) < 0)
243 elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
244 /* Release anonymous shared memory block, if any. */
245 if (AnonymousShmem != NULL
246 && munmap(AnonymousShmem, AnonymousShmemSize) < 0)
247 elog(LOG, "munmap(%p) failed: %m", AnonymousShmem);
250 /****************************************************************************/
251 /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
252 /* (called as an on_shmem_exit callback, hence funny argument list) */
253 /****************************************************************************/
255 IpcMemoryDelete(int status, Datum shmId)
257 if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
258 elog(LOG, "shmctl(%d, %d, 0) failed: %m",
259 DatumGetInt32(shmId), IPC_RMID);
263 * PGSharedMemoryIsInUse
265 * Is a previously-existing shmem segment still existing and in use?
267 * The point of this exercise is to detect the case where a prior postmaster
268 * crashed, but it left child backends that are still running. Therefore
269 * we only care about shmem segments that are associated with the intended
270 * DataDir. This is an important consideration since accidental matches of
271 * shmem segment IDs are reasonably common.
274 PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
276 IpcMemoryId shmId = (IpcMemoryId) id2;
277 struct shmid_ds shmStat;
282 * We detect whether a shared memory segment is in use by seeing whether
283 * it (a) exists and (b) has any processes attached to it.
285 if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
288 * EINVAL actually has multiple possible causes documented in the
289 * shmctl man page, but we assume it must mean the segment no longer exists.
296 * EACCES implies that the segment belongs to some other userid, which
297 * means it is not a Postgres shmem segment (or at least, not one that
298 * is relevant to our data directory).
304 * Some Linux kernel versions (in fact, all of them as of July 2007)
305 * sometimes return EIDRM when EINVAL is correct. The Linux kernel
306 * actually does not have any internal state that would justify
307 * returning EIDRM, so we can get away with assuming that EIDRM is
308 * equivalent to EINVAL on that platform.
310 #ifdef HAVE_LINUX_EIDRM_BUG
316 * Otherwise, we had better assume that the segment is in use. The
317 * only likely case is EIDRM, which implies that the segment has been
318 * IPC_RMID'd but there are still processes attached to it.
323 /* If it has no attached processes, it's not in use */
324 if (shmStat.shm_nattch == 0)
328 * Try to attach to the segment and see if it matches our data directory.
329 * This avoids shmid-conflict problems on machines that are running
330 * several postmasters under the same userid.
332 if (stat(DataDir, &statbuf) < 0)
333 return true; /* if can't stat, be conservative */
335 hdr = (PGShmemHeader *) shmat(shmId, NULL, PG_SHMAT_FLAGS);
337 if (hdr == (PGShmemHeader *) -1)
338 return true; /* if can't attach, be conservative */
340 if (hdr->magic != PGShmemMagic ||
341 hdr->device != statbuf.st_dev ||
342 hdr->inode != statbuf.st_ino)
345 * It's either not a Postgres segment, or not one for my data
346 * directory. In either case it poses no threat.
352 /* Trouble --- looks a lot like there's still live backends */
360 * PGSharedMemoryCreate
362 * Create a shared memory segment of the given size and initialize its
363 * standard header. Also, register an on_shmem_exit callback to release the storage.
366 * Dead Postgres segments are recycled if found, but we do not fail upon
367 * collision with non-Postgres shmem segments. The idea here is to detect and
368 * re-use keys that may have been assigned by a crashed postmaster or backend.
370 * makePrivate means to always create a new segment, rather than attach to
371 * or recycle any existing segment.
373 * The port number is passed for possible use as a key (for SysV, we use
374 * it to generate the starting shmem key). In a standalone backend,
375 * zero will be passed.
378 PGSharedMemoryCreate(Size size, bool makePrivate, int port)
380 IpcMemoryKey NextShmemSegID;
385 Size allocsize = size;
387 /* Room for a header? */
388 Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
391 * As of PostgreSQL 9.3, we normally allocate only a very small amount of
392 * System V shared memory, and only for the purposes of providing an
393 * interlock to protect the data directory. The real shared memory block
394 * is allocated using mmap(). This works around the problem that many
395 * systems have very low limits on the amount of System V shared memory
396 * that can be allocated. Even a limit of a few megabytes will be enough
397 * to run many copies of PostgreSQL without needing to adjust system settings.
400 * However, we disable this logic in the EXEC_BACKEND case, and fall back
401 * to the old method of allocating the entire segment using System V shared
402 * memory, because there's no way to attach an mmap'd segment to a process
403 * after exec(). Since EXEC_BACKEND is intended only for developer use,
404 * this shouldn't be a big problem.
408 long pagesize = sysconf(_SC_PAGE_SIZE);
411 * Ensure request size is a multiple of pagesize.
413 * pagesize will, for practical purposes, always be a power of two.
414 * But just in case it isn't, we do it this way instead of using a bit mask.
417 if (pagesize > 0 && size % pagesize != 0)
418 size += pagesize - (size % pagesize);
421 * We assume that no one will attempt to run PostgreSQL 9.3 or later
422 * on systems that are ancient enough that anonymous shared memory is
423 * not supported, such as pre-2.4 versions of Linux. If that turns out
424 * to be false, we might need to add a run-time test here and do this
425 * only if the running kernel supports it.
427 AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, PG_MMAP_FLAGS,
429 if (AnonymousShmem == MAP_FAILED)
431 (errmsg("could not map anonymous shared memory: %m"),
433 errhint("This error usually means that PostgreSQL's request "
434 "for a shared memory segment exceeded available memory "
435 "or swap space. To reduce the request size (currently "
436 "%lu bytes), reduce PostgreSQL's shared memory usage, "
437 "perhaps by reducing shared_buffers or "
439 (unsigned long) size) : 0));
440 AnonymousShmemSize = size;
442 /* Now we need only allocate a minimal-sized SysV shmem block. */
443 allocsize = sizeof(PGShmemHeader);
447 /* Make sure PGSharedMemoryAttach doesn't fail without need */
448 UsedShmemSegAddr = NULL;
450 /* Loop till we find a free IPC key */
451 NextShmemSegID = port * 1000;
453 for (NextShmemSegID++;; NextShmemSegID++)
455 /* Try to create new segment */
456 memAddress = InternalIpcMemoryCreate(NextShmemSegID, allocsize);
458 break; /* successful create and attach */
460 /* Check shared memory and possibly remove and recreate */
462 if (makePrivate) /* a standalone backend shouldn't do this */
465 if ((memAddress = PGSharedMemoryAttach(NextShmemSegID, &shmid)) == NULL)
466 continue; /* can't attach, not one of mine */
469 * If I am not the creator and it belongs to an extant process, continue.
472 hdr = (PGShmemHeader *) memAddress;
473 if (hdr->creatorPID != getpid())
475 if (kill(hdr->creatorPID, 0) == 0 || errno != ESRCH)
478 continue; /* segment belongs to a live process */
483 * The segment appears to be from a dead Postgres process, or from a
484 * previous cycle of life in this same process. Zap it, if possible.
485 * This probably shouldn't fail, but if it does, assume the segment
486 * belongs to someone else after all, and continue quietly.
489 if (shmctl(shmid, IPC_RMID, NULL) < 0)
493 * Now try again to create the segment.
495 memAddress = InternalIpcMemoryCreate(NextShmemSegID, allocsize);
497 break; /* successful create and attach */
500 * Can only get here if some other process managed to create the same
501 * shmem key before we did. Let him have that one, loop around to try next key.
507 * OK, we created a new segment. Mark it as created by this process. The
508 * order of assignments here is critical so that another Postgres process
509 * can't see the header as valid but belonging to an invalid PID!
511 hdr = (PGShmemHeader *) memAddress;
512 hdr->creatorPID = getpid();
513 hdr->magic = PGShmemMagic;
515 /* Fill in the data directory ID info, too */
516 if (stat(DataDir, &statbuf) < 0)
518 (errcode_for_file_access(),
519 errmsg("could not stat data directory \"%s\": %m",
521 hdr->device = statbuf.st_dev;
522 hdr->inode = statbuf.st_ino;
525 * Initialize space allocation status for segment.
527 hdr->totalsize = size;
528 hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
530 /* Save info for possible future use */
531 UsedShmemSegAddr = memAddress;
532 UsedShmemSegID = (unsigned long) NextShmemSegID;
535 * If AnonymousShmem is NULL here, then we're not using anonymous shared
536 * memory, and should return a pointer to the System V shared memory block.
537 * Otherwise, the System V shared memory block is only a shim, and we must
538 * return a pointer to the real block.
540 if (AnonymousShmem == NULL)
542 memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
543 return AnonymousShmem;
550 * PGSharedMemoryReAttach
552 * Re-attach to an already existing shared memory segment. In the non
553 * EXEC_BACKEND case this is not used, because postmaster children inherit
554 * the shared memory segment attachment via fork().
556 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
557 * routine. The caller must have already restored them to the postmaster's values.
561 PGSharedMemoryReAttach(void)
565 void *origUsedShmemSegAddr = UsedShmemSegAddr;
567 Assert(UsedShmemSegAddr != NULL);
568 Assert(IsUnderPostmaster);
571 /* cygipc (currently) appears to not detach on exec. */
572 PGSharedMemoryDetach();
573 UsedShmemSegAddr = origUsedShmemSegAddr;
576 elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
577 hdr = (void *) PGSharedMemoryAttach((IpcMemoryKey) UsedShmemSegID, &shmid);
579 elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
580 (int) UsedShmemSegID, UsedShmemSegAddr);
581 if (hdr != origUsedShmemSegAddr)
582 elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
583 hdr, origUsedShmemSegAddr);
585 UsedShmemSegAddr = hdr; /* probably redundant */
587 #endif /* EXEC_BACKEND */
590 * PGSharedMemoryDetach
592 * Detach from the shared memory segment, if still attached. This is not
593 * intended for use by the process that originally created the segment
594 * (it will have an on_shmem_exit callback registered to do that). Rather,
595 * this is for subprocesses that have inherited an attachment and want to detach from it.
599 PGSharedMemoryDetach(void)
601 if (UsedShmemSegAddr != NULL)
603 if ((shmdt(UsedShmemSegAddr) < 0)
604 #if defined(EXEC_BACKEND) && defined(__CYGWIN__)
605 /* Work-around for cygipc exec bug */
609 elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
610 UsedShmemSegAddr = NULL;
613 /* Release anonymous shared memory block, if any. */
614 if (AnonymousShmem != NULL
615 && munmap(AnonymousShmem, AnonymousShmemSize) < 0)
616 elog(LOG, "munmap(%p) failed: %m", AnonymousShmem);
621 * Attach to shared memory and make sure it has a Postgres header
623 * Returns attach address if OK, else NULL
625 static PGShmemHeader *
626 PGSharedMemoryAttach(IpcMemoryKey key, IpcMemoryId *shmid)
630 if ((*shmid = shmget(key, sizeof(PGShmemHeader), 0)) < 0)
633 hdr = (PGShmemHeader *) shmat(*shmid, UsedShmemSegAddr, PG_SHMAT_FLAGS);
635 if (hdr == (PGShmemHeader *) -1)
636 return NULL; /* failed: must be some other app's */
638 if (hdr->magic != PGShmemMagic)
641 return NULL; /* segment belongs to a non-Postgres app */