1 /*-------------------------------------------------------------------------
4 * Implement shared memory using SysV facilities
6 * These routines represent a fairly thin layer on top of SysV shared
7 * memory functionality.
9 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
13 * src/backend/port/sysv_shmem.c
15 *-------------------------------------------------------------------------
31 #include "miscadmin.h"
32 #include "storage/ipc.h"
33 #include "storage/pg_shmem.h"
36 typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
37 typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
/* Permission bits OR-ed into the flags of the creating shmget() calls. */
39 #define IPCProtection (0600) /* access/modify by user only */
/* PG_SHMAT_FLAGS is the flag argument of every shmat() call in this file. */
41 #ifdef SHM_SHARE_MMU /* use intimate shared memory on Solaris */
42 #define PG_SHMAT_FLAGS SHM_SHARE_MMU
44 #define PG_SHMAT_FLAGS 0
47 /* Linux prefers MAP_ANONYMOUS, but the flag is called MAP_ANON on other systems. */
49 #define MAP_ANONYMOUS MAP_ANON
52 /* BSD-derived systems have MAP_HASSEMAPHORE, but it's not present (or needed) on Linux. */
53 #ifndef MAP_HASSEMAPHORE
54 #define MAP_HASSEMAPHORE 0
/* Flags for the anonymous mmap() issued by PGSharedMemoryCreate. */
57 #define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE)
59 /* Some really old systems don't define MAP_FAILED. */
61 #define MAP_FAILED ((void *) -1)
/* Key of the SysV segment in use: set by PGSharedMemoryCreate, read by PGSharedMemoryReAttach. */
65 unsigned long UsedShmemSegID = 0;
/* Attach address of that segment, NULL when detached; PGSharedMemoryAttach also passes it to shmat() as the requested address. */
66 void *UsedShmemSegAddr = NULL;
/* Length in bytes of the anonymous mapping, as given to mmap() and later to munmap(). */
67 static Size AnonymousShmemSize;
/* Base address of the anonymous mmap() region; stays NULL when no such region was created. */
68 static void *AnonymousShmem;
/* Local helpers; IpcMemoryDetach and IpcMemoryDelete are registered as on_shmem_exit callbacks. */
70 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
71 static void IpcMemoryDetach(int status, Datum shmaddr);
72 static void IpcMemoryDelete(int status, Datum shmId);
73 static PGShmemHeader *PGSharedMemoryAttach(IpcMemoryKey key,
78 * InternalIpcMemoryCreate(memKey, size)
80 * Attempt to create a new shared memory segment with the specified key.
81 * Will fail (return NULL) if such a segment already exists. If successful,
82 * attach the segment to the current process and return its attached address.
83 * On success, callbacks are registered with on_shmem_exit to detach and
84 * delete the segment when on_shmem_exit is called.
86 * If we fail with a failure code other than collision-with-existing-segment,
87 * print out an error and abort. Other types of errors are not recoverable.
/*
 * Review summary of the visible steps:
 *   1. shmget() is called with IPC_CREAT | IPC_EXCL | IPCProtection, so an
 *      existing key is never silently reused.
 *   2. The collision check tests errno for EEXIST and EACCES (the rest of
 *      the condition is not visible).  For the EINVAL ambiguity described
 *      further down, shmget() is retried with size 0 while the first errno
 *      is kept in save_errno, and a zero-size segment created by that probe
 *      is removed again with shmctl(IPC_RMID).
 *   3. Any other failure is reported through errmsg/errdetail/errhint.
 *   4. IpcMemoryDelete is registered with on_shmem_exit() before shmat() is
 *      attempted, and IpcMemoryDetach only after shmat() has succeeded; a
 *      failed shmat() is reported with elog(FATAL).
 *   5. The key and segment ID are formatted into line and stored with
 *      AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line).
 *
 * NOTE(review): the declaration of line is not visible here and sprintf()
 * is unbounded; the 9-wide conversions are minimum widths only.  Confirm
 * the buffer can hold two full-width unsigned long values plus separator
 * and terminator, or switch to snprintf().
 */
90 InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
95 shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
100 * Fail quietly if error indicates a collision with existing segment.
101 * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
102 * we could get a permission violation instead? Also, EIDRM might
103 * occur if an old seg is slated for destruction but not gone yet.
105 if (errno == EEXIST || errno == EACCES
113 * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
114 * there is an existing segment but it's smaller than "size" (this is
115 * a result of poorly-thought-out ordering of error tests). To
116 * distinguish between collision and invalid size in such cases, we
117 * make a second try with size = 0. These kernels do not test size
118 * against SHMMIN in the preexisting-segment case, so we will not get
119 * EINVAL a second time if there is such a segment.
123 int save_errno = errno;
125 shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
129 /* As above, fail quietly if we verify a collision */
130 if (errno == EEXIST || errno == EACCES
136 /* Otherwise, fall through to report the original error */
141 * On most platforms we cannot get here because SHMMIN is
142 * greater than zero. However, if we do succeed in creating a
143 * zero-size segment, free it and then fall through to report
144 * the original error.
146 if (shmctl(shmid, IPC_RMID, NULL) < 0)
147 elog(LOG, "shmctl(%d, %d, 0) failed: %m",
148 (int) shmid, IPC_RMID);
155 * Else complain and abort.
157 * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
158 * is violated. SHMALL violation might be reported as either ENOMEM
159 * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
160 * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
161 * not-enough-RAM is ENOMEM.
164 (errmsg("could not create shared memory segment: %m"),
165 errdetail("Failed system call was shmget(key=%lu, size=%lu, 0%o).",
166 (unsigned long) memKey, (unsigned long) size,
167 IPC_CREAT | IPC_EXCL | IPCProtection),
169 errhint("This error usually means that PostgreSQL's request for a shared memory "
170 "segment exceeded your kernel's SHMMAX parameter, or possibly that "
172 "your kernel's SHMMIN parameter.\n"
173 "The PostgreSQL documentation contains more information about shared "
174 "memory configuration.") : 0,
176 errhint("This error usually means that PostgreSQL's request for a shared "
177 "memory segment exceeded your kernel's SHMALL parameter. You may need "
178 "to reconfigure the kernel with larger SHMALL.\n"
179 "The PostgreSQL documentation contains more information about shared "
180 "memory configuration.") : 0,
182 errhint("This error does *not* mean that you have run out of disk space. "
183 "It occurs either if all available shared memory IDs have been taken, "
184 "in which case you need to raise the SHMMNI parameter in your kernel, "
185 "or because the system's overall limit for shared memory has been "
187 "The PostgreSQL documentation contains more information about shared "
188 "memory configuration.") : 0));
191 /* Register on-exit routine to delete the new segment */
192 on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
194 /* OK, should be able to attach to the segment */
195 memAddress = shmat(shmid, NULL, PG_SHMAT_FLAGS);
197 if (memAddress == (void *) -1)
198 elog(FATAL, "shmat(id=%d) failed: %m", shmid);
200 /* Register on-exit routine to detach new segment before deleting */
201 on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
204 * Store shmem key and ID in data directory lockfile. Format to try to
205 * keep it the same length always (trailing junk in the lockfile won't
206 * hurt, but might confuse humans).
211 sprintf(line, "%9lu %9lu",
212 (unsigned long) memKey, (unsigned long) shmid);
213 AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
219 /****************************************************************************/
220 /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
221 /* from process' address space */
222 /* (called as an on_shmem_exit callback, hence funny argument list) */
223 /****************************************************************************/
/*
 * Detaches the SysV segment whose address is carried in shmaddr, then
 * unmaps the anonymous region if AnonymousShmem is set.  status is not used
 * in the lines shown.  A failing shmdt() or munmap() is logged at LOG level
 * and otherwise ignored, so both steps are always attempted.
 */
225 IpcMemoryDetach(int status, Datum shmaddr)
227 /* Detach System V shared memory block. */
228 if (shmdt(DatumGetPointer(shmaddr)) < 0)
229 elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
230 /* Release anonymous shared memory block, if any. */
231 if (AnonymousShmem != NULL
232 && munmap(AnonymousShmem, AnonymousShmemSize) < 0)
233 elog(LOG, "munmap(%p) failed: %m", AnonymousShmem);
236 /****************************************************************************/
237 /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
238 /* (called as an on_shmem_exit callback, hence funny argument list) */
239 /****************************************************************************/
/*
 * Removes the segment whose ID is carried in shmId via shmctl(IPC_RMID).
 * status is not used in the lines shown.  A failure is logged at LOG level
 * and otherwise ignored.
 */
241 IpcMemoryDelete(int status, Datum shmId)
243 if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
244 elog(LOG, "shmctl(%d, %d, 0) failed: %m",
245 DatumGetInt32(shmId), IPC_RMID);
249 * PGSharedMemoryIsInUse
251 * Is a previously-existing shmem segment still existing and in use?
253 * The point of this exercise is to detect the case where a prior postmaster
254 * crashed, but it left child backends that are still running. Therefore
255 * we only care about shmem segments that are associated with the intended
256 * DataDir. This is an important consideration since accidental matches of
257 * shmem segment IDs are reasonably common.
/*
 * Review summary of the visible steps:
 *   - Only id2 is used; it is converted to an IpcMemoryId.  id1 is not
 *     referenced in the lines shown.
 *   - shmctl(IPC_STAT) is tried first; the errno cases are discussed in the
 *     existing comments (their return statements are not visible here).
 *   - A segment with shm_nattch == 0 is treated as not in use.
 *   - Otherwise DataDir is stat()ed and the segment is attached with
 *     shmat(); the header magic, device and inode are compared against
 *     PGShmemMagic and the st_dev/st_ino of DataDir.
 *   - The function errs towards true: it returns true when stat() or
 *     shmat() fails.
 */
260 PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
262 IpcMemoryId shmId = (IpcMemoryId) id2;
263 struct shmid_ds shmStat;
268 * We detect whether a shared memory segment is in use by seeing whether
269 * it (a) exists and (b) has any processes attached to it.
271 if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
274 * EINVAL actually has multiple possible causes documented in the
275 * shmctl man page, but we assume it must mean the segment no longer
282 * EACCES implies that the segment belongs to some other userid, which
283 * means it is not a Postgres shmem segment (or at least, not one that
284 * is relevant to our data directory).
290 * Some Linux kernel versions (in fact, all of them as of July 2007)
291 * sometimes return EIDRM when EINVAL is correct. The Linux kernel
292 * actually does not have any internal state that would justify
293 * returning EIDRM, so we can get away with assuming that EIDRM is
294 * equivalent to EINVAL on that platform.
296 #ifdef HAVE_LINUX_EIDRM_BUG
302 * Otherwise, we had better assume that the segment is in use. The
303 * only likely case is EIDRM, which implies that the segment has been
304 * IPC_RMID'd but there are still processes attached to it.
309 /* If it has no attached processes, it's not in use */
310 if (shmStat.shm_nattch == 0)
314 * Try to attach to the segment and see if it matches our data directory.
315 * This avoids shmid-conflict problems on machines that are running
316 * several postmasters under the same userid.
318 if (stat(DataDir, &statbuf) < 0)
319 return true; /* if can't stat, be conservative */
321 hdr = (PGShmemHeader *) shmat(shmId, NULL, PG_SHMAT_FLAGS);
323 if (hdr == (PGShmemHeader *) -1)
324 return true; /* if can't attach, be conservative */
326 if (hdr->magic != PGShmemMagic ||
327 hdr->device != statbuf.st_dev ||
328 hdr->inode != statbuf.st_ino)
331 * It's either not a Postgres segment, or not one for my data
332 * directory. In either case it poses no threat.
338 /* Trouble --- looks a lot like there's still live backends */
346 * PGSharedMemoryCreate
348 * Create a shared memory segment of the given size and initialize its
349 * standard header. Also, register an on_shmem_exit callback to release
352 * Dead Postgres segments are recycled if found, but we do not fail upon
353 * collision with non-Postgres shmem segments. The idea here is to detect and
354 * re-use keys that may have been assigned by a crashed postmaster or backend.
356 * makePrivate means to always create a new segment, rather than attach to
357 * or recycle any existing segment.
359 * The port number is passed for possible use as a key (for SysV, we use
360 * it to generate the starting shmem key). In a standalone backend,
361 * zero will be passed.
/*
 * Review summary of the visible steps:
 *   - size must exceed MAXALIGN(sizeof(PGShmemHeader)); this is checked by
 *     Assert only.
 *   - size is rounded up to a multiple of sysconf(_SC_PAGE_SIZE) when that
 *     call returns a positive value.  An anonymous shared region of that
 *     size is then mmap()ed into AnonymousShmem, its length is saved in
 *     AnonymousShmemSize, and sysvsize shrinks to sizeof(PGShmemHeader).
 *   - UsedShmemSegAddr is cleared before the key loop because
 *     PGSharedMemoryAttach() passes it to shmat() as the requested address.
 *   - Keys are probed upward starting at port * 1000 + 1.  Each key is
 *     first offered to InternalIpcMemoryCreate().  On a collision, and only
 *     when makePrivate is false, the existing segment is attached with
 *     PGSharedMemoryAttach(); if its creatorPID is not this process, the
 *     PID is probed with kill(pid, 0) and the key is skipped unless the
 *     probe fails with ESRCH.  A segment that survives these checks is
 *     removed with shmctl(IPC_RMID) and creation is retried under the same
 *     key.
 *   - The new header receives creatorPID before magic, then the st_dev and
 *     st_ino of DataDir, totalsize (the rounded size, not sysvsize) and
 *     freeoffset.
 *   - UsedShmemSegAddr and UsedShmemSegID are recorded.  When AnonymousShmem
 *     is set, the header is copied to the start of the anonymous region and
 *     that address is returned instead of the SysV address.
 *
 * NOTE(review): port * 1000 is evaluated in int arithmetic; confirm that
 * callers never pass a port large enough to overflow.
 */
364 PGSharedMemoryCreate(Size size, bool makePrivate, int port)
366 IpcMemoryKey NextShmemSegID;
371 Size sysvsize = size;
373 /* Room for a header? */
374 Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
377 * As of PostgreSQL 9.3, we normally allocate only a very small amount of
378 * System V shared memory, and only for the purposes of providing an
379 * interlock to protect the data directory. The real shared memory block
380 * is allocated using mmap(). This works around the problem that many
381 * systems have very low limits on the amount of System V shared memory
382 * that can be allocated. Even a limit of a few megabytes will be enough
383 * to run many copies of PostgreSQL without needing to adjust system
386 * However, we disable this logic in the EXEC_BACKEND case, and fall back
387 * to the old method of allocating the entire segment using System V
388 * shared memory, because there's no way to attach an mmap'd segment to a
389 * process after exec(). Since EXEC_BACKEND is intended only for
390 * developer use, this shouldn't be a big problem.
394 long pagesize = sysconf(_SC_PAGE_SIZE);
397 * Ensure request size is a multiple of pagesize.
399 * pagesize will, for practical purposes, always be a power of two.
400 * But just in case it isn't, we do it this way instead of using
403 if (pagesize > 0 && size % pagesize != 0)
404 size += pagesize - (size % pagesize);
407 * We assume that no one will attempt to run PostgreSQL 9.3 or later
408 * on systems that are ancient enough that anonymous shared memory is
409 * not supported, such as pre-2.4 versions of Linux. If that turns
410 * out to be false, we might need to add a run-time test here and do
411 * this only if the running kernel supports it.
413 AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
415 if (AnonymousShmem == MAP_FAILED)
417 (errmsg("could not map anonymous shared memory: %m"),
419 errhint("This error usually means that PostgreSQL's request "
420 "for a shared memory segment exceeded available memory "
421 "or swap space. To reduce the request size (currently "
422 "%lu bytes), reduce PostgreSQL's shared memory usage, "
423 "perhaps by reducing shared_buffers or "
425 (unsigned long) size) : 0));
426 AnonymousShmemSize = size;
428 /* Now we need only allocate a minimal-sized SysV shmem block. */
429 sysvsize = sizeof(PGShmemHeader);
433 /* Make sure PGSharedMemoryAttach doesn't fail without need */
434 UsedShmemSegAddr = NULL;
436 /* Loop till we find a free IPC key */
437 NextShmemSegID = port * 1000;
439 for (NextShmemSegID++;; NextShmemSegID++)
441 /* Try to create new segment */
442 memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
444 break; /* successful create and attach */
446 /* Check shared memory and possibly remove and recreate */
448 if (makePrivate) /* a standalone backend shouldn't do this */
451 if ((memAddress = PGSharedMemoryAttach(NextShmemSegID, &shmid)) == NULL)
452 continue; /* can't attach, not one of mine */
455 * If I am not the creator and it belongs to an extant process,
458 hdr = (PGShmemHeader *) memAddress;
459 if (hdr->creatorPID != getpid())
461 if (kill(hdr->creatorPID, 0) == 0 || errno != ESRCH)
464 continue; /* segment belongs to a live process */
469 * The segment appears to be from a dead Postgres process, or from a
470 * previous cycle of life in this same process. Zap it, if possible.
471 * This probably shouldn't fail, but if it does, assume the segment
472 * belongs to someone else after all, and continue quietly.
475 if (shmctl(shmid, IPC_RMID, NULL) < 0)
479 * Now try again to create the segment.
481 memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
483 break; /* successful create and attach */
486 * Can only get here if some other process managed to create the same
487 * shmem key before we did. Let him have that one, loop around to try
493 * OK, we created a new segment. Mark it as created by this process. The
494 * order of assignments here is critical so that another Postgres process
495 * can't see the header as valid but belonging to an invalid PID!
497 hdr = (PGShmemHeader *) memAddress;
498 hdr->creatorPID = getpid();
499 hdr->magic = PGShmemMagic;
501 /* Fill in the data directory ID info, too */
502 if (stat(DataDir, &statbuf) < 0)
504 (errcode_for_file_access(),
505 errmsg("could not stat data directory \"%s\": %m",
507 hdr->device = statbuf.st_dev;
508 hdr->inode = statbuf.st_ino;
511 * Initialize space allocation status for segment.
513 hdr->totalsize = size;
514 hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
516 /* Save info for possible future use */
517 UsedShmemSegAddr = memAddress;
518 UsedShmemSegID = (unsigned long) NextShmemSegID;
521 * If AnonymousShmem is NULL here, then we're not using anonymous shared
522 * memory, and should return a pointer to the System V shared memory
523 * block. Otherwise, the System V shared memory block is only a shim, and
524 * we must return a pointer to the real block.
526 if (AnonymousShmem == NULL)
528 memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
529 return (PGShmemHeader *) AnonymousShmem;
535 * PGSharedMemoryReAttach
537 * Re-attach to an already existing shared memory segment. In the non
538 * EXEC_BACKEND case this is not used, because postmaster children inherit
539 * the shared memory segment attachment via fork().
541 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
542 * routine. The caller must have already restored them to the postmaster's
/*
 * Visible flow: remember UsedShmemSegAddr, assert that it is set and that
 * IsUnderPostmaster holds, then attach with PGSharedMemoryAttach() using
 * UsedShmemSegID as the key.  Because that helper hands UsedShmemSegAddr to
 * shmat(), the segment is requested at the remembered address; any other
 * resulting address is reported with elog(FATAL).  The
 * PGSharedMemoryDetach() call followed by restoring UsedShmemSegAddr is the
 * cygipc workaround: detaching clears the variable, so it has to be put
 * back before attaching.
 *
 * NOTE(review): UsedShmemSegID is unsigned long but is logged with %d after
 * an (int) cast, so a large key would be misprinted in the FATAL message --
 * confirm whether %lu is preferable.
 */
546 PGSharedMemoryReAttach(void)
550 void *origUsedShmemSegAddr = UsedShmemSegAddr;
552 Assert(UsedShmemSegAddr != NULL);
553 Assert(IsUnderPostmaster);
556 /* cygipc (currently) appears to not detach on exec. */
557 PGSharedMemoryDetach();
558 UsedShmemSegAddr = origUsedShmemSegAddr;
561 elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
562 hdr = (void *) PGSharedMemoryAttach((IpcMemoryKey) UsedShmemSegID, &shmid);
564 elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
565 (int) UsedShmemSegID, UsedShmemSegAddr);
566 if (hdr != origUsedShmemSegAddr)
567 elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
568 hdr, origUsedShmemSegAddr);
570 UsedShmemSegAddr = hdr; /* probably redundant */
572 #endif /* EXEC_BACKEND */
575 * PGSharedMemoryDetach
577 * Detach from the shared memory segment, if still attached. This is not
578 * intended for use by the process that originally created the segment
579 * (it will have an on_shmem_exit callback registered to do that). Rather,
580 * this is for subprocesses that have inherited an attachment and want to
/*
 * Visible flow: if UsedShmemSegAddr is set, shmdt() it and clear the
 * pointer; then munmap() the anonymous region if AnonymousShmem is set.
 * A failure of either call is only logged at LOG level.
 *
 * NOTE(review): unlike UsedShmemSegAddr, AnonymousShmem is not reset after
 * munmap() in the lines shown, so a second call (or IpcMemoryDetach running
 * later in the same process) would unmap the same range again -- confirm,
 * and consider clearing the pointer here.
 */
584 PGSharedMemoryDetach(void)
586 if (UsedShmemSegAddr != NULL)
588 if ((shmdt(UsedShmemSegAddr) < 0)
589 #if defined(EXEC_BACKEND) && defined(__CYGWIN__)
590 /* Work-around for cygipc exec bug */
594 elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
595 UsedShmemSegAddr = NULL;
598 /* Release anonymous shared memory block, if any. */
599 if (AnonymousShmem != NULL
600 && munmap(AnonymousShmem, AnonymousShmemSize) < 0)
601 elog(LOG, "munmap(%p) failed: %m", AnonymousShmem);
606 * Attach to shared memory and make sure it has a Postgres header
608 * Returns attach address if OK, else NULL
610 static PGShmemHeader *
611 PGSharedMemoryAttach(IpcMemoryKey key, IpcMemoryId *shmid)
615 if ((*shmid = shmget(key, sizeof(PGShmemHeader), 0)) < 0)
618 hdr = (PGShmemHeader *) shmat(*shmid, UsedShmemSegAddr, PG_SHMAT_FLAGS);
620 if (hdr == (PGShmemHeader *) -1)
621 return NULL; /* failed: must be some other app's */
623 if (hdr->magic != PGShmemMagic)
626 return NULL; /* segment belongs to a non-Postgres app */