From 61c21ddad03dda1f92a05a723cf449087d80ec08 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Sun, 23 Apr 2017 15:26:25 -0700
Subject: [PATCH] Remove select(2) backed latch implementation.

poll(2) is required by Single Unix Spec v2, the usual baseline for
postgres (leaving windows aside). There haven't been any buildfarm
animals without poll(2) for a long while, leaving the select(2)
implementation largely untested.

On windows, including mingw, poll() is not available, but we have a
special case implementation for windows anyway.

Author: Andres Freund
Discussion: https://postgr.es/m/20170420003611.7r2sdvehesdyiz2i@alap3.anarazel.de
---
 src/backend/storage/ipc/latch.c | 221 ++++----------------------------
 1 file changed, 28 insertions(+), 193 deletions(-)

diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 67162fdd4a..31aa5debc7 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -3,27 +3,24 @@
  * latch.c
  *	  Routines for inter-process latches
  *
- * The Unix implementation uses the so-called self-pipe trick to overcome
- * the race condition involved with select() and setting a global flag
- * in the signal handler. When a latch is set and the current process
- * is waiting for it, the signal handler wakes up the select() in
- * WaitLatch by writing a byte to a pipe. A signal by itself doesn't
- * interrupt select() on all platforms, and even on platforms where it
- * does, a signal that arrives just before the select() call does not
- * prevent the select() from entering sleep. An incoming byte on a pipe
- * however reliably interrupts the sleep, and causes select() to return
- * immediately even if the signal arrives before select() begins.
- *
- * (Actually, we prefer epoll_wait() over poll() over select() where
- * available, but the same comments apply.)
+ * The Unix implementation uses the so-called self-pipe trick to overcome the
+ * race condition involved with poll() (or epoll_wait() on linux) and setting
+ * a global flag in the signal handler. When a latch is set and the current
+ * process is waiting for it, the signal handler wakes up the poll() in
+ * WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
+ * poll() on all platforms, and even on platforms where it does, a signal that
+ * arrives just before the poll() call does not prevent poll() from entering
+ * sleep. An incoming byte on a pipe however reliably interrupts the sleep,
+ * and causes poll() to return immediately even if the signal arrives before
+ * poll() begins.
  *
  * When SetLatch is called from the same process that owns the latch,
  * SetLatch writes the byte directly to the pipe. If it's owned by another
  * process, SIGUSR1 is sent and the signal handler in the waiting process
  * writes the byte to the pipe on behalf of the signaling process.
  *
- * The Windows implementation uses Windows events that are inherited by
- * all postmaster child processes.
+ * The Windows implementation uses Windows events that are inherited by all
+ * postmaster child processes. There's no need for the self-pipe trick there.
  *
  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -39,7 +36,6 @@
 #include
 #include
 #include
-#include
 #ifdef HAVE_SYS_EPOLL_H
 #include <sys/epoll.h>
 #endif
@@ -49,9 +45,6 @@
 #ifdef HAVE_SYS_POLL_H
 #include <sys/poll.h>
 #endif
-#ifdef HAVE_SYS_SELECT_H
-#include <sys/select.h>
-#endif
 
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -69,14 +62,12 @@
  * define somewhere before this block.
  */
 #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
-	defined(WAIT_USE_SELECT) || defined(WAIT_USE_WIN32)
+	defined(WAIT_USE_WIN32)
 /* don't overwrite manual choice */
 #elif defined(HAVE_SYS_EPOLL_H)
 #define WAIT_USE_EPOLL
 #elif defined(HAVE_POLL)
 #define WAIT_USE_POLL
-#elif HAVE_SYS_SELECT_H
-#define WAIT_USE_SELECT
 #elif WIN32
 #define WAIT_USE_WIN32
 #else
@@ -162,8 +153,8 @@ InitializeLatchSupport(void)
 
 	/*
 	 * Set up the self-pipe that allows a signal handler to wake up the
-	 * select() in WaitLatch. Make the write-end non-blocking, so that
-	 * SetLatch won't block if the event has already been set many times
+	 * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
+	 * that SetLatch won't block if the event has already been set many times
 	 * filling the kernel buffer. Make the read-end non-blocking too, so that
 	 * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
 	 */
@@ -401,8 +392,9 @@ SetLatch(volatile Latch *latch)
 
 	/*
 	 * See if anyone's waiting for the latch. It can be the current process if
-	 * we're in a signal handler. We use the self-pipe to wake up the select()
-	 * in that case. If it's another process, send a signal.
+	 * we're in a signal handler. We use the self-pipe to wake up the
+	 * poll()/epoll_wait() in that case. If it's another process, send a
+	 * signal.
	 *
	 * Fetch owner_pid only once, in case the latch is concurrently getting
	 * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
@@ -666,8 +658,6 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
 	WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
 #elif defined(WAIT_USE_POLL)
 	WaitEventAdjustPoll(set, event);
-#elif defined(WAIT_USE_SELECT)
-	/* nothing to do */
 #elif defined(WAIT_USE_WIN32)
 	WaitEventAdjustWin32(set, event);
 #endif
@@ -724,8 +714,6 @@ ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
 	WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
 #elif defined(WAIT_USE_POLL)
 	WaitEventAdjustPoll(set, event);
-#elif defined(WAIT_USE_SELECT)
-	/* nothing to do */
 #elif defined(WAIT_USE_WIN32)
 	WaitEventAdjustWin32(set, event);
 #endif
@@ -1055,9 +1043,11 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
			 * because we don't expect the pipe to become readable or to have
			 * any errors either, treat those cases as postmaster death, too.
			 *
-			 * As explained in the WAIT_USE_SELECT implementation, select(2)
-			 * may spuriously return. Be paranoid about that here too, a
-			 * spurious WL_POSTMASTER_DEATH would be painful.
+			 * Be paranoid about a spurious event signalling the postmaster as
+			 * being dead. There have been reports about that happening with
+			 * older primitives (select(2) to be specific), and a spurious
+			 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
+			 * cost much.
			 */
			if (!PostmasterIsAlive())
			{
@@ -1171,9 +1161,11 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
			 * we don't expect the pipe to become readable or to have any
			 * errors either, treat those cases as postmaster death, too.
			 *
-			 * As explained in the WAIT_USE_SELECT implementation, select(2)
-			 * may spuriously return. Be paranoid about that here too, a
-			 * spurious WL_POSTMASTER_DEATH would be painful.
+			 * Be paranoid about a spurious event signalling the postmaster as
+			 * being dead. There have been reports about that happening with
+			 * older primitives (select(2) to be specific), and a spurious
+			 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
+			 * cost much.
			 */
			if (!PostmasterIsAlive())
			{
@@ -1214,163 +1206,6 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
 	return returned_events;
 }
 
-#elif defined(WAIT_USE_SELECT)
-
-/*
- * Wait using select(2).
- *
- * XXX: On at least older linux kernels select(), in violation of POSIX,
- * doesn't reliably return a socket as writable if closed - but we rely on
- * that. So far all the known cases of this problem are on platforms that also
- * provide a poll() implementation without that bug. If we find one where
- * that's not the case, we'll need to add a workaround.
- */
-static inline int
-WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
-					  WaitEvent *occurred_events, int nevents)
-{
-	int			returned_events = 0;
-	int			rc;
-	WaitEvent  *cur_event;
-	fd_set		input_mask;
-	fd_set		output_mask;
-	int			hifd;
-	struct timeval tv;
-	struct timeval *tvp = NULL;
-
-	FD_ZERO(&input_mask);
-	FD_ZERO(&output_mask);
-
-	/*
-	 * Prepare input/output masks. We do so every loop iteration as there's no
-	 * entirely portable way to copy fd_sets.
-	 */
-	for (cur_event = set->events;
-		 cur_event < (set->events + set->nevents);
-		 cur_event++)
-	{
-		if (cur_event->events == WL_LATCH_SET)
-			FD_SET(cur_event->fd, &input_mask);
-		else if (cur_event->events == WL_POSTMASTER_DEATH)
-			FD_SET(cur_event->fd, &input_mask);
-		else
-		{
-			Assert(cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
-			if (cur_event->events == WL_SOCKET_READABLE)
-				FD_SET(cur_event->fd, &input_mask);
-			else if (cur_event->events == WL_SOCKET_WRITEABLE)
-				FD_SET(cur_event->fd, &output_mask);
-		}
-
-		if (cur_event->fd > hifd)
-			hifd = cur_event->fd;
-	}
-
-	/* Sleep */
-	if (cur_timeout >= 0)
-	{
-		tv.tv_sec = cur_timeout / 1000L;
-		tv.tv_usec = (cur_timeout % 1000L) * 1000L;
-		tvp = &tv;
-	}
-	rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
-
-	/* Check return code */
-	if (rc < 0)
-	{
-		/* EINTR is okay, otherwise complain */
-		if (errno != EINTR)
-		{
-			waiting = false;
-			ereport(ERROR,
-					(errcode_for_socket_access(),
-					 errmsg("select() failed: %m")));
-		}
-		return 0;				/* retry */
-	}
-	else if (rc == 0)
-	{
-		/* timeout exceeded */
-		return -1;
-	}
-
-	/*
-	 * To associate events with select's masks, we have to check the status of
-	 * the file descriptors associated with an event; by looping through all
-	 * events.
-	 */
-	for (cur_event = set->events;
-		 cur_event < (set->events + set->nevents)
-		 && returned_events < nevents;
-		 cur_event++)
-	{
-		occurred_events->pos = cur_event->pos;
-		occurred_events->user_data = cur_event->user_data;
-		occurred_events->events = 0;
-
-		if (cur_event->events == WL_LATCH_SET &&
-			FD_ISSET(cur_event->fd, &input_mask))
-		{
-			/* There's data in the self-pipe, clear it. */
-			drainSelfPipe();
-
-			if (set->latch->is_set)
-			{
-				occurred_events->fd = PGINVALID_SOCKET;
-				occurred_events->events = WL_LATCH_SET;
-				occurred_events++;
-				returned_events++;
-			}
-		}
-		else if (cur_event->events == WL_POSTMASTER_DEATH &&
-				 FD_ISSET(cur_event->fd, &input_mask))
-		{
-			/*
-			 * According to the select(2) man page on Linux, select(2) may
-			 * spuriously return and report a file descriptor as readable,
-			 * when it's not; and presumably so can poll(2). It's not clear
-			 * that the relevant cases would ever apply to the postmaster
-			 * pipe, but since the consequences of falsely returning
-			 * WL_POSTMASTER_DEATH could be pretty unpleasant, we take the
-			 * trouble to positively verify EOF with PostmasterIsAlive().
-			 */
-			if (!PostmasterIsAlive())
-			{
-				occurred_events->fd = PGINVALID_SOCKET;
-				occurred_events->events = WL_POSTMASTER_DEATH;
-				occurred_events++;
-				returned_events++;
-			}
-		}
-		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
-		{
-			Assert(cur_event->fd != PGINVALID_SOCKET);
-
-			if ((cur_event->events & WL_SOCKET_READABLE) &&
-				FD_ISSET(cur_event->fd, &input_mask))
-			{
-				/* data available in socket, or EOF */
-				occurred_events->events |= WL_SOCKET_READABLE;
-			}
-
-			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
-				FD_ISSET(cur_event->fd, &output_mask))
-			{
-				/* socket is writeable, or EOF */
-				occurred_events->events |= WL_SOCKET_WRITEABLE;
-			}
-
-			if (occurred_events->events != 0)
-			{
-				occurred_events->fd = cur_event->fd;
-				occurred_events++;
-				returned_events++;
-			}
-		}
-	}
-	return returned_events;
-}
-
 #elif defined(WAIT_USE_WIN32)
 
 /*
-- 
2.40.0
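
Since the rewritten header comment is the heart of this change, here is a minimal, standalone C sketch of the self-pipe trick it describes, kept separate from the patch itself. This is not PostgreSQL code: the names (latch_is_set, selfpipe_readfd, drain_self_pipe and so on) are illustrative stand-ins, error handling is pared down, and SIGUSR1 is delivered by a forked child rather than by another backend. It demonstrates the race the comment talks about: the handler both sets the flag and writes a byte into a non-blocking pipe, so a signal that arrives after the flag check but before poll() starts sleeping still wakes the wait immediately, because the byte is already readable.

/*
 * self_pipe_demo.c - standalone sketch of the self-pipe trick described in
 * the latch.c header comment above (illustrative names, not PostgreSQL code).
 */
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static int	selfpipe_readfd = -1;
static int	selfpipe_writefd = -1;
static volatile sig_atomic_t latch_is_set = 0;

/* Signal handler: set the flag, then wake any sleeping poll() via the pipe. */
static void
handle_sigusr1(int signo)
{
	int			save_errno = errno;
	char		dummy = 0;

	latch_is_set = 1;
	/* The write end is non-blocking, so this can't hang if the pipe is full. */
	(void) write(selfpipe_writefd, &dummy, 1);
	errno = save_errno;
}

/* Read the pipe until it's empty; both ends are non-blocking. */
static void
drain_self_pipe(void)
{
	char		buf[16];

	while (read(selfpipe_readfd, buf, sizeof(buf)) > 0)
		;
}

int
main(void)
{
	int			pipefd[2];
	struct sigaction sa;
	pid_t		child;

	if (pipe(pipefd) < 0 ||
		fcntl(pipefd[0], F_SETFL, O_NONBLOCK) < 0 ||
		fcntl(pipefd[1], F_SETFL, O_NONBLOCK) < 0)
	{
		perror("pipe setup");
		return 1;
	}
	selfpipe_readfd = pipefd[0];
	selfpipe_writefd = pipefd[1];

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = handle_sigusr1;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);

	/* Stand-in for "another process sets the latch": signal us in a second. */
	child = fork();
	if (child == 0)
	{
		sleep(1);
		kill(getppid(), SIGUSR1);
		_exit(0);
	}

	/* Wait loop: sleep in poll() on the self-pipe, then re-check the flag. */
	while (!latch_is_set)
	{
		struct pollfd pfd;
		int			rc;

		pfd.fd = selfpipe_readfd;
		pfd.events = POLLIN;
		rc = poll(&pfd, 1, 10 * 1000);
		if (rc < 0 && errno != EINTR)
		{
			perror("poll");
			return 1;
		}
		if (rc > 0 && (pfd.revents & POLLIN))
			drain_self_pipe();
		/* Even if the byte arrived before poll() began, POLLIN is reported. */
	}

	printf("latch was set; wait woken without losing the signal\n");
	waitpid(child, NULL, 0);
	return 0;
}

Draining the pipe before re-checking the flag mirrors what latch.c does with drainSelfPipe(): without the drain, bytes left sitting in the pipe would cause spurious wakeups on later waits.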