]> granicus.if.org Git - postgresql/blob - src/backend/libpq/pqcomm.c
Use the standard spelling of the protocol argument to get/setsockopt.
[postgresql] / src / backend / libpq / pqcomm.c
1 /*-------------------------------------------------------------------------
2  *
3  * pqcomm.c
4  *        Communication functions between the Frontend and the Backend
5  *
6  * These routines handle the low-level details of communication between
7  * frontend and backend.  They just shove data across the communication
8  * channel, and are ignorant of the semantics of the data --- or would be,
9  * except for major brain damage in the design of the old COPY OUT protocol.
10  * Unfortunately, COPY OUT was designed to commandeer the communication
11  * channel (it just transfers data without wrapping it into messages).
12  * No other messages can be sent while COPY OUT is in progress; and if the
13  * copy is aborted by an ereport(ERROR), we need to close out the copy so that
14  * the frontend gets back into sync.  Therefore, these routines have to be
15  * aware of COPY OUT state.  (New COPY-OUT is message-based and does *not*
16  * set the DoingCopyOut flag.)
17  *
18  * NOTE: generally, it's a bad idea to emit outgoing messages directly with
19  * pq_putbytes(), especially if the message would require multiple calls
20  * to send.  Instead, use the routines in pqformat.c to construct the message
21  * in a buffer and then emit it in one call to pq_putmessage.  This ensures
22  * that the channel will not be clogged by an incomplete message if execution
23  * is aborted by ereport(ERROR) partway through the message.  The only
24  * non-libpq code that should call pq_putbytes directly is old-style COPY OUT.
25  *
26  * At one time, libpq was shared between frontend and backend, but now
27  * the backend's "backend/libpq" is quite separate from "interfaces/libpq".
28  * All that remains is similarities of names to trap the unwary...
29  *
30  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
31  * Portions Copyright (c) 1994, Regents of the University of California
32  *
33  *      $PostgreSQL: pgsql/src/backend/libpq/pqcomm.c,v 1.178 2005/07/30 20:28:20 tgl Exp $
34  *
35  *-------------------------------------------------------------------------
36  */
37
38 /*------------------------
39  * INTERFACE ROUTINES
40  *
41  * setup/teardown:
42  *              StreamServerPort        - Open postmaster's server port
43  *              StreamConnection        - Create new connection with client
44  *              StreamClose                     - Close a client/backend connection
45  *              TouchSocketFile         - Protect socket file against /tmp cleaners
46  *              pq_init                 - initialize libpq at backend startup
47  *              pq_comm_reset   - reset libpq during error recovery
48  *              pq_close                - shutdown libpq at backend exit
49  *
50  * low-level I/O:
51  *              pq_getbytes             - get a known number of bytes from connection
52  *              pq_getstring    - get a null terminated string from connection
53  *              pq_getmessage   - get a message with length word from connection
54  *              pq_getbyte              - get next byte from connection
55  *              pq_peekbyte             - peek at next byte from connection
56  *              pq_putbytes             - send bytes to connection (not flushed until pq_flush)
57  *              pq_flush                - flush pending output
58  *
59  * message-level I/O (and old-style-COPY-OUT cruft):
60  *              pq_putmessage   - send a normal message (suppressed in COPY OUT mode)
61  *              pq_startcopyout - inform libpq that a COPY OUT transfer is beginning
62  *              pq_endcopyout   - end a COPY OUT transfer
63  *
64  *------------------------
65  */
66 #include "postgres.h"
67
68 #include <signal.h>
69 #include <errno.h>
70 #include <fcntl.h>
71 #include <grp.h>
72 #include <unistd.h>
73 #include <sys/file.h>
74 #include <sys/socket.h>
75 #include <sys/stat.h>
76 #include <sys/time.h>
77 #include <netdb.h>
78 #include <netinet/in.h>
79 #ifdef HAVE_NETINET_TCP_H
80 #include <netinet/tcp.h>
81 #endif
82 #include <arpa/inet.h>
83 #ifdef HAVE_UTIME_H
84 #include <utime.h>
85 #endif
86
87 #include "libpq/libpq.h"
88 #include "miscadmin.h"
89 #include "storage/ipc.h"
90 #include "utils/guc.h"
91
92 /*
93  * Configuration options
94  */
95 int                     Unix_socket_permissions;
96 char       *Unix_socket_group;
97
98
99 /* Where the Unix socket file is */
100 static char sock_path[MAXPGPATH];
101
102
103 /*
104  * Buffers for low-level I/O
105  */
106
107 #define PQ_BUFFER_SIZE 8192
108
109 static unsigned char PqSendBuffer[PQ_BUFFER_SIZE];
110 static int      PqSendPointer;          /* Next index to store a byte in
111                                                                  * PqSendBuffer */
112
113 static unsigned char PqRecvBuffer[PQ_BUFFER_SIZE];
114 static int      PqRecvPointer;          /* Next index to read a byte from
115                                                                  * PqRecvBuffer */
116 static int      PqRecvLength;           /* End of data available in PqRecvBuffer */
117
118 /*
119  * Message status
120  */
121 static bool PqCommBusy;
122 static bool DoingCopyOut;
123
124
125 /* Internal functions */
126 static void pq_close(int code, Datum arg);
127 static int      internal_putbytes(const char *s, size_t len);
128 static int      internal_flush(void);
129 #ifdef HAVE_UNIX_SOCKETS
130 static int      Lock_AF_UNIX(unsigned short portNumber, char *unixSocketName);
131 static int      Setup_AF_UNIX(void);
132 #endif   /* HAVE_UNIX_SOCKETS */
133
134
135 /* --------------------------------
136  *              pq_init - initialize libpq at backend startup
137  * --------------------------------
138  */
139 void
140 pq_init(void)
141 {
142         PqSendPointer = PqRecvPointer = PqRecvLength = 0;
143         PqCommBusy = false;
144         DoingCopyOut = false;
145         on_proc_exit(pq_close, 0);
146 }
147
148 /* --------------------------------
149  *              pq_comm_reset - reset libpq during error recovery
150  *
151  * This is called from error recovery at the outer idle loop.  It's
152  * just to get us out of trouble if we somehow manage to elog() from
153  * inside a pqcomm.c routine (which ideally will never happen, but...)
154  * --------------------------------
155  */
156 void
157 pq_comm_reset(void)
158 {
159         /* Do not throw away pending data, but do reset the busy flag */
160         PqCommBusy = false;
161         /* We can abort any old-style COPY OUT, too */
162         pq_endcopyout(true);
163 }
164
165 /* --------------------------------
166  *              pq_close - shutdown libpq at backend exit
167  *
168  * Note: in a standalone backend MyProcPort will be null,
169  * don't crash during exit...
170  * --------------------------------
171  */
172 static void
173 pq_close(int code, Datum arg)
174 {
175         if (MyProcPort != NULL)
176         {
177                 /* Cleanly shut down SSL layer */
178                 secure_close(MyProcPort);
179
180                 /*
181                  * Formerly we did an explicit close() here, but it seems better
182                  * to leave the socket open until the process dies.  This allows
183                  * clients to perform a "synchronous close" if they care --- wait
184                  * till the transport layer reports connection closure, and you
185                  * can be sure the backend has exited.
186                  *
187                  * We do set sock to -1 to prevent any further I/O, though.
188                  */
189                 MyProcPort->sock = -1;
190         }
191 }
192
193
194
195 /*
196  * Streams -- wrapper around Unix socket system calls
197  *
198  *
199  *              Stream functions are used for vanilla TCP connection protocol.
200  */
201
202
/* StreamDoUnlink()
 * Exit callback that removes the Unix-domain socket file.
 *
 * Setup_AF_UNIX() registers this with on_proc_exit(); neither argument
 * is used.  sock_path must have been filled in by then.
 */
#ifdef HAVE_UNIX_SOCKETS
static void
StreamDoUnlink(int code, Datum arg)
{
        Assert(sock_path[0]);

        /* The result is deliberately ignored */
        (void) unlink(sock_path);
}
#endif   /* HAVE_UNIX_SOCKETS */
215
/*
 * StreamServerPort -- open a "listening" port to accept connections.
 *
 * The requested host name (or, for AF_UNIX, the socket file path built by
 * Lock_AF_UNIX) is resolved with getaddrinfo_all().  For every address
 * that comes back we try to create a socket, set its options, bind() it
 * and listen() on it.  Each socket that gets all the way through is stored
 * in the first ListenSocket[] entry that still holds -1.
 *
 * A failure on one address is logged and that address is skipped.  Two
 * conditions stop the scan of remaining addresses early: running out of
 * free ListenSocket[] entries, and a failure in Setup_AF_UNIX().
 *
 * RETURNS: STATUS_OK if at least one socket was added, else STATUS_ERROR
 */

int
StreamServerPort(int family, char *hostName, unsigned short portNumber,
                 char *unixSocketName,
                 int ListenSocket[], int MaxListen)
{
        struct addrinfo hint;
        struct addrinfo *addr_list = NULL;
        struct addrinfo *cur;
        char            port_str[32];
        char            family_buf[64];
        const char *family_name;
        char       *service;
        int             enable = 1;
        int             lookup_rc;
        int             slot = 0;
        int             nadded = 0;

        /* Describe what kind of addresses we want from the resolver */
        MemSet(&hint, 0, sizeof(hint));
        hint.ai_socktype = SOCK_STREAM;
        hint.ai_flags = AI_PASSIVE;
        hint.ai_family = family;

#ifdef HAVE_UNIX_SOCKETS
        if (family == AF_UNIX)
        {
                /* Lock_AF_UNIX also sets sock_path, which serves as the service */
                if (Lock_AF_UNIX(portNumber, unixSocketName) != STATUS_OK)
                        return STATUS_ERROR;
                service = sock_path;
        }
        else
#endif   /* HAVE_UNIX_SOCKETS */
        {
                snprintf(port_str, sizeof(port_str), "%d", portNumber);
                service = port_str;
        }

        lookup_rc = getaddrinfo_all(hostName, service, &hint, &addr_list);
        if (lookup_rc != 0 || addr_list == NULL)
        {
                if (hostName != NULL)
                {
                        ereport(LOG,
                                (errmsg("could not translate host name \"%s\", service \"%s\" to address: %s",
                                        hostName, service, gai_strerror(lookup_rc))));
                }
                else
                {
                        ereport(LOG,
                                (errmsg("could not translate service \"%s\" to address: %s",
                                        service, gai_strerror(lookup_rc))));
                }
                if (addr_list != NULL)
                        freeaddrinfo_all(hint.ai_family, addr_list);
                return STATUS_ERROR;
        }

        for (cur = addr_list; cur != NULL; cur = cur->ai_next)
        {
                int             sockfd;
                int             backlog;

                /*
                 * A unix domain socket is only set up when it was really asked
                 * for; the service/port is different in that case.
                 */
                if (IS_AF_UNIX(cur->ai_family) && !IS_AF_UNIX(family))
                        continue;

                /* Advance to the next unused entry, if there is one left */
                while (slot < MaxListen && ListenSocket[slot] != -1)
                        slot++;
                if (slot >= MaxListen)
                {
                        ereport(LOG,
                                (errmsg("could not bind to all requested addresses: MAXLISTEN (%d) exceeded",
                                        MaxListen)));
                        break;
                }

                /* Pick the family name used in any error message below */
                switch (cur->ai_family)
                {
                        case AF_INET:
                                family_name = _("IPv4");
                                break;
#ifdef HAVE_IPV6
                        case AF_INET6:
                                family_name = _("IPv6");
                                break;
#endif
#ifdef HAVE_UNIX_SOCKETS
                        case AF_UNIX:
                                family_name = _("Unix");
                                break;
#endif
                        default:
                                snprintf(family_buf, sizeof(family_buf),
                                         _("unrecognized address family %d"),
                                         cur->ai_family);
                                family_name = family_buf;
                                break;
                }

                sockfd = socket(cur->ai_family, SOCK_STREAM, 0);
                if (sockfd < 0)
                {
                        ereport(LOG,
                                (errcode_for_socket_access(),
                        /* translator: %s is IPv4, IPv6, or Unix */
                                 errmsg("could not create %s socket: %m",
                                        family_name)));
                        continue;
                }

                /* SO_REUSEADDR is requested for everything except Unix sockets */
                if (!IS_AF_UNIX(cur->ai_family) &&
                        setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR,
                                   (char *) &enable, sizeof(enable)) == -1)
                {
                        ereport(LOG,
                                (errcode_for_socket_access(),
                                 errmsg("setsockopt(SO_REUSEADDR) failed: %m")));
                        closesocket(sockfd);
                        continue;
                }

#ifdef IPV6_V6ONLY
                if (cur->ai_family == AF_INET6 &&
                        setsockopt(sockfd, IPPROTO_IPV6, IPV6_V6ONLY,
                                   (char *) &enable, sizeof(enable)) == -1)
                {
                        ereport(LOG,
                                (errcode_for_socket_access(),
                                 errmsg("setsockopt(IPV6_V6ONLY) failed: %m")));
                        closesocket(sockfd);
                        continue;
                }
#endif

                /*
                 * Note: the bind can fail on some OS's, like Linux older than
                 * 2.4.21-pre3, that lack the IPV6_V6ONLY socket option and map
                 * ipv4 addresses to ipv6.  All ipv4 connections then show up
                 * as ::ffff:ipv4.
                 */
                if (bind(sockfd, cur->ai_addr, cur->ai_addrlen) < 0)
                {
                        ereport(LOG,
                                (errcode_for_socket_access(),
                        /* translator: %s is IPv4, IPv6, or Unix */
                                 errmsg("could not bind %s socket: %m",
                                        family_name),
                                 (IS_AF_UNIX(cur->ai_family)) ?
                                 errhint("Is another postmaster already running on port %d?"
                                         " If not, remove socket file \"%s\" and retry.",
                                         (int) portNumber, sock_path) :
                                 errhint("Is another postmaster already running on port %d?"
                                         " If not, wait a few seconds and retry.",
                                         (int) portNumber)));
                        closesocket(sockfd);
                        continue;
                }

#ifdef HAVE_UNIX_SOCKETS
                /* Socket file ownership/permissions get fixed before listen() */
                if (cur->ai_family == AF_UNIX && Setup_AF_UNIX() != STATUS_OK)
                {
                        closesocket(sockfd);
                        break;
                }
#endif

                /*
                 * Choose the accept-queue length limit.  PG_SOMAXCONN merely
                 * clamps the request, for the benefit of platforms where an
                 * overly large value provokes a kernel error (are there any?).
                 */
                backlog = (MaxBackends * 2 > PG_SOMAXCONN) ?
                        PG_SOMAXCONN : MaxBackends * 2;

                if (listen(sockfd, backlog) < 0)
                {
                        ereport(LOG,
                                (errcode_for_socket_access(),
                        /* translator: %s is IPv4, IPv6, or Unix */
                                 errmsg("could not listen on %s socket: %m",
                                        family_name)));
                        closesocket(sockfd);
                        continue;
                }

                /* Success: remember the socket in the free entry found above */
                ListenSocket[slot] = sockfd;
                nadded++;
        }

        freeaddrinfo_all(hint.ai_family, addr_list);

        return (nadded > 0) ? STATUS_OK : STATUS_ERROR;
}
437
438
439 #ifdef HAVE_UNIX_SOCKETS
440
441 /*
442  * Lock_AF_UNIX -- configure unix socket file path
443  */
444 static int
445 Lock_AF_UNIX(unsigned short portNumber, char *unixSocketName)
446 {
447         UNIXSOCK_PATH(sock_path, portNumber, unixSocketName);
448
449         /*
450          * Grab an interlock file associated with the socket file.
451          */
452         CreateSocketLockFile(sock_path, true);
453
454         /*
455          * Once we have the interlock, we can safely delete any pre-existing
456          * socket file to avoid failure at bind() time.
457          */
458         unlink(sock_path);
459
460         return STATUS_OK;
461 }
462
463
464 /*
465  * Setup_AF_UNIX -- configure unix socket permissions
466  */
467 static int
468 Setup_AF_UNIX(void)
469 {
470         /* Arrange to unlink the socket file at exit */
471         on_proc_exit(StreamDoUnlink, 0);
472
473         /*
474          * Fix socket ownership/permission if requested.  Note we must do this
475          * before we listen() to avoid a window where unwanted connections
476          * could get accepted.
477          */
478         Assert(Unix_socket_group);
479         if (Unix_socket_group[0] != '\0')
480         {
481 #ifdef WIN32
482                 elog(WARNING, "configuration item unix_socket_group is not supported on this platform");
483 #else
484                 char       *endptr;
485                 unsigned long int val;
486                 gid_t           gid;
487
488                 val = strtoul(Unix_socket_group, &endptr, 10);
489                 if (*endptr == '\0')
490                 {                                               /* numeric group id */
491                         gid = val;
492                 }
493                 else
494                 {                                               /* convert group name to id */
495                         struct group *gr;
496
497                         gr = getgrnam(Unix_socket_group);
498                         if (!gr)
499                         {
500                                 ereport(LOG,
501                                                 (errmsg("group \"%s\" does not exist",
502                                                                 Unix_socket_group)));
503                                 return STATUS_ERROR;
504                         }
505                         gid = gr->gr_gid;
506                 }
507                 if (chown(sock_path, -1, gid) == -1)
508                 {
509                         ereport(LOG,
510                                         (errcode_for_file_access(),
511                                          errmsg("could not set group of file \"%s\": %m",
512                                                         sock_path)));
513                         return STATUS_ERROR;
514                 }
515 #endif
516         }
517
518         if (chmod(sock_path, Unix_socket_permissions) == -1)
519         {
520                 ereport(LOG,
521                                 (errcode_for_file_access(),
522                                  errmsg("could not set permissions of file \"%s\": %m",
523                                                 sock_path)));
524                 return STATUS_ERROR;
525         }
526         return STATUS_OK;
527 }
528 #endif   /* HAVE_UNIX_SOCKETS */
529
530
531 /*
532  * StreamConnection -- create a new connection with client using
533  *              server port.
534  *
535  * ASSUME: that this doesn't need to be non-blocking because
536  *              the Postmaster uses select() to tell when the server master
537  *              socket is ready for accept().
538  *
539  * RETURNS: STATUS_OK or STATUS_ERROR
540  */
541 int
542 StreamConnection(int server_fd, Port *port)
543 {
544         /* accept connection and fill in the client (remote) address */
545         port->raddr.salen = sizeof(port->raddr.addr);
546         if ((port->sock = accept(server_fd,
547                                                          (struct sockaddr *) & port->raddr.addr,
548                                                          &port->raddr.salen)) < 0)
549         {
550                 ereport(LOG,
551                                 (errcode_for_socket_access(),
552                                  errmsg("could not accept new connection: %m")));
553                 return STATUS_ERROR;
554         }
555
556 #ifdef SCO_ACCEPT_BUG
557
558         /*
559          * UnixWare 7+ and OpenServer 5.0.4 are known to have this bug, but it
560          * shouldn't hurt to catch it for all versions of those platforms.
561          */
562         if (port->raddr.addr.ss_family == 0)
563                 port->raddr.addr.ss_family = AF_UNIX;
564 #endif
565
566         /* fill in the server (local) address */
567         port->laddr.salen = sizeof(port->laddr.addr);
568         if (getsockname(port->sock,
569                                         (struct sockaddr *) & port->laddr.addr,
570                                         &port->laddr.salen) < 0)
571         {
572                 elog(LOG, "getsockname() failed: %m");
573                 return STATUS_ERROR;
574         }
575
576         /* select NODELAY and KEEPALIVE options if it's a TCP connection */
577         if (!IS_AF_UNIX(port->laddr.addr.ss_family))
578         {
579                 int                     on;
580
581 #ifdef  TCP_NODELAY
582                 on = 1;
583                 if (setsockopt(port->sock, IPPROTO_TCP, TCP_NODELAY,
584                                            (char *) &on, sizeof(on)) < 0)
585                 {
586                         elog(LOG, "setsockopt(TCP_NODELAY) failed: %m");
587                         return STATUS_ERROR;
588                 }
589 #endif
590                 on = 1;
591                 if (setsockopt(port->sock, SOL_SOCKET, SO_KEEPALIVE,
592                                            (char *) &on, sizeof(on)) < 0)
593                 {
594                         elog(LOG, "setsockopt(SO_KEEPALIVE) failed: %m");
595                         return STATUS_ERROR;
596                 }
597
598                 /* Set default keepalive parameters. This should also catch
599                  * misconfigurations (non-zero values when socket options aren't
600                  * supported)
601                  */
602                 if (pq_setkeepalivesidle(tcp_keepalives_idle, port) != STATUS_OK)
603                         return STATUS_ERROR;
604
605                 if (pq_setkeepalivesinterval(tcp_keepalives_interval, port) != STATUS_OK)
606                         return STATUS_ERROR;
607
608                 if (pq_setkeepalivescount(tcp_keepalives_count, port) != STATUS_OK)
609                         return STATUS_ERROR;
610         }
611
612         return STATUS_OK;
613 }
614
/*
 * StreamClose -- close a client/backend connection
 *
 * NOTE: this does NOT terminate a session.  Its job is only to release the
 * file descriptor in a process that should no longer hold the socket open
 * (for example, the postmaster calls it after handing the connection over
 * to a child process).  Somebody else is expected to still have the socket
 * open, so we merely close our descriptor and do NOT send anything to the
 * far end.
 */
void
StreamClose(int sock)
{
        /* Drop this process's descriptor, nothing more */
        closesocket(sock);
}
630
631 /*
632  * TouchSocketFile -- mark socket file as recently accessed
633  *
634  * This routine should be called every so often to ensure that the socket
635  * file has a recent mod date (ordinary operations on sockets usually won't
636  * change the mod date).  That saves it from being removed by
637  * overenthusiastic /tmp-directory-cleaner daemons.  (Another reason we should
638  * never have put the socket file in /tmp...)
639  */
640 void
641 TouchSocketFile(void)
642 {
643         /* Do nothing if we did not create a socket... */
644         if (sock_path[0] != '\0')
645         {
646                 /*
647                  * utime() is POSIX standard, utimes() is a common alternative. If
648                  * we have neither, there's no way to affect the mod or access
649                  * time of the socket :-(
650                  *
651                  * In either path, we ignore errors; there's no point in complaining.
652                  */
653 #ifdef HAVE_UTIME
654                 utime(sock_path, NULL);
655 #else                                                   /* !HAVE_UTIME */
656 #ifdef HAVE_UTIMES
657                 utimes(sock_path, NULL);
658 #endif   /* HAVE_UTIMES */
659 #endif   /* HAVE_UTIME */
660         }
661 }
662
663
664 /* --------------------------------
665  * Low-level I/O routines begin here.
666  *
667  * These routines communicate with a frontend client across a connection
668  * already established by the preceding routines.
669  * --------------------------------
670  */
671
672
673 /* --------------------------------
674  *              pq_recvbuf - load some bytes into the input buffer
675  *
676  *              returns 0 if OK, EOF if trouble
677  * --------------------------------
678  */
679 static int
680 pq_recvbuf(void)
681 {
682         if (PqRecvPointer > 0)
683         {
684                 if (PqRecvLength > PqRecvPointer)
685                 {
686                         /* still some unread data, left-justify it in the buffer */
687                         memmove(PqRecvBuffer, PqRecvBuffer + PqRecvPointer,
688                                         PqRecvLength - PqRecvPointer);
689                         PqRecvLength -= PqRecvPointer;
690                         PqRecvPointer = 0;
691                 }
692                 else
693                         PqRecvLength = PqRecvPointer = 0;
694         }
695
696         /* Can fill buffer from PqRecvLength and upwards */
697         for (;;)
698         {
699                 int                     r;
700
701                 r = secure_read(MyProcPort, PqRecvBuffer + PqRecvLength,
702                                                 PQ_BUFFER_SIZE - PqRecvLength);
703
704                 if (r < 0)
705                 {
706                         if (errno == EINTR)
707                                 continue;               /* Ok if interrupted */
708
709                         /*
710                          * Careful: an ereport() that tries to write to the client
711                          * would cause recursion to here, leading to stack overflow
712                          * and core dump!  This message must go *only* to the
713                          * postmaster log.
714                          */
715                         ereport(COMMERROR,
716                                         (errcode_for_socket_access(),
717                                          errmsg("could not receive data from client: %m")));
718                         return EOF;
719                 }
720                 if (r == 0)
721                 {
722                         /*
723                          * EOF detected.  We used to write a log message here, but
724                          * it's better to expect the ultimate caller to do that.
725                          */
726                         return EOF;
727                 }
728                 /* r contains number of bytes read, so just incr length */
729                 PqRecvLength += r;
730                 return 0;
731         }
732 }
733
734 /* --------------------------------
735  *              pq_getbyte      - get a single byte from connection, or return EOF
736  * --------------------------------
737  */
738 int
739 pq_getbyte(void)
740 {
741         while (PqRecvPointer >= PqRecvLength)
742         {
743                 if (pq_recvbuf())               /* If nothing in buffer, then recv some */
744                         return EOF;                     /* Failed to recv data */
745         }
746         return PqRecvBuffer[PqRecvPointer++];
747 }
748
749 /* --------------------------------
750  *              pq_peekbyte             - peek at next byte from connection
751  *
752  *       Same as pq_getbyte() except we don't advance the pointer.
753  * --------------------------------
754  */
755 int
756 pq_peekbyte(void)
757 {
758         while (PqRecvPointer >= PqRecvLength)
759         {
760                 if (pq_recvbuf())               /* If nothing in buffer, then recv some */
761                         return EOF;                     /* Failed to recv data */
762         }
763         return PqRecvBuffer[PqRecvPointer];
764 }
765
766 /* --------------------------------
767  *              pq_getbytes             - get a known number of bytes from connection
768  *
769  *              returns 0 if OK, EOF if trouble
770  * --------------------------------
771  */
772 int
773 pq_getbytes(char *s, size_t len)
774 {
775         size_t          amount;
776
777         while (len > 0)
778         {
779                 while (PqRecvPointer >= PqRecvLength)
780                 {
781                         if (pq_recvbuf())       /* If nothing in buffer, then recv some */
782                                 return EOF;             /* Failed to recv data */
783                 }
784                 amount = PqRecvLength - PqRecvPointer;
785                 if (amount > len)
786                         amount = len;
787                 memcpy(s, PqRecvBuffer + PqRecvPointer, amount);
788                 PqRecvPointer += amount;
789                 s += amount;
790                 len -= amount;
791         }
792         return 0;
793 }
794
795 /* --------------------------------
796  *              pq_discardbytes         - throw away a known number of bytes
797  *
798  *              same as pq_getbytes except we do not copy the data to anyplace.
799  *              this is used for resynchronizing after read errors.
800  *
801  *              returns 0 if OK, EOF if trouble
802  * --------------------------------
803  */
804 static int
805 pq_discardbytes(size_t len)
806 {
807         size_t          amount;
808
809         while (len > 0)
810         {
811                 while (PqRecvPointer >= PqRecvLength)
812                 {
813                         if (pq_recvbuf())       /* If nothing in buffer, then recv some */
814                                 return EOF;             /* Failed to recv data */
815                 }
816                 amount = PqRecvLength - PqRecvPointer;
817                 if (amount > len)
818                         amount = len;
819                 PqRecvPointer += amount;
820                 len -= amount;
821         }
822         return 0;
823 }
824
825 /* --------------------------------
826  *              pq_getstring    - get a null terminated string from connection
827  *
828  *              The return value is placed in an expansible StringInfo, which has
829  *              already been initialized by the caller.
830  *
831  *              This is used only for dealing with old-protocol clients.  The idea
832  *              is to produce a StringInfo that looks the same as we would get from
833  *              pq_getmessage() with a newer client; we will then process it with
834  *              pq_getmsgstring.  Therefore, no character set conversion is done here,
835  *              even though this is presumably useful only for text.
836  *
837  *              returns 0 if OK, EOF if trouble
838  * --------------------------------
839  */
840 int
841 pq_getstring(StringInfo s)
842 {
843         int                     i;
844
845         /* Reset string to empty */
846         s->len = 0;
847         s->data[0] = '\0';
848         s->cursor = 0;
849
850         /* Read until we get the terminating '\0' */
851         for (;;)
852         {
853                 while (PqRecvPointer >= PqRecvLength)
854                 {
855                         if (pq_recvbuf())       /* If nothing in buffer, then recv some */
856                                 return EOF;             /* Failed to recv data */
857                 }
858
859                 for (i = PqRecvPointer; i < PqRecvLength; i++)
860                 {
861                         if (PqRecvBuffer[i] == '\0')
862                         {
863                                 /* include the '\0' in the copy */
864                                 appendBinaryStringInfo(s, PqRecvBuffer + PqRecvPointer,
865                                                                            i - PqRecvPointer + 1);
866                                 PqRecvPointer = i + 1;  /* advance past \0 */
867                                 return 0;
868                         }
869                 }
870
871                 /* If we're here we haven't got the \0 in the buffer yet. */
872                 appendBinaryStringInfo(s, PqRecvBuffer + PqRecvPointer,
873                                                            PqRecvLength - PqRecvPointer);
874                 PqRecvPointer = PqRecvLength;
875         }
876 }
877
878
879 /* --------------------------------
880  *              pq_getmessage   - get a message with length word from connection
881  *
882  *              The return value is placed in an expansible StringInfo, which has
883  *              already been initialized by the caller.
884  *              Only the message body is placed in the StringInfo; the length word
885  *              is removed.  Also, s->cursor is initialized to zero for convenience
886  *              in scanning the message contents.
887  *
888  *              If maxlen is not zero, it is an upper limit on the length of the
889  *              message we are willing to accept.  We abort the connection (by
890  *              returning EOF) if client tries to send more than that.
891  *
892  *              returns 0 if OK, EOF if trouble
893  * --------------------------------
894  */
895 int
896 pq_getmessage(StringInfo s, int maxlen)
897 {
898         int32           len;
899
900         /* Reset message buffer to empty */
901         s->len = 0;
902         s->data[0] = '\0';
903         s->cursor = 0;
904
905         /* Read message length word */
906         if (pq_getbytes((char *) &len, 4) == EOF)
907         {
908                 ereport(COMMERROR,
909                                 (errcode(ERRCODE_PROTOCOL_VIOLATION),
910                                  errmsg("unexpected EOF within message length word")));
911                 return EOF;
912         }
913
914         len = ntohl(len);
915
916         if (len < 4 ||
917                 (maxlen > 0 && len > maxlen))
918         {
919                 ereport(COMMERROR,
920                                 (errcode(ERRCODE_PROTOCOL_VIOLATION),
921                                  errmsg("invalid message length")));
922                 return EOF;
923         }
924
925         len -= 4;                                       /* discount length itself */
926
927         if (len > 0)
928         {
929                 /*
930                  * Allocate space for message.  If we run out of room (ridiculously
931                  * large message), we will elog(ERROR), but we want to discard the
932                  * message body so as not to lose communication sync.
933                  */
934                 PG_TRY();
935                 {
936                         enlargeStringInfo(s, len);
937                 }
938                 PG_CATCH();
939                 {
940                         if (pq_discardbytes(len) == EOF)
941                                 ereport(COMMERROR,
942                                                 (errcode(ERRCODE_PROTOCOL_VIOLATION),
943                                                  errmsg("incomplete message from client")));
944                         PG_RE_THROW();
945                 }
946                 PG_END_TRY();
947
948                 /* And grab the message */
949                 if (pq_getbytes(s->data, len) == EOF)
950                 {
951                         ereport(COMMERROR,
952                                         (errcode(ERRCODE_PROTOCOL_VIOLATION),
953                                          errmsg("incomplete message from client")));
954                         return EOF;
955                 }
956                 s->len = len;
957                 /* Place a trailing null per StringInfo convention */
958                 s->data[len] = '\0';
959         }
960
961         return 0;
962 }
963
964
965 /* --------------------------------
966  *              pq_putbytes             - send bytes to connection (not flushed until pq_flush)
967  *
968  *              returns 0 if OK, EOF if trouble
969  * --------------------------------
970  */
971 int
972 pq_putbytes(const char *s, size_t len)
973 {
974         int                     res;
975
976         /* Should only be called by old-style COPY OUT */
977         Assert(DoingCopyOut);
978         /* No-op if reentrant call */
979         if (PqCommBusy)
980                 return 0;
981         PqCommBusy = true;
982         res = internal_putbytes(s, len);
983         PqCommBusy = false;
984         return res;
985 }
986
987 static int
988 internal_putbytes(const char *s, size_t len)
989 {
990         size_t          amount;
991
992         while (len > 0)
993         {
994                 /* If buffer is full, then flush it out */
995                 if (PqSendPointer >= PQ_BUFFER_SIZE)
996                         if (internal_flush())
997                                 return EOF;
998                 amount = PQ_BUFFER_SIZE - PqSendPointer;
999                 if (amount > len)
1000                         amount = len;
1001                 memcpy(PqSendBuffer + PqSendPointer, s, amount);
1002                 PqSendPointer += amount;
1003                 s += amount;
1004                 len -= amount;
1005         }
1006         return 0;
1007 }
1008
1009 /* --------------------------------
1010  *              pq_flush                - flush pending output
1011  *
1012  *              returns 0 if OK, EOF if trouble
1013  * --------------------------------
1014  */
1015 int
1016 pq_flush(void)
1017 {
1018         int                     res;
1019
1020         /* No-op if reentrant call */
1021         if (PqCommBusy)
1022                 return 0;
1023         PqCommBusy = true;
1024         res = internal_flush();
1025         PqCommBusy = false;
1026         return res;
1027 }
1028
1029 static int
1030 internal_flush(void)
1031 {
1032         static int      last_reported_send_errno = 0;
1033
1034         unsigned char *bufptr = PqSendBuffer;
1035         unsigned char *bufend = PqSendBuffer + PqSendPointer;
1036
1037         while (bufptr < bufend)
1038         {
1039                 int                     r;
1040
1041                 r = secure_write(MyProcPort, bufptr, bufend - bufptr);
1042
1043                 if (r <= 0)
1044                 {
1045                         if (errno == EINTR)
1046                                 continue;               /* Ok if we were interrupted */
1047
1048                         /*
1049                          * Careful: an ereport() that tries to write to the client
1050                          * would cause recursion to here, leading to stack overflow
1051                          * and core dump!  This message must go *only* to the
1052                          * postmaster log.
1053                          *
1054                          * If a client disconnects while we're in the midst of output, we
1055                          * might write quite a bit of data before we get to a safe
1056                          * query abort point.  So, suppress duplicate log messages.
1057                          */
1058                         if (errno != last_reported_send_errno)
1059                         {
1060                                 last_reported_send_errno = errno;
1061                                 ereport(COMMERROR,
1062                                                 (errcode_for_socket_access(),
1063                                                  errmsg("could not send data to client: %m")));
1064                         }
1065
1066                         /*
1067                          * We drop the buffered data anyway so that processing can
1068                          * continue, even though we'll probably quit soon.
1069                          */
1070                         PqSendPointer = 0;
1071                         return EOF;
1072                 }
1073
1074                 last_reported_send_errno = 0;   /* reset after any successful send */
1075                 bufptr += r;
1076         }
1077
1078         PqSendPointer = 0;
1079         return 0;
1080 }
1081
1082
1083 /* --------------------------------
1084  * Message-level I/O routines begin here.
1085  *
1086  * These routines understand about the old-style COPY OUT protocol.
1087  * --------------------------------
1088  */
1089
1090
1091 /* --------------------------------
1092  *              pq_putmessage   - send a normal message (suppressed in COPY OUT mode)
1093  *
1094  *              If msgtype is not '\0', it is a message type code to place before
1095  *              the message body.  If msgtype is '\0', then the message has no type
1096  *              code (this is only valid in pre-3.0 protocols).
1097  *
1098  *              len is the length of the message body data at *s.  In protocol 3.0
1099  *              and later, a message length word (equal to len+4 because it counts
1100  *              itself too) is inserted by this routine.
1101  *
1102  *              All normal messages are suppressed while old-style COPY OUT is in
1103  *              progress.  (In practice only a few notice messages might get emitted
1104  *              then; dropping them is annoying, but at least they will still appear
1105  *              in the postmaster log.)
1106  *
1107  *              We also suppress messages generated while pqcomm.c is busy.  This
1108  *              avoids any possibility of messages being inserted within other
1109  *              messages.  The only known trouble case arises if SIGQUIT occurs
1110  *              during a pqcomm.c routine --- quickdie() will try to send a warning
1111  *              message, and the most reasonable approach seems to be to drop it.
1112  *
1113  *              returns 0 if OK, EOF if trouble
1114  * --------------------------------
1115  */
1116 int
1117 pq_putmessage(char msgtype, const char *s, size_t len)
1118 {
1119         if (DoingCopyOut || PqCommBusy)
1120                 return 0;
1121         PqCommBusy = true;
1122         if (msgtype)
1123                 if (internal_putbytes(&msgtype, 1))
1124                         goto fail;
1125         if (PG_PROTOCOL_MAJOR(FrontendProtocol) >= 3)
1126         {
1127                 uint32          n32;
1128
1129                 n32 = htonl((uint32) (len + 4));
1130                 if (internal_putbytes((char *) &n32, 4))
1131                         goto fail;
1132         }
1133         if (internal_putbytes(s, len))
1134                 goto fail;
1135         PqCommBusy = false;
1136         return 0;
1137
1138 fail:
1139         PqCommBusy = false;
1140         return EOF;
1141 }
1142
1143 /* --------------------------------
1144  *              pq_startcopyout - inform libpq that an old-style COPY OUT transfer
1145  *                      is beginning
1146  * --------------------------------
1147  */
1148 void
1149 pq_startcopyout(void)
1150 {
1151         DoingCopyOut = true;
1152 }
1153
1154 /* --------------------------------
1155  *              pq_endcopyout   - end an old-style COPY OUT transfer
1156  *
1157  *              If errorAbort is indicated, we are aborting a COPY OUT due to an error,
1158  *              and must send a terminator line.  Since a partial data line might have
1159  *              been emitted, send a couple of newlines first (the first one could
1160  *              get absorbed by a backslash...)  Note that old-style COPY OUT does
1161  *              not allow binary transfers, so a textual terminator is always correct.
1162  * --------------------------------
1163  */
1164 void
1165 pq_endcopyout(bool errorAbort)
1166 {
1167         if (!DoingCopyOut)
1168                 return;
1169         if (errorAbort)
1170                 pq_putbytes("\n\n\\.\n", 5);
1171         /* in non-error case, copy.c will have emitted the terminator line */
1172         DoingCopyOut = false;
1173 }
1174
1175 int
1176 pq_getkeepalivesidle(Port *port)
1177 {
1178 #ifdef TCP_KEEPIDLE
1179         if (IS_AF_UNIX(port->laddr.addr.ss_family))
1180                 return 0;
1181
1182         if (port->keepalives_idle != 0)
1183                 return port->keepalives_idle;
1184
1185         if (port->default_keepalives_idle == 0)
1186         {
1187                 socklen_t size = sizeof(port->default_keepalives_idle);
1188                 if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPIDLE,
1189                                            (char *) &port->default_keepalives_idle, 
1190                                            &size) < 0)
1191                 {
1192                         elog(LOG, "getsockopt(TCP_KEEPIDLE) failed: %m");
1193                         return -1;
1194                 }
1195         }
1196
1197         return port->default_keepalives_idle;
1198 #else
1199         return 0;
1200 #endif
1201 }
1202    
1203 int
1204 pq_setkeepalivesidle(int idle, Port *port)
1205 {
1206         if (IS_AF_UNIX(port->laddr.addr.ss_family))
1207                 return STATUS_OK;
1208
1209 #ifdef TCP_KEEPIDLE
1210         if (idle == port->keepalives_idle)
1211                 return STATUS_OK;
1212
1213         if (port->default_keepalives_idle == 0)
1214         {
1215                 if (pq_getkeepalivesidle(port) < 0)
1216                         return STATUS_ERROR;
1217         }
1218                         
1219         if (idle == 0)
1220                 idle = port->default_keepalives_idle;
1221
1222         if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPIDLE,
1223                                    (char *) &idle, sizeof(idle)) < 0)
1224         {
1225                 elog(LOG, "setsockopt(TCP_KEEPIDLE) failed: %m");
1226                 return STATUS_ERROR;
1227         }
1228
1229         port->keepalives_idle = idle;
1230 #else
1231         if (idle != 0)
1232         {
1233                 elog(LOG, "setsockopt(TCP_KEEPIDLE) not supported");
1234                 return STATUS_ERROR;
1235         }
1236 #endif
1237
1238         return STATUS_OK;
1239 }
1240
1241 int
1242 pq_getkeepalivesinterval(Port *port)
1243 {
1244 #ifdef TCP_KEEPINTVL
1245         if (IS_AF_UNIX(port->laddr.addr.ss_family))
1246                 return 0;
1247
1248         if (port->keepalives_interval != 0)
1249                 return port->keepalives_interval;
1250
1251         if (port->default_keepalives_interval == 0)
1252         {
1253                 socklen_t size = sizeof(port->default_keepalives_interval);
1254                 if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPINTVL,
1255                                            (char *) &port->default_keepalives_interval, 
1256                                            &size) < 0)
1257                 {
1258                         elog(LOG, "getsockopt(TCP_KEEPINTVL) failed: %m");
1259                         return -1;
1260                 }
1261         }
1262
1263         return port->default_keepalives_interval;
1264 #else
1265         return 0;
1266 #endif
1267 }
1268    
1269 int
1270 pq_setkeepalivesinterval(int interval, Port *port)
1271 {
1272         if (IS_AF_UNIX(port->laddr.addr.ss_family))
1273                 return STATUS_OK;
1274
1275 #ifdef TCP_KEEPINTVL
1276         if (interval == port->keepalives_interval)
1277                 return STATUS_OK;
1278
1279         if (port->default_keepalives_interval == 0) {
1280                 if (pq_getkeepalivesinterval(port) < 0)
1281                         return STATUS_ERROR;
1282         }
1283                         
1284         if (interval == 0)
1285                 interval = port->default_keepalives_interval;
1286
1287         if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPINTVL,
1288                                    (char *) &interval, sizeof(interval)) < 0)
1289         {
1290                 elog(LOG, "setsockopt(TCP_KEEPINTVL) failed: %m");
1291                 return STATUS_ERROR;
1292         }
1293
1294         port->keepalives_interval = interval;
1295 #else
1296         if (interval != 0)
1297         {
1298                 elog(LOG, "setsockopt(TCP_KEEPINTVL) not supported");
1299                 return STATUS_ERROR;
1300         }               
1301 #endif
1302
1303         return STATUS_OK;
1304 }
1305
1306 int
1307 pq_getkeepalivescount(Port *port)
1308 {
1309 #ifdef TCP_KEEPCNT
1310         if (IS_AF_UNIX(port->laddr.addr.ss_family))
1311                 return 0;
1312
1313         if (port->keepalives_count != 0)
1314                 return port->keepalives_count;
1315
1316         if (port->default_keepalives_count == 0)
1317         {
1318                 socklen_t size = sizeof(port->default_keepalives_count);
1319                 if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPCNT,
1320                                            (char *) &port->default_keepalives_count, 
1321                                            &size) < 0)
1322                 {
1323                         elog(LOG, "getsockopt(TCP_KEEPCNT) failed: %m");
1324                         return -1;
1325                 }
1326         }
1327
1328         return port->default_keepalives_count;
1329 #else
1330         return 0;
1331 #endif
1332 }
1333    
1334 int
1335 pq_setkeepalivescount(int count, Port *port)
1336 {
1337         if (IS_AF_UNIX(port->laddr.addr.ss_family))
1338                 return STATUS_OK;
1339
1340 #ifdef TCP_KEEPCNT
1341         if (count == port->keepalives_count)
1342                 return STATUS_OK;
1343
1344         if (port->default_keepalives_count == 0) {
1345                 if (pq_getkeepalivescount(port) < 0)
1346                         return STATUS_ERROR;
1347         }
1348                         
1349         if (count == 0)
1350                 count = port->default_keepalives_count;
1351
1352         if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPCNT,
1353                                    (char *) &count, sizeof(count)) < 0)
1354         {
1355                 elog(LOG, "setsockopt(TCP_KEEPCNT) failed: %m");
1356                 return STATUS_ERROR;
1357         }
1358
1359         port->keepalives_count = count;
1360 #else
1361         if (count != 0)
1362         {
1363                 elog(LOG, "setsockopt(TCP_KEEPCNT) not supported");
1364                 return STATUS_ERROR;
1365         }
1366 #endif
1367
1368         return STATUS_OK;
1369 }