]> granicus.if.org Git - postgresql/blob - src/bin/pg_basebackup/receivelog.c
pg_basebackup pg_receivexlog: Issue fsync more carefully
[postgresql] / src / bin / pg_basebackup / receivelog.c
1 /*-------------------------------------------------------------------------
2  *
3  * receivelog.c - receive transaction log files using the streaming
4  *                                replication protocol.
5  *
6  * Author: Magnus Hagander <magnus@hagander.net>
7  *
8  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
9  *
10  * IDENTIFICATION
11  *                src/bin/pg_basebackup/receivelog.c
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres_fe.h"
16
17 #include <sys/stat.h>
18 #include <unistd.h>
19 #ifdef HAVE_SYS_SELECT_H
20 #include <sys/select.h>
21 #endif
22
23 /* local includes */
24 #include "receivelog.h"
25 #include "streamutil.h"
26
27 #include "libpq-fe.h"
28 #include "access/xlog_internal.h"
29 #include "common/file_utils.h"
30
31
32 /* fd and filename for currently open WAL file */
33 static int      walfile = -1;
34 static char current_walfile_name[MAXPGPATH] = "";
35 static bool reportFlushPosition = false;
36 static XLogRecPtr lastFlushPosition = InvalidXLogRecPtr;
37
38 static bool still_sending = true;               /* feedback still needs to be sent? */
39
40 static PGresult *HandleCopyStream(PGconn *conn, StreamCtl *stream,
41                                  XLogRecPtr *stoppos);
42 static int      CopyStreamPoll(PGconn *conn, long timeout_ms);
43 static int      CopyStreamReceive(PGconn *conn, long timeout, char **buffer);
44 static bool ProcessKeepaliveMsg(PGconn *conn, char *copybuf, int len,
45                                         XLogRecPtr blockpos, int64 *last_status);
46 static bool ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len,
47                                    XLogRecPtr *blockpos);
48 static PGresult *HandleEndOfCopyStream(PGconn *conn, StreamCtl *stream, char *copybuf,
49                                           XLogRecPtr blockpos, XLogRecPtr *stoppos);
50 static bool CheckCopyStreamStop(PGconn *conn, StreamCtl *stream, XLogRecPtr blockpos,
51                                         XLogRecPtr *stoppos);
52 static long CalculateCopyStreamSleeptime(int64 now, int standby_message_timeout,
53                                                          int64 last_status);
54
55 static bool ReadEndOfStreamingResult(PGresult *res, XLogRecPtr *startpos,
56                                                  uint32 *timeline);
57
58 static bool
59 mark_file_as_archived(const char *basedir, const char *fname)
60 {
61         int                     fd;
62         static char tmppath[MAXPGPATH];
63
64         snprintf(tmppath, sizeof(tmppath), "%s/archive_status/%s.done",
65                          basedir, fname);
66
67         fd = open(tmppath, O_WRONLY | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR);
68         if (fd < 0)
69         {
70                 fprintf(stderr, _("%s: could not create archive status file \"%s\": %s\n"),
71                                 progname, tmppath, strerror(errno));
72                 return false;
73         }
74
75         close(fd);
76
77         if (fsync_fname(tmppath, false, progname) != 0)
78                 return false;
79
80         if (fsync_parent_path(tmppath, progname) != 0)
81                 return false;
82
83         return true;
84 }
85
86 /*
87  * Open a new WAL file in the specified directory.
88  *
89  * The file will be padded to 16Mb with zeroes. The base filename (without
90  * partial_suffix) is stored in current_walfile_name.
91  */
92 static bool
93 open_walfile(StreamCtl *stream, XLogRecPtr startpoint)
94 {
95         int                     f;
96         char            fn[MAXPGPATH];
97         struct stat statbuf;
98         char       *zerobuf;
99         int                     bytes;
100         XLogSegNo       segno;
101
102         XLByteToSeg(startpoint, segno);
103         XLogFileName(current_walfile_name, stream->timeline, segno);
104
105         snprintf(fn, sizeof(fn), "%s/%s%s", stream->basedir, current_walfile_name,
106                          stream->partial_suffix ? stream->partial_suffix : "");
107         f = open(fn, O_WRONLY | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR);
108         if (f == -1)
109         {
110                 fprintf(stderr,
111                                 _("%s: could not open transaction log file \"%s\": %s\n"),
112                                 progname, fn, strerror(errno));
113                 return false;
114         }
115
116         /*
117          * Verify that the file is either empty (just created), or a complete
118          * XLogSegSize segment. Anything in between indicates a corrupt file.
119          */
120         if (fstat(f, &statbuf) != 0)
121         {
122                 fprintf(stderr,
123                                 _("%s: could not stat transaction log file \"%s\": %s\n"),
124                                 progname, fn, strerror(errno));
125                 close(f);
126                 return false;
127         }
128         if (statbuf.st_size == XLogSegSize)
129         {
130                 /* File is open and ready to use */
131                 walfile = f;
132
133                 /*
134                  * fsync, in case of a previous crash between padding and fsyncing the
135                  * file.
136                  */
137                 if (fsync_fname(fn, false, progname) != 0)
138                         return false;
139                 if (fsync_parent_path(fn, progname) != 0)
140                         return false;
141
142                 return true;
143         }
144         if (statbuf.st_size != 0)
145         {
146                 fprintf(stderr,
147                                 _("%s: transaction log file \"%s\" has %d bytes, should be 0 or %d\n"),
148                                 progname, fn, (int) statbuf.st_size, XLogSegSize);
149                 close(f);
150                 return false;
151         }
152
153         /* New, empty, file. So pad it to 16Mb with zeroes */
154         zerobuf = pg_malloc0(XLOG_BLCKSZ);
155         for (bytes = 0; bytes < XLogSegSize; bytes += XLOG_BLCKSZ)
156         {
157                 if (write(f, zerobuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
158                 {
159                         fprintf(stderr,
160                                         _("%s: could not pad transaction log file \"%s\": %s\n"),
161                                         progname, fn, strerror(errno));
162                         free(zerobuf);
163                         close(f);
164                         unlink(fn);
165                         return false;
166                 }
167         }
168         free(zerobuf);
169
170         /*
171          * fsync WAL file and containing directory, to ensure the file is
172          * persistently created and zeroed. That's particularly important when
173          * using synchronous mode, where the file is modified and fsynced
174          * in-place, without a directory fsync.
175          */
176         if (fsync_fname(fn, false, progname) != 0)
177                 return false;
178         if (fsync_parent_path(fn, progname) != 0)
179                 return false;
180
181         if (lseek(f, SEEK_SET, 0) != 0)
182         {
183                 fprintf(stderr,
184                                 _("%s: could not seek to beginning of transaction log file \"%s\": %s\n"),
185                                 progname, fn, strerror(errno));
186                 close(f);
187                 return false;
188         }
189         walfile = f;
190         return true;
191 }
192
193 /*
194  * Close the current WAL file (if open), and rename it to the correct
195  * filename if it's complete. On failure, prints an error message to stderr
196  * and returns false, otherwise returns true.
197  */
198 static bool
199 close_walfile(StreamCtl *stream, XLogRecPtr pos)
200 {
201         off_t           currpos;
202
203         if (walfile == -1)
204                 return true;
205
206         currpos = lseek(walfile, 0, SEEK_CUR);
207         if (currpos == -1)
208         {
209                 fprintf(stderr,
210                          _("%s: could not determine seek position in file \"%s\": %s\n"),
211                                 progname, current_walfile_name, strerror(errno));
212                 return false;
213         }
214
215         if (fsync(walfile) != 0)
216         {
217                 fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
218                                 progname, current_walfile_name, strerror(errno));
219                 return false;
220         }
221
222         if (close(walfile) != 0)
223         {
224                 fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
225                                 progname, current_walfile_name, strerror(errno));
226                 walfile = -1;
227                 return false;
228         }
229         walfile = -1;
230
231         /*
232          * If we finished writing a .partial file, rename it into place.
233          */
234         if (currpos == XLOG_SEG_SIZE && stream->partial_suffix)
235         {
236                 char            oldfn[MAXPGPATH];
237                 char            newfn[MAXPGPATH];
238
239                 snprintf(oldfn, sizeof(oldfn), "%s/%s%s", stream->basedir, current_walfile_name, stream->partial_suffix);
240                 snprintf(newfn, sizeof(newfn), "%s/%s", stream->basedir, current_walfile_name);
241                 if (durable_rename(oldfn, newfn, progname) != 0)
242                 {
243                         /* durable_rename produced a log entry */
244                         return false;
245                 }
246         }
247         else if (stream->partial_suffix)
248                 fprintf(stderr,
249                                 _("%s: not renaming \"%s%s\", segment is not complete\n"),
250                                 progname, current_walfile_name, stream->partial_suffix);
251
252         /*
253          * Mark file as archived if requested by the caller - pg_basebackup needs
254          * to do so as files can otherwise get archived again after promotion of a
255          * new node. This is in line with walreceiver.c always doing a
256          * XLogArchiveForceDone() after a complete segment.
257          */
258         if (currpos == XLOG_SEG_SIZE && stream->mark_done)
259         {
260                 /* writes error message if failed */
261                 if (!mark_file_as_archived(stream->basedir, current_walfile_name))
262                         return false;
263         }
264
265         lastFlushPosition = pos;
266         return true;
267 }
268
269
270 /*
271  * Check if a timeline history file exists.
272  */
273 static bool
274 existsTimeLineHistoryFile(StreamCtl *stream)
275 {
276         char            path[MAXPGPATH];
277         char            histfname[MAXFNAMELEN];
278         int                     fd;
279
280         /*
281          * Timeline 1 never has a history file. We treat that as if it existed,
282          * since we never need to stream it.
283          */
284         if (stream->timeline == 1)
285                 return true;
286
287         TLHistoryFileName(histfname, stream->timeline);
288
289         snprintf(path, sizeof(path), "%s/%s", stream->basedir, histfname);
290
291         fd = open(path, O_RDONLY | PG_BINARY, 0);
292         if (fd < 0)
293         {
294                 if (errno != ENOENT)
295                         fprintf(stderr, _("%s: could not open timeline history file \"%s\": %s\n"),
296                                         progname, path, strerror(errno));
297                 return false;
298         }
299         else
300         {
301                 close(fd);
302                 return true;
303         }
304 }
305
306 static bool
307 writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content)
308 {
309         int                     size = strlen(content);
310         char            path[MAXPGPATH];
311         char            tmppath[MAXPGPATH];
312         char            histfname[MAXFNAMELEN];
313         int                     fd;
314
315         /*
316          * Check that the server's idea of how timeline history files should be
317          * named matches ours.
318          */
319         TLHistoryFileName(histfname, stream->timeline);
320         if (strcmp(histfname, filename) != 0)
321         {
322                 fprintf(stderr, _("%s: server reported unexpected history file name for timeline %u: %s\n"),
323                                 progname, stream->timeline, filename);
324                 return false;
325         }
326
327         snprintf(path, sizeof(path), "%s/%s", stream->basedir, histfname);
328
329         /*
330          * Write into a temp file name.
331          */
332         snprintf(tmppath, MAXPGPATH, "%s.tmp", path);
333
334         unlink(tmppath);
335
336         fd = open(tmppath, O_WRONLY | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR);
337         if (fd < 0)
338         {
339                 fprintf(stderr, _("%s: could not create timeline history file \"%s\": %s\n"),
340                                 progname, tmppath, strerror(errno));
341                 return false;
342         }
343
344         errno = 0;
345         if ((int) write(fd, content, size) != size)
346         {
347                 int                     save_errno = errno;
348
349                 /*
350                  * If we fail to make the file, delete it to release disk space
351                  */
352                 close(fd);
353                 unlink(tmppath);
354                 errno = save_errno;
355
356                 fprintf(stderr, _("%s: could not write timeline history file \"%s\": %s\n"),
357                                 progname, tmppath, strerror(errno));
358                 return false;
359         }
360
361         if (close(fd) != 0)
362         {
363                 fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
364                                 progname, tmppath, strerror(errno));
365                 return false;
366         }
367
368         /*
369          * Now move the completed history file into place with its final name.
370          */
371         if (durable_rename(tmppath, path, progname) < 0)
372         {
373                 /* durable_rename produced a log entry */
374                 return false;
375         }
376
377         /* Maintain archive_status, check close_walfile() for details. */
378         if (stream->mark_done)
379         {
380                 /* writes error message if failed */
381                 if (!mark_file_as_archived(stream->basedir, histfname))
382                         return false;
383         }
384
385         return true;
386 }
387
388 /*
389  * Send a Standby Status Update message to server.
390  */
391 static bool
392 sendFeedback(PGconn *conn, XLogRecPtr blockpos, int64 now, bool replyRequested)
393 {
394         char            replybuf[1 + 8 + 8 + 8 + 8 + 1];
395         int                     len = 0;
396
397         replybuf[len] = 'r';
398         len += 1;
399         fe_sendint64(blockpos, &replybuf[len]);         /* write */
400         len += 8;
401         if (reportFlushPosition)
402                 fe_sendint64(lastFlushPosition, &replybuf[len]);                /* flush */
403         else
404                 fe_sendint64(InvalidXLogRecPtr, &replybuf[len]);                /* flush */
405         len += 8;
406         fe_sendint64(InvalidXLogRecPtr, &replybuf[len]);        /* apply */
407         len += 8;
408         fe_sendint64(now, &replybuf[len]);      /* sendTime */
409         len += 8;
410         replybuf[len] = replyRequested ? 1 : 0;         /* replyRequested */
411         len += 1;
412
413         if (PQputCopyData(conn, replybuf, len) <= 0 || PQflush(conn))
414         {
415                 fprintf(stderr, _("%s: could not send feedback packet: %s"),
416                                 progname, PQerrorMessage(conn));
417                 return false;
418         }
419
420         return true;
421 }
422
423 /*
424  * Check that the server version we're connected to is supported by
425  * ReceiveXlogStream().
426  *
427  * If it's not, an error message is printed to stderr, and false is returned.
428  */
429 bool
430 CheckServerVersionForStreaming(PGconn *conn)
431 {
432         int                     minServerMajor,
433                                 maxServerMajor;
434         int                     serverMajor;
435
436         /*
437          * The message format used in streaming replication changed in 9.3, so we
438          * cannot stream from older servers. And we don't support servers newer
439          * than the client; it might work, but we don't know, so err on the safe
440          * side.
441          */
442         minServerMajor = 903;
443         maxServerMajor = PG_VERSION_NUM / 100;
444         serverMajor = PQserverVersion(conn) / 100;
445         if (serverMajor < minServerMajor)
446         {
447                 const char *serverver = PQparameterStatus(conn, "server_version");
448
449                 fprintf(stderr, _("%s: incompatible server version %s; client does not support streaming from server versions older than %s\n"),
450                                 progname,
451                                 serverver ? serverver : "'unknown'",
452                                 "9.3");
453                 return false;
454         }
455         else if (serverMajor > maxServerMajor)
456         {
457                 const char *serverver = PQparameterStatus(conn, "server_version");
458
459                 fprintf(stderr, _("%s: incompatible server version %s; client does not support streaming from server versions newer than %s\n"),
460                                 progname,
461                                 serverver ? serverver : "'unknown'",
462                                 PG_VERSION);
463                 return false;
464         }
465         return true;
466 }
467
468 /*
469  * Receive a log stream starting at the specified position.
470  *
471  * Individual parameters are passed through the StreamCtl structure.
472  *
473  * If sysidentifier is specified, validate that both the system
474  * identifier and the timeline matches the specified ones
475  * (by sending an extra IDENTIFY_SYSTEM command)
476  *
477  * All received segments will be written to the directory
478  * specified by basedir. This will also fetch any missing timeline history
479  * files.
480  *
481  * The stream_stop callback will be called every time data
482  * is received, and whenever a segment is completed. If it returns
483  * true, the streaming will stop and the function
484  * return. As long as it returns false, streaming will continue
485  * indefinitely.
486  *
487  * standby_message_timeout controls how often we send a message
488  * back to the master letting it know our progress, in milliseconds.
489  * This message will only contain the write location, and never
490  * flush or replay.
491  *
492  * If 'partial_suffix' is not NULL, files are initially created with the
493  * given suffix, and the suffix is removed once the file is finished. That
494  * allows you to tell the difference between partial and completed files,
495  * so that you can continue later where you left.
496  *
497  * If 'synchronous' is true, the received WAL is flushed as soon as written,
498  * otherwise only when the WAL file is closed.
499  *
500  * Note: The log position *must* be at a log segment start!
501  */
502 bool
503 ReceiveXlogStream(PGconn *conn, StreamCtl *stream)
504 {
505         char            query[128];
506         char            slotcmd[128];
507         PGresult   *res;
508         XLogRecPtr      stoppos;
509
510         /*
511          * The caller should've checked the server version already, but doesn't do
512          * any harm to check it here too.
513          */
514         if (!CheckServerVersionForStreaming(conn))
515                 return false;
516
517         /*
518          * Decide whether we want to report the flush position. If we report
519          * the flush position, the primary will know what WAL we'll
520          * possibly re-request, and it can then remove older WAL safely.
521          * We must always do that when we are using slots.
522          *
523          * Reporting the flush position makes one eligible as a synchronous
524          * replica. People shouldn't include generic names in
525          * synchronous_standby_names, but we've protected them against it so
526          * far, so let's continue to do so unless specifically requested.
527          */
528         if (replication_slot != NULL)
529         {
530                 reportFlushPosition = true;
531                 sprintf(slotcmd, "SLOT \"%s\" ", replication_slot);
532         }
533         else
534         {
535                 if (stream->synchronous)
536                         reportFlushPosition = true;
537                 else
538                         reportFlushPosition = false;
539                 slotcmd[0] = 0;
540         }
541
542         if (stream->sysidentifier != NULL)
543         {
544                 /* Validate system identifier hasn't changed */
545                 res = PQexec(conn, "IDENTIFY_SYSTEM");
546                 if (PQresultStatus(res) != PGRES_TUPLES_OK)
547                 {
548                         fprintf(stderr,
549                                         _("%s: could not send replication command \"%s\": %s"),
550                                         progname, "IDENTIFY_SYSTEM", PQerrorMessage(conn));
551                         PQclear(res);
552                         return false;
553                 }
554                 if (PQntuples(res) != 1 || PQnfields(res) < 3)
555                 {
556                         fprintf(stderr,
557                                         _("%s: could not identify system: got %d rows and %d fields, expected %d rows and %d or more fields\n"),
558                                         progname, PQntuples(res), PQnfields(res), 1, 3);
559                         PQclear(res);
560                         return false;
561                 }
562                 if (strcmp(stream->sysidentifier, PQgetvalue(res, 0, 0)) != 0)
563                 {
564                         fprintf(stderr,
565                                         _("%s: system identifier does not match between base backup and streaming connection\n"),
566                                         progname);
567                         PQclear(res);
568                         return false;
569                 }
570                 if (stream->timeline > atoi(PQgetvalue(res, 0, 1)))
571                 {
572                         fprintf(stderr,
573                                 _("%s: starting timeline %u is not present in the server\n"),
574                                         progname, stream->timeline);
575                         PQclear(res);
576                         return false;
577                 }
578                 PQclear(res);
579         }
580
581         /*
582          * initialize flush position to starting point, it's the caller's
583          * responsibility that that's sane.
584          */
585         lastFlushPosition = stream->startpos;
586
587         while (1)
588         {
589                 /*
590                  * Fetch the timeline history file for this timeline, if we don't have
591                  * it already.
592                  */
593                 if (!existsTimeLineHistoryFile(stream))
594                 {
595                         snprintf(query, sizeof(query), "TIMELINE_HISTORY %u", stream->timeline);
596                         res = PQexec(conn, query);
597                         if (PQresultStatus(res) != PGRES_TUPLES_OK)
598                         {
599                                 /* FIXME: we might send it ok, but get an error */
600                                 fprintf(stderr, _("%s: could not send replication command \"%s\": %s"),
601                                         progname, "TIMELINE_HISTORY", PQresultErrorMessage(res));
602                                 PQclear(res);
603                                 return false;
604                         }
605
606                         /*
607                          * The response to TIMELINE_HISTORY is a single row result set
608                          * with two fields: filename and content
609                          */
610                         if (PQnfields(res) != 2 || PQntuples(res) != 1)
611                         {
612                                 fprintf(stderr,
613                                                 _("%s: unexpected response to TIMELINE_HISTORY command: got %d rows and %d fields, expected %d rows and %d fields\n"),
614                                                 progname, PQntuples(res), PQnfields(res), 1, 2);
615                         }
616
617                         /* Write the history file to disk */
618                         writeTimeLineHistoryFile(stream,
619                                                                          PQgetvalue(res, 0, 0),
620                                                                          PQgetvalue(res, 0, 1));
621
622                         PQclear(res);
623                 }
624
625                 /*
626                  * Before we start streaming from the requested location, check if the
627                  * callback tells us to stop here.
628                  */
629                 if (stream->stream_stop(stream->startpos, stream->timeline, false))
630                         return true;
631
632                 /* Initiate the replication stream at specified location */
633                 snprintf(query, sizeof(query), "START_REPLICATION %s%X/%X TIMELINE %u",
634                                  slotcmd,
635                                  (uint32) (stream->startpos >> 32), (uint32) stream->startpos,
636                                  stream->timeline);
637                 res = PQexec(conn, query);
638                 if (PQresultStatus(res) != PGRES_COPY_BOTH)
639                 {
640                         fprintf(stderr, _("%s: could not send replication command \"%s\": %s"),
641                                         progname, "START_REPLICATION", PQresultErrorMessage(res));
642                         PQclear(res);
643                         return false;
644                 }
645                 PQclear(res);
646
647                 /* Stream the WAL */
648                 res = HandleCopyStream(conn, stream, &stoppos);
649                 if (res == NULL)
650                         goto error;
651
652                 /*
653                  * Streaming finished.
654                  *
655                  * There are two possible reasons for that: a controlled shutdown, or
656                  * we reached the end of the current timeline. In case of
657                  * end-of-timeline, the server sends a result set after Copy has
658                  * finished, containing information about the next timeline. Read
659                  * that, and restart streaming from the next timeline. In case of
660                  * controlled shutdown, stop here.
661                  */
662                 if (PQresultStatus(res) == PGRES_TUPLES_OK)
663                 {
664                         /*
665                          * End-of-timeline. Read the next timeline's ID and starting
666                          * position. Usually, the starting position will match the end of
667                          * the previous timeline, but there are corner cases like if the
668                          * server had sent us half of a WAL record, when it was promoted.
669                          * The new timeline will begin at the end of the last complete
670                          * record in that case, overlapping the partial WAL record on the
671                          * the old timeline.
672                          */
673                         uint32          newtimeline;
674                         bool            parsed;
675
676                         parsed = ReadEndOfStreamingResult(res, &stream->startpos, &newtimeline);
677                         PQclear(res);
678                         if (!parsed)
679                                 goto error;
680
681                         /* Sanity check the values the server gave us */
682                         if (newtimeline <= stream->timeline)
683                         {
684                                 fprintf(stderr,
685                                                 _("%s: server reported unexpected next timeline %u, following timeline %u\n"),
686                                                 progname, newtimeline, stream->timeline);
687                                 goto error;
688                         }
689                         if (stream->startpos > stoppos)
690                         {
691                                 fprintf(stderr,
692                                                 _("%s: server stopped streaming timeline %u at %X/%X, but reported next timeline %u to begin at %X/%X\n"),
693                                                 progname,
694                                 stream->timeline, (uint32) (stoppos >> 32), (uint32) stoppos,
695                                                 newtimeline, (uint32) (stream->startpos >> 32), (uint32) stream->startpos);
696                                 goto error;
697                         }
698
699                         /* Read the final result, which should be CommandComplete. */
700                         res = PQgetResult(conn);
701                         if (PQresultStatus(res) != PGRES_COMMAND_OK)
702                         {
703                                 fprintf(stderr,
704                                    _("%s: unexpected termination of replication stream: %s"),
705                                                 progname, PQresultErrorMessage(res));
706                                 PQclear(res);
707                                 goto error;
708                         }
709                         PQclear(res);
710
711                         /*
712                          * Loop back to start streaming from the new timeline. Always
713                          * start streaming at the beginning of a segment.
714                          */
715                         stream->timeline = newtimeline;
716                         stream->startpos = stream->startpos - (stream->startpos % XLOG_SEG_SIZE);
717                         continue;
718                 }
719                 else if (PQresultStatus(res) == PGRES_COMMAND_OK)
720                 {
721                         PQclear(res);
722
723                         /*
724                          * End of replication (ie. controlled shut down of the server).
725                          *
726                          * Check if the callback thinks it's OK to stop here. If not,
727                          * complain.
728                          */
729                         if (stream->stream_stop(stoppos, stream->timeline, false))
730                                 return true;
731                         else
732                         {
733                                 fprintf(stderr, _("%s: replication stream was terminated before stop point\n"),
734                                                 progname);
735                                 goto error;
736                         }
737                 }
738                 else
739                 {
740                         /* Server returned an error. */
741                         fprintf(stderr,
742                                         _("%s: unexpected termination of replication stream: %s"),
743                                         progname, PQresultErrorMessage(res));
744                         PQclear(res);
745                         goto error;
746                 }
747         }
748
749 error:
750         if (walfile != -1 && close(walfile) != 0)
751                 fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
752                                 progname, current_walfile_name, strerror(errno));
753         walfile = -1;
754         return false;
755 }
756
757 /*
758  * Helper function to parse the result set returned by server after streaming
759  * has finished. On failure, prints an error to stderr and returns false.
760  */
761 static bool
762 ReadEndOfStreamingResult(PGresult *res, XLogRecPtr *startpos, uint32 *timeline)
763 {
764         uint32          startpos_xlogid,
765                                 startpos_xrecoff;
766
767         /*----------
768          * The result set consists of one row and two columns, e.g:
769          *
770          *      next_tli | next_tli_startpos
771          * ----------+-------------------
772          *                 4 | 0/9949AE0
773          *
774          * next_tli is the timeline ID of the next timeline after the one that
775          * just finished streaming. next_tli_startpos is the XLOG position where
776          * the server switched to it.
777          *----------
778          */
779         if (PQnfields(res) < 2 || PQntuples(res) != 1)
780         {
781                 fprintf(stderr,
782                                 _("%s: unexpected result set after end-of-timeline: got %d rows and %d fields, expected %d rows and %d fields\n"),
783                                 progname, PQntuples(res), PQnfields(res), 1, 2);
784                 return false;
785         }
786
787         *timeline = atoi(PQgetvalue(res, 0, 0));
788         if (sscanf(PQgetvalue(res, 0, 1), "%X/%X", &startpos_xlogid,
789                            &startpos_xrecoff) != 2)
790         {
791                 fprintf(stderr,
792                         _("%s: could not parse next timeline's starting point \"%s\"\n"),
793                                 progname, PQgetvalue(res, 0, 1));
794                 return false;
795         }
796         *startpos = ((uint64) startpos_xlogid << 32) | startpos_xrecoff;
797
798         return true;
799 }
800
801 /*
802  * The main loop of ReceiveXlogStream. Handles the COPY stream after
803  * initiating streaming with the START_STREAMING command.
804  *
805  * If the COPY ends (not necessarily successfully) due a message from the
806  * server, returns a PGresult and sets *stoppos to the last byte written.
807  * On any other sort of error, returns NULL.
808  */
809 static PGresult *
810 HandleCopyStream(PGconn *conn, StreamCtl *stream,
811                                  XLogRecPtr *stoppos)
812 {
813         char       *copybuf = NULL;
814         int64           last_status = -1;
815         XLogRecPtr      blockpos = stream->startpos;
816
817         still_sending = true;
818
819         while (1)
820         {
821                 int                     r;
822                 int64           now;
823                 long            sleeptime;
824
825                 /*
826                  * Check if we should continue streaming, or abort at this point.
827                  */
828                 if (!CheckCopyStreamStop(conn, stream, blockpos, stoppos))
829                         goto error;
830
831                 now = feGetCurrentTimestamp();
832
833                 /*
834                  * If synchronous option is true, issue sync command as soon as there
835                  * are WAL data which has not been flushed yet.
836                  */
837                 if (stream->synchronous && lastFlushPosition < blockpos && walfile != -1)
838                 {
839                         if (fsync(walfile) != 0)
840                         {
841                                 fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
842                                                 progname, current_walfile_name, strerror(errno));
843                                 goto error;
844                         }
845                         lastFlushPosition = blockpos;
846
847                         /*
848                          * Send feedback so that the server sees the latest WAL locations
849                          * immediately.
850                          */
851                         if (!sendFeedback(conn, blockpos, now, false))
852                                 goto error;
853                         last_status = now;
854                 }
855
856                 /*
857                  * Potentially send a status message to the master
858                  */
859                 if (still_sending && stream->standby_message_timeout > 0 &&
860                         feTimestampDifferenceExceeds(last_status, now,
861                                                                                  stream->standby_message_timeout))
862                 {
863                         /* Time to send feedback! */
864                         if (!sendFeedback(conn, blockpos, now, false))
865                                 goto error;
866                         last_status = now;
867                 }
868
869                 /*
870                  * Calculate how long send/receive loops should sleep
871                  */
872                 sleeptime = CalculateCopyStreamSleeptime(now, stream->standby_message_timeout,
873                                                                                                  last_status);
874
875                 r = CopyStreamReceive(conn, sleeptime, &copybuf);
876                 while (r != 0)
877                 {
878                         if (r == -1)
879                                 goto error;
880                         if (r == -2)
881                         {
882                                 PGresult   *res = HandleEndOfCopyStream(conn, stream, copybuf, blockpos, stoppos);
883
884                                 if (res == NULL)
885                                         goto error;
886                                 else
887                                         return res;
888                         }
889
890                         /* Check the message type. */
891                         if (copybuf[0] == 'k')
892                         {
893                                 if (!ProcessKeepaliveMsg(conn, copybuf, r, blockpos,
894                                                                                  &last_status))
895                                         goto error;
896                         }
897                         else if (copybuf[0] == 'w')
898                         {
899                                 if (!ProcessXLogDataMsg(conn, stream, copybuf, r, &blockpos))
900                                         goto error;
901
902                                 /*
903                                  * Check if we should continue streaming, or abort at this
904                                  * point.
905                                  */
906                                 if (!CheckCopyStreamStop(conn, stream, blockpos, stoppos))
907                                         goto error;
908                         }
909                         else
910                         {
911                                 fprintf(stderr, _("%s: unrecognized streaming header: \"%c\"\n"),
912                                                 progname, copybuf[0]);
913                                 goto error;
914                         }
915
916                         /*
917                          * Process the received data, and any subsequent data we can read
918                          * without blocking.
919                          */
920                         r = CopyStreamReceive(conn, 0, &copybuf);
921                 }
922         }
923
924 error:
925         if (copybuf != NULL)
926                 PQfreemem(copybuf);
927         return NULL;
928 }
929
930 /*
931  * Wait until we can read CopyData message, or timeout.
932  *
933  * Returns 1 if data has become available for reading, 0 if timed out
934  * or interrupted by signal, and -1 on an error.
935  */
936 static int
937 CopyStreamPoll(PGconn *conn, long timeout_ms)
938 {
939         int                     ret;
940         fd_set          input_mask;
941         struct timeval timeout;
942         struct timeval *timeoutptr;
943
944         if (PQsocket(conn) < 0)
945         {
946                 fprintf(stderr, _("%s: invalid socket: %s"), progname,
947                                 PQerrorMessage(conn));
948                 return -1;
949         }
950
951         FD_ZERO(&input_mask);
952         FD_SET(PQsocket(conn), &input_mask);
953
954         if (timeout_ms < 0)
955                 timeoutptr = NULL;
956         else
957         {
958                 timeout.tv_sec = timeout_ms / 1000L;
959                 timeout.tv_usec = (timeout_ms % 1000L) * 1000L;
960                 timeoutptr = &timeout;
961         }
962
963         ret = select(PQsocket(conn) + 1, &input_mask, NULL, NULL, timeoutptr);
964         if (ret == 0 || (ret < 0 && errno == EINTR))
965                 return 0;                               /* Got a timeout or signal */
966         else if (ret < 0)
967         {
968                 fprintf(stderr, _("%s: select() failed: %s\n"),
969                                 progname, strerror(errno));
970                 return -1;
971         }
972
973         return 1;
974 }
975
976 /*
977  * Receive CopyData message available from XLOG stream, blocking for
978  * maximum of 'timeout' ms.
979  *
980  * If data was received, returns the length of the data. *buffer is set to
981  * point to a buffer holding the received message. The buffer is only valid
982  * until the next CopyStreamReceive call.
983  *
984  * 0 if no data was available within timeout, or wait was interrupted
985  * by signal. -1 on error. -2 if the server ended the COPY.
986  */
987 static int
988 CopyStreamReceive(PGconn *conn, long timeout, char **buffer)
989 {
990         char       *copybuf = NULL;
991         int                     rawlen;
992
993         if (*buffer != NULL)
994                 PQfreemem(*buffer);
995         *buffer = NULL;
996
997         /* Try to receive a CopyData message */
998         rawlen = PQgetCopyData(conn, &copybuf, 1);
999         if (rawlen == 0)
1000         {
1001                 /*
1002                  * No data available. Wait for some to appear, but not longer than the
1003                  * specified timeout, so that we can ping the server.
1004                  */
1005                 if (timeout != 0)
1006                 {
1007                         int                     ret;
1008
1009                         ret = CopyStreamPoll(conn, timeout);
1010                         if (ret <= 0)
1011                                 return ret;
1012                 }
1013
1014                 /* Else there is actually data on the socket */
1015                 if (PQconsumeInput(conn) == 0)
1016                 {
1017                         fprintf(stderr,
1018                                         _("%s: could not receive data from WAL stream: %s"),
1019                                         progname, PQerrorMessage(conn));
1020                         return -1;
1021                 }
1022
1023                 /* Now that we've consumed some input, try again */
1024                 rawlen = PQgetCopyData(conn, &copybuf, 1);
1025                 if (rawlen == 0)
1026                         return 0;
1027         }
1028         if (rawlen == -1)                       /* end-of-streaming or error */
1029                 return -2;
1030         if (rawlen == -2)
1031         {
1032                 fprintf(stderr, _("%s: could not read COPY data: %s"),
1033                                 progname, PQerrorMessage(conn));
1034                 return -1;
1035         }
1036
1037         /* Return received messages to caller */
1038         *buffer = copybuf;
1039         return rawlen;
1040 }
1041
1042 /*
1043  * Process the keepalive message.
1044  */
1045 static bool
1046 ProcessKeepaliveMsg(PGconn *conn, char *copybuf, int len,
1047                                         XLogRecPtr blockpos, int64 *last_status)
1048 {
1049         int                     pos;
1050         bool            replyRequested;
1051         int64           now;
1052
1053         /*
1054          * Parse the keepalive message, enclosed in the CopyData message. We just
1055          * check if the server requested a reply, and ignore the rest.
1056          */
1057         pos = 1;                                        /* skip msgtype 'k' */
1058         pos += 8;                                       /* skip walEnd */
1059         pos += 8;                                       /* skip sendTime */
1060
1061         if (len < pos + 1)
1062         {
1063                 fprintf(stderr, _("%s: streaming header too small: %d\n"),
1064                                 progname, len);
1065                 return false;
1066         }
1067         replyRequested = copybuf[pos];
1068
1069         /* If the server requested an immediate reply, send one. */
1070         if (replyRequested && still_sending)
1071         {
1072                 if (reportFlushPosition && lastFlushPosition < blockpos &&
1073                         walfile != -1)
1074                 {
1075                         /*
1076                          * If a valid flush location needs to be reported, flush the
1077                          * current WAL file so that the latest flush location is sent back
1078                          * to the server. This is necessary to see whether the last WAL
1079                          * data has been successfully replicated or not, at the normal
1080                          * shutdown of the server.
1081                          */
1082                         if (fsync(walfile) != 0)
1083                         {
1084                                 fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
1085                                                 progname, current_walfile_name, strerror(errno));
1086                                 return false;
1087                         }
1088                         lastFlushPosition = blockpos;
1089                 }
1090
1091                 now = feGetCurrentTimestamp();
1092                 if (!sendFeedback(conn, blockpos, now, false))
1093                         return false;
1094                 *last_status = now;
1095         }
1096
1097         return true;
1098 }
1099
1100 /*
1101  * Process XLogData message.
1102  */
1103 static bool
1104 ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len,
1105                                    XLogRecPtr *blockpos)
1106 {
1107         int                     xlogoff;
1108         int                     bytes_left;
1109         int                     bytes_written;
1110         int                     hdr_len;
1111
1112         /*
1113          * Once we've decided we don't want to receive any more, just ignore any
1114          * subsequent XLogData messages.
1115          */
1116         if (!(still_sending))
1117                 return true;
1118
1119         /*
1120          * Read the header of the XLogData message, enclosed in the CopyData
1121          * message. We only need the WAL location field (dataStart), the rest of
1122          * the header is ignored.
1123          */
1124         hdr_len = 1;                            /* msgtype 'w' */
1125         hdr_len += 8;                           /* dataStart */
1126         hdr_len += 8;                           /* walEnd */
1127         hdr_len += 8;                           /* sendTime */
1128         if (len < hdr_len)
1129         {
1130                 fprintf(stderr, _("%s: streaming header too small: %d\n"),
1131                                 progname, len);
1132                 return false;
1133         }
1134         *blockpos = fe_recvint64(&copybuf[1]);
1135
1136         /* Extract WAL location for this block */
1137         xlogoff = *blockpos % XLOG_SEG_SIZE;
1138
1139         /*
1140          * Verify that the initial location in the stream matches where we think
1141          * we are.
1142          */
1143         if (walfile == -1)
1144         {
1145                 /* No file open yet */
1146                 if (xlogoff != 0)
1147                 {
1148                         fprintf(stderr,
1149                                         _("%s: received transaction log record for offset %u with no file open\n"),
1150                                         progname, xlogoff);
1151                         return false;
1152                 }
1153         }
1154         else
1155         {
1156                 /* More data in existing segment */
1157                 /* XXX: store seek value don't reseek all the time */
1158                 if (lseek(walfile, 0, SEEK_CUR) != xlogoff)
1159                 {
1160                         fprintf(stderr,
1161                                         _("%s: got WAL data offset %08x, expected %08x\n"),
1162                                         progname, xlogoff, (int) lseek(walfile, 0, SEEK_CUR));
1163                         return false;
1164                 }
1165         }
1166
1167         bytes_left = len - hdr_len;
1168         bytes_written = 0;
1169
1170         while (bytes_left)
1171         {
1172                 int                     bytes_to_write;
1173
1174                 /*
1175                  * If crossing a WAL boundary, only write up until we reach
1176                  * XLOG_SEG_SIZE.
1177                  */
1178                 if (xlogoff + bytes_left > XLOG_SEG_SIZE)
1179                         bytes_to_write = XLOG_SEG_SIZE - xlogoff;
1180                 else
1181                         bytes_to_write = bytes_left;
1182
1183                 if (walfile == -1)
1184                 {
1185                         if (!open_walfile(stream, *blockpos))
1186                         {
1187                                 /* Error logged by open_walfile */
1188                                 return false;
1189                         }
1190                 }
1191
1192                 if (write(walfile,
1193                                   copybuf + hdr_len + bytes_written,
1194                                   bytes_to_write) != bytes_to_write)
1195                 {
1196                         fprintf(stderr,
1197                                   _("%s: could not write %u bytes to WAL file \"%s\": %s\n"),
1198                                         progname, bytes_to_write, current_walfile_name,
1199                                         strerror(errno));
1200                         return false;
1201                 }
1202
1203                 /* Write was successful, advance our position */
1204                 bytes_written += bytes_to_write;
1205                 bytes_left -= bytes_to_write;
1206                 *blockpos += bytes_to_write;
1207                 xlogoff += bytes_to_write;
1208
1209                 /* Did we reach the end of a WAL segment? */
1210                 if (*blockpos % XLOG_SEG_SIZE == 0)
1211                 {
1212                         if (!close_walfile(stream, *blockpos))
1213                                 /* Error message written in close_walfile() */
1214                                 return false;
1215
1216                         xlogoff = 0;
1217
1218                         if (still_sending && stream->stream_stop(*blockpos, stream->timeline, true))
1219                         {
1220                                 if (PQputCopyEnd(conn, NULL) <= 0 || PQflush(conn))
1221                                 {
1222                                         fprintf(stderr, _("%s: could not send copy-end packet: %s"),
1223                                                         progname, PQerrorMessage(conn));
1224                                         return false;
1225                                 }
1226                                 still_sending = false;
1227                                 return true;    /* ignore the rest of this XLogData packet */
1228                         }
1229                 }
1230         }
1231         /* No more data left to write, receive next copy packet */
1232
1233         return true;
1234 }
1235
1236 /*
1237  * Handle end of the copy stream.
1238  */
1239 static PGresult *
1240 HandleEndOfCopyStream(PGconn *conn, StreamCtl *stream, char *copybuf,
1241                                           XLogRecPtr blockpos, XLogRecPtr *stoppos)
1242 {
1243         PGresult   *res = PQgetResult(conn);
1244
1245         /*
1246          * The server closed its end of the copy stream.  If we haven't closed
1247          * ours already, we need to do so now, unless the server threw an error,
1248          * in which case we don't.
1249          */
1250         if (still_sending)
1251         {
1252                 if (!close_walfile(stream, blockpos))
1253                 {
1254                         /* Error message written in close_walfile() */
1255                         PQclear(res);
1256                         return NULL;
1257                 }
1258                 if (PQresultStatus(res) == PGRES_COPY_IN)
1259                 {
1260                         if (PQputCopyEnd(conn, NULL) <= 0 || PQflush(conn))
1261                         {
1262                                 fprintf(stderr,
1263                                                 _("%s: could not send copy-end packet: %s"),
1264                                                 progname, PQerrorMessage(conn));
1265                                 PQclear(res);
1266                                 return NULL;
1267                         }
1268                         res = PQgetResult(conn);
1269                 }
1270                 still_sending = false;
1271         }
1272         if (copybuf != NULL)
1273                 PQfreemem(copybuf);
1274         *stoppos = blockpos;
1275         return res;
1276 }
1277
1278 /*
1279  * Check if we should continue streaming, or abort at this point.
1280  */
1281 static bool
1282 CheckCopyStreamStop(PGconn *conn, StreamCtl *stream, XLogRecPtr blockpos,
1283                                         XLogRecPtr *stoppos)
1284 {
1285         if (still_sending && stream->stream_stop(blockpos, stream->timeline, false))
1286         {
1287                 if (!close_walfile(stream, blockpos))
1288                 {
1289                         /* Potential error message is written by close_walfile */
1290                         return false;
1291                 }
1292                 if (PQputCopyEnd(conn, NULL) <= 0 || PQflush(conn))
1293                 {
1294                         fprintf(stderr, _("%s: could not send copy-end packet: %s"),
1295                                         progname, PQerrorMessage(conn));
1296                         return false;
1297                 }
1298                 still_sending = false;
1299         }
1300
1301         return true;
1302 }
1303
1304 /*
1305  * Calculate how long send/receive loops should sleep
1306  */
1307 static long
1308 CalculateCopyStreamSleeptime(int64 now, int standby_message_timeout,
1309                                                          int64 last_status)
1310 {
1311         int64           status_targettime = 0;
1312         long            sleeptime;
1313
1314         if (standby_message_timeout && still_sending)
1315                 status_targettime = last_status +
1316                         (standby_message_timeout - 1) * ((int64) 1000);
1317
1318         if (status_targettime > 0)
1319         {
1320                 long            secs;
1321                 int                     usecs;
1322
1323                 feTimestampDifference(now,
1324                                                           status_targettime,
1325                                                           &secs,
1326                                                           &usecs);
1327                 /* Always sleep at least 1 sec */
1328                 if (secs <= 0)
1329                 {
1330                         secs = 1;
1331                         usecs = 0;
1332                 }
1333
1334                 sleeptime = secs * 1000 + usecs / 1000;
1335         }
1336         else
1337                 sleeptime = -1;
1338
1339         return sleeptime;
1340 }