]> granicus.if.org Git - postgresql/blob - src/bin/pg_basebackup/receivelog.c
3a921ebf2db03f7a19df3ede7854c0d67c618188
[postgresql] / src / bin / pg_basebackup / receivelog.c
1 /*-------------------------------------------------------------------------
2  *
3  * receivelog.c - receive transaction log files using the streaming
4  *                                replication protocol.
5  *
6  * Author: Magnus Hagander <magnus@hagander.net>
7  *
8  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
9  *
10  * IDENTIFICATION
11  *                src/bin/pg_basebackup/receivelog.c
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres_fe.h"
16
17 #include <sys/stat.h>
18 #include <unistd.h>
19 #ifdef HAVE_SYS_SELECT_H
20 #include <sys/select.h>
21 #endif
22
23 /* local includes */
24 #include "receivelog.h"
25 #include "streamutil.h"
26
27 #include "libpq-fe.h"
28 #include "access/xlog_internal.h"
29
30
/*
 * fd and filename for currently open WAL file.
 *
 * walfile is -1 while no segment is open.  current_walfile_name holds the
 * base file name of the segment (without any partial suffix).
 */
static int	walfile = -1;
static char current_walfile_name[MAXPGPATH] = "";

/*
 * When reportFlushPosition is true, feedback messages carry
 * lastFlushPosition as the flush location; otherwise an invalid pointer is
 * sent in its place.
 */
static bool reportFlushPosition = false;
static XLogRecPtr lastFlushPosition = InvalidXLogRecPtr;

static bool still_sending = true;		/* feedback still needs to be sent? */

/* Forward declarations of file-local (static) helpers. */
static PGresult *HandleCopyStream(PGconn *conn, StreamCtl *stream,
				 XLogRecPtr *stoppos);
static int	CopyStreamPoll(PGconn *conn, long timeout_ms);
static int	CopyStreamReceive(PGconn *conn, long timeout, char **buffer);
static bool ProcessKeepaliveMsg(PGconn *conn, char *copybuf, int len,
					XLogRecPtr blockpos, int64 *last_status);
static bool ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len,
				   XLogRecPtr *blockpos);
static PGresult *HandleEndOfCopyStream(PGconn *conn, StreamCtl *stream, char *copybuf,
					  XLogRecPtr blockpos, XLogRecPtr *stoppos);
static bool CheckCopyStreamStop(PGconn *conn, StreamCtl *stream, XLogRecPtr blockpos,
					XLogRecPtr *stoppos);
static long CalculateCopyStreamSleeptime(int64 now, int standby_message_timeout,
							 int64 last_status);

static bool ReadEndOfStreamingResult(PGresult *res, XLogRecPtr *startpos,
						 uint32 *timeline);
56
57 static bool
58 mark_file_as_archived(const char *basedir, const char *fname)
59 {
60         int                     fd;
61         static char tmppath[MAXPGPATH];
62
63         snprintf(tmppath, sizeof(tmppath), "%s/archive_status/%s.done",
64                          basedir, fname);
65
66         fd = open(tmppath, O_WRONLY | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR);
67         if (fd < 0)
68         {
69                 fprintf(stderr, _("%s: could not create archive status file \"%s\": %s\n"),
70                                 progname, tmppath, strerror(errno));
71                 return false;
72         }
73
74         if (fsync(fd) != 0)
75         {
76                 fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
77                                 progname, tmppath, strerror(errno));
78
79                 close(fd);
80
81                 return false;
82         }
83
84         close(fd);
85
86         return true;
87 }
88
89 /*
90  * Open a new WAL file in the specified directory.
91  *
92  * The file will be padded to 16Mb with zeroes. The base filename (without
93  * partial_suffix) is stored in current_walfile_name.
94  */
95 static bool
96 open_walfile(StreamCtl *stream, XLogRecPtr startpoint)
97 {
98         int                     f;
99         char            fn[MAXPGPATH];
100         struct stat statbuf;
101         char       *zerobuf;
102         int                     bytes;
103         XLogSegNo       segno;
104
105         XLByteToSeg(startpoint, segno);
106         XLogFileName(current_walfile_name, stream->timeline, segno);
107
108         snprintf(fn, sizeof(fn), "%s/%s%s", stream->basedir, current_walfile_name,
109                          stream->partial_suffix ? stream->partial_suffix : "");
110         f = open(fn, O_WRONLY | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR);
111         if (f == -1)
112         {
113                 fprintf(stderr,
114                                 _("%s: could not open transaction log file \"%s\": %s\n"),
115                                 progname, fn, strerror(errno));
116                 return false;
117         }
118
119         /*
120          * Verify that the file is either empty (just created), or a complete
121          * XLogSegSize segment. Anything in between indicates a corrupt file.
122          */
123         if (fstat(f, &statbuf) != 0)
124         {
125                 fprintf(stderr,
126                                 _("%s: could not stat transaction log file \"%s\": %s\n"),
127                                 progname, fn, strerror(errno));
128                 close(f);
129                 return false;
130         }
131         if (statbuf.st_size == XLogSegSize)
132         {
133                 /* File is open and ready to use */
134                 walfile = f;
135                 return true;
136         }
137         if (statbuf.st_size != 0)
138         {
139                 fprintf(stderr,
140                                 _("%s: transaction log file \"%s\" has %d bytes, should be 0 or %d\n"),
141                                 progname, fn, (int) statbuf.st_size, XLogSegSize);
142                 close(f);
143                 return false;
144         }
145
146         /* New, empty, file. So pad it to 16Mb with zeroes */
147         zerobuf = pg_malloc0(XLOG_BLCKSZ);
148         for (bytes = 0; bytes < XLogSegSize; bytes += XLOG_BLCKSZ)
149         {
150                 if (write(f, zerobuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
151                 {
152                         fprintf(stderr,
153                                         _("%s: could not pad transaction log file \"%s\": %s\n"),
154                                         progname, fn, strerror(errno));
155                         free(zerobuf);
156                         close(f);
157                         unlink(fn);
158                         return false;
159                 }
160         }
161         free(zerobuf);
162
163         if (lseek(f, SEEK_SET, 0) != 0)
164         {
165                 fprintf(stderr,
166                                 _("%s: could not seek to beginning of transaction log file \"%s\": %s\n"),
167                                 progname, fn, strerror(errno));
168                 close(f);
169                 return false;
170         }
171         walfile = f;
172         return true;
173 }
174
175 /*
176  * Close the current WAL file (if open), and rename it to the correct
177  * filename if it's complete. On failure, prints an error message to stderr
178  * and returns false, otherwise returns true.
179  */
180 static bool
181 close_walfile(StreamCtl *stream, XLogRecPtr pos)
182 {
183         off_t           currpos;
184
185         if (walfile == -1)
186                 return true;
187
188         currpos = lseek(walfile, 0, SEEK_CUR);
189         if (currpos == -1)
190         {
191                 fprintf(stderr,
192                          _("%s: could not determine seek position in file \"%s\": %s\n"),
193                                 progname, current_walfile_name, strerror(errno));
194                 return false;
195         }
196
197         if (fsync(walfile) != 0)
198         {
199                 fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
200                                 progname, current_walfile_name, strerror(errno));
201                 return false;
202         }
203
204         if (close(walfile) != 0)
205         {
206                 fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
207                                 progname, current_walfile_name, strerror(errno));
208                 walfile = -1;
209                 return false;
210         }
211         walfile = -1;
212
213         /*
214          * If we finished writing a .partial file, rename it into place.
215          */
216         if (currpos == XLOG_SEG_SIZE && stream->partial_suffix)
217         {
218                 char            oldfn[MAXPGPATH];
219                 char            newfn[MAXPGPATH];
220
221                 snprintf(oldfn, sizeof(oldfn), "%s/%s%s", stream->basedir, current_walfile_name, stream->partial_suffix);
222                 snprintf(newfn, sizeof(newfn), "%s/%s", stream->basedir, current_walfile_name);
223                 if (rename(oldfn, newfn) != 0)
224                 {
225                         fprintf(stderr, _("%s: could not rename file \"%s\": %s\n"),
226                                         progname, current_walfile_name, strerror(errno));
227                         return false;
228                 }
229         }
230         else if (stream->partial_suffix)
231                 fprintf(stderr,
232                                 _("%s: not renaming \"%s%s\", segment is not complete\n"),
233                                 progname, current_walfile_name, stream->partial_suffix);
234
235         /*
236          * Mark file as archived if requested by the caller - pg_basebackup needs
237          * to do so as files can otherwise get archived again after promotion of a
238          * new node. This is in line with walreceiver.c always doing a
239          * XLogArchiveForceDone() after a complete segment.
240          */
241         if (currpos == XLOG_SEG_SIZE && stream->mark_done)
242         {
243                 /* writes error message if failed */
244                 if (!mark_file_as_archived(stream->basedir, current_walfile_name))
245                         return false;
246         }
247
248         lastFlushPosition = pos;
249         return true;
250 }
251
252
253 /*
254  * Check if a timeline history file exists.
255  */
256 static bool
257 existsTimeLineHistoryFile(StreamCtl *stream)
258 {
259         char            path[MAXPGPATH];
260         char            histfname[MAXFNAMELEN];
261         int                     fd;
262
263         /*
264          * Timeline 1 never has a history file. We treat that as if it existed,
265          * since we never need to stream it.
266          */
267         if (stream->timeline == 1)
268                 return true;
269
270         TLHistoryFileName(histfname, stream->timeline);
271
272         snprintf(path, sizeof(path), "%s/%s", stream->basedir, histfname);
273
274         fd = open(path, O_RDONLY | PG_BINARY, 0);
275         if (fd < 0)
276         {
277                 if (errno != ENOENT)
278                         fprintf(stderr, _("%s: could not open timeline history file \"%s\": %s\n"),
279                                         progname, path, strerror(errno));
280                 return false;
281         }
282         else
283         {
284                 close(fd);
285                 return true;
286         }
287 }
288
289 static bool
290 writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content)
291 {
292         int                     size = strlen(content);
293         char            path[MAXPGPATH];
294         char            tmppath[MAXPGPATH];
295         char            histfname[MAXFNAMELEN];
296         int                     fd;
297
298         /*
299          * Check that the server's idea of how timeline history files should be
300          * named matches ours.
301          */
302         TLHistoryFileName(histfname, stream->timeline);
303         if (strcmp(histfname, filename) != 0)
304         {
305                 fprintf(stderr, _("%s: server reported unexpected history file name for timeline %u: %s\n"),
306                                 progname, stream->timeline, filename);
307                 return false;
308         }
309
310         snprintf(path, sizeof(path), "%s/%s", stream->basedir, histfname);
311
312         /*
313          * Write into a temp file name.
314          */
315         snprintf(tmppath, MAXPGPATH, "%s.tmp", path);
316
317         unlink(tmppath);
318
319         fd = open(tmppath, O_WRONLY | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR);
320         if (fd < 0)
321         {
322                 fprintf(stderr, _("%s: could not create timeline history file \"%s\": %s\n"),
323                                 progname, tmppath, strerror(errno));
324                 return false;
325         }
326
327         errno = 0;
328         if ((int) write(fd, content, size) != size)
329         {
330                 int                     save_errno = errno;
331
332                 /*
333                  * If we fail to make the file, delete it to release disk space
334                  */
335                 close(fd);
336                 unlink(tmppath);
337                 errno = save_errno;
338
339                 fprintf(stderr, _("%s: could not write timeline history file \"%s\": %s\n"),
340                                 progname, tmppath, strerror(errno));
341                 return false;
342         }
343
344         if (fsync(fd) != 0)
345         {
346                 close(fd);
347                 fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
348                                 progname, tmppath, strerror(errno));
349                 return false;
350         }
351
352         if (close(fd) != 0)
353         {
354                 fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
355                                 progname, tmppath, strerror(errno));
356                 return false;
357         }
358
359         /*
360          * Now move the completed history file into place with its final name.
361          */
362         if (rename(tmppath, path) < 0)
363         {
364                 fprintf(stderr, _("%s: could not rename file \"%s\" to \"%s\": %s\n"),
365                                 progname, tmppath, path, strerror(errno));
366                 return false;
367         }
368
369         /* Maintain archive_status, check close_walfile() for details. */
370         if (stream->mark_done)
371         {
372                 /* writes error message if failed */
373                 if (!mark_file_as_archived(stream->basedir, histfname))
374                         return false;
375         }
376
377         return true;
378 }
379
380 /*
381  * Send a Standby Status Update message to server.
382  */
383 static bool
384 sendFeedback(PGconn *conn, XLogRecPtr blockpos, int64 now, bool replyRequested)
385 {
386         char            replybuf[1 + 8 + 8 + 8 + 8 + 1];
387         int                     len = 0;
388
389         replybuf[len] = 'r';
390         len += 1;
391         fe_sendint64(blockpos, &replybuf[len]);         /* write */
392         len += 8;
393         if (reportFlushPosition)
394                 fe_sendint64(lastFlushPosition, &replybuf[len]);                /* flush */
395         else
396                 fe_sendint64(InvalidXLogRecPtr, &replybuf[len]);                /* flush */
397         len += 8;
398         fe_sendint64(InvalidXLogRecPtr, &replybuf[len]);        /* apply */
399         len += 8;
400         fe_sendint64(now, &replybuf[len]);      /* sendTime */
401         len += 8;
402         replybuf[len] = replyRequested ? 1 : 0;         /* replyRequested */
403         len += 1;
404
405         if (PQputCopyData(conn, replybuf, len) <= 0 || PQflush(conn))
406         {
407                 fprintf(stderr, _("%s: could not send feedback packet: %s"),
408                                 progname, PQerrorMessage(conn));
409                 return false;
410         }
411
412         return true;
413 }
414
415 /*
416  * Check that the server version we're connected to is supported by
417  * ReceiveXlogStream().
418  *
419  * If it's not, an error message is printed to stderr, and false is returned.
420  */
421 bool
422 CheckServerVersionForStreaming(PGconn *conn)
423 {
424         int                     minServerMajor,
425                                 maxServerMajor;
426         int                     serverMajor;
427
428         /*
429          * The message format used in streaming replication changed in 9.3, so we
430          * cannot stream from older servers. And we don't support servers newer
431          * than the client; it might work, but we don't know, so err on the safe
432          * side.
433          */
434         minServerMajor = 903;
435         maxServerMajor = PG_VERSION_NUM / 100;
436         serverMajor = PQserverVersion(conn) / 100;
437         if (serverMajor < minServerMajor)
438         {
439                 const char *serverver = PQparameterStatus(conn, "server_version");
440
441                 fprintf(stderr, _("%s: incompatible server version %s; client does not support streaming from server versions older than %s\n"),
442                                 progname,
443                                 serverver ? serverver : "'unknown'",
444                                 "9.3");
445                 return false;
446         }
447         else if (serverMajor > maxServerMajor)
448         {
449                 const char *serverver = PQparameterStatus(conn, "server_version");
450
451                 fprintf(stderr, _("%s: incompatible server version %s; client does not support streaming from server versions newer than %s\n"),
452                                 progname,
453                                 serverver ? serverver : "'unknown'",
454                                 PG_VERSION);
455                 return false;
456         }
457         return true;
458 }
459
460 /*
461  * Receive a log stream starting at the specified position.
462  *
463  * Individual parameters are passed through the StreamCtl structure.
464  *
465  * If sysidentifier is specified, validate that both the system
466  * identifier and the timeline matches the specified ones
467  * (by sending an extra IDENTIFY_SYSTEM command)
468  *
469  * All received segments will be written to the directory
470  * specified by basedir. This will also fetch any missing timeline history
471  * files.
472  *
473  * The stream_stop callback will be called every time data
474  * is received, and whenever a segment is completed. If it returns
475  * true, the streaming will stop and the function
476  * return. As long as it returns false, streaming will continue
477  * indefinitely.
478  *
479  * standby_message_timeout controls how often we send a message
480  * back to the master letting it know our progress, in milliseconds.
481  * This message will only contain the write location, and never
482  * flush or replay.
483  *
484  * If 'partial_suffix' is not NULL, files are initially created with the
485  * given suffix, and the suffix is removed once the file is finished. That
486  * allows you to tell the difference between partial and completed files,
487  * so that you can continue later where you left.
488  *
489  * If 'synchronous' is true, the received WAL is flushed as soon as written,
490  * otherwise only when the WAL file is closed.
491  *
492  * Note: The log position *must* be at a log segment start!
493  */
494 bool
495 ReceiveXlogStream(PGconn *conn, StreamCtl *stream)
496 {
497         char            query[128];
498         char            slotcmd[128];
499         PGresult   *res;
500         XLogRecPtr      stoppos;
501
502         /*
503          * The caller should've checked the server version already, but doesn't do
504          * any harm to check it here too.
505          */
506         if (!CheckServerVersionForStreaming(conn))
507                 return false;
508
509         /*
510          * Decide whether we want to report the flush position. If we report
511          * the flush position, the primary will know what WAL we'll
512          * possibly re-request, and it can then remove older WAL safely.
513          * We must always do that when we are using slots.
514          *
515          * Reporting the flush position makes one eligible as a synchronous
516          * replica. People shouldn't include generic names in
517          * synchronous_standby_names, but we've protected them against it so
518          * far, so let's continue to do so unless specifically requested.
519          */
520         if (replication_slot != NULL)
521         {
522                 reportFlushPosition = true;
523                 sprintf(slotcmd, "SLOT \"%s\" ", replication_slot);
524         }
525         else
526         {
527                 if (stream->synchronous)
528                         reportFlushPosition = true;
529                 else
530                         reportFlushPosition = false;
531                 slotcmd[0] = 0;
532         }
533
534         if (stream->sysidentifier != NULL)
535         {
536                 /* Validate system identifier hasn't changed */
537                 res = PQexec(conn, "IDENTIFY_SYSTEM");
538                 if (PQresultStatus(res) != PGRES_TUPLES_OK)
539                 {
540                         fprintf(stderr,
541                                         _("%s: could not send replication command \"%s\": %s"),
542                                         progname, "IDENTIFY_SYSTEM", PQerrorMessage(conn));
543                         PQclear(res);
544                         return false;
545                 }
546                 if (PQntuples(res) != 1 || PQnfields(res) < 3)
547                 {
548                         fprintf(stderr,
549                                         _("%s: could not identify system: got %d rows and %d fields, expected %d rows and %d or more fields\n"),
550                                         progname, PQntuples(res), PQnfields(res), 1, 3);
551                         PQclear(res);
552                         return false;
553                 }
554                 if (strcmp(stream->sysidentifier, PQgetvalue(res, 0, 0)) != 0)
555                 {
556                         fprintf(stderr,
557                                         _("%s: system identifier does not match between base backup and streaming connection\n"),
558                                         progname);
559                         PQclear(res);
560                         return false;
561                 }
562                 if (stream->timeline > atoi(PQgetvalue(res, 0, 1)))
563                 {
564                         fprintf(stderr,
565                                 _("%s: starting timeline %u is not present in the server\n"),
566                                         progname, stream->timeline);
567                         PQclear(res);
568                         return false;
569                 }
570                 PQclear(res);
571         }
572
573         /*
574          * initialize flush position to starting point, it's the caller's
575          * responsibility that that's sane.
576          */
577         lastFlushPosition = stream->startpos;
578
579         while (1)
580         {
581                 /*
582                  * Fetch the timeline history file for this timeline, if we don't have
583                  * it already.
584                  */
585                 if (!existsTimeLineHistoryFile(stream))
586                 {
587                         snprintf(query, sizeof(query), "TIMELINE_HISTORY %u", stream->timeline);
588                         res = PQexec(conn, query);
589                         if (PQresultStatus(res) != PGRES_TUPLES_OK)
590                         {
591                                 /* FIXME: we might send it ok, but get an error */
592                                 fprintf(stderr, _("%s: could not send replication command \"%s\": %s"),
593                                         progname, "TIMELINE_HISTORY", PQresultErrorMessage(res));
594                                 PQclear(res);
595                                 return false;
596                         }
597
598                         /*
599                          * The response to TIMELINE_HISTORY is a single row result set
600                          * with two fields: filename and content
601                          */
602                         if (PQnfields(res) != 2 || PQntuples(res) != 1)
603                         {
604                                 fprintf(stderr,
605                                                 _("%s: unexpected response to TIMELINE_HISTORY command: got %d rows and %d fields, expected %d rows and %d fields\n"),
606                                                 progname, PQntuples(res), PQnfields(res), 1, 2);
607                         }
608
609                         /* Write the history file to disk */
610                         writeTimeLineHistoryFile(stream,
611                                                                          PQgetvalue(res, 0, 0),
612                                                                          PQgetvalue(res, 0, 1));
613
614                         PQclear(res);
615                 }
616
617                 /*
618                  * Before we start streaming from the requested location, check if the
619                  * callback tells us to stop here.
620                  */
621                 if (stream->stream_stop(stream->startpos, stream->timeline, false))
622                         return true;
623
624                 /* Initiate the replication stream at specified location */
625                 snprintf(query, sizeof(query), "START_REPLICATION %s%X/%X TIMELINE %u",
626                                  slotcmd,
627                                  (uint32) (stream->startpos >> 32), (uint32) stream->startpos,
628                                  stream->timeline);
629                 res = PQexec(conn, query);
630                 if (PQresultStatus(res) != PGRES_COPY_BOTH)
631                 {
632                         fprintf(stderr, _("%s: could not send replication command \"%s\": %s"),
633                                         progname, "START_REPLICATION", PQresultErrorMessage(res));
634                         PQclear(res);
635                         return false;
636                 }
637                 PQclear(res);
638
639                 /* Stream the WAL */
640                 res = HandleCopyStream(conn, stream, &stoppos);
641                 if (res == NULL)
642                         goto error;
643
644                 /*
645                  * Streaming finished.
646                  *
647                  * There are two possible reasons for that: a controlled shutdown, or
648                  * we reached the end of the current timeline. In case of
649                  * end-of-timeline, the server sends a result set after Copy has
650                  * finished, containing information about the next timeline. Read
651                  * that, and restart streaming from the next timeline. In case of
652                  * controlled shutdown, stop here.
653                  */
654                 if (PQresultStatus(res) == PGRES_TUPLES_OK)
655                 {
656                         /*
657                          * End-of-timeline. Read the next timeline's ID and starting
658                          * position. Usually, the starting position will match the end of
659                          * the previous timeline, but there are corner cases like if the
660                          * server had sent us half of a WAL record, when it was promoted.
661                          * The new timeline will begin at the end of the last complete
662                          * record in that case, overlapping the partial WAL record on the
663                          * the old timeline.
664                          */
665                         uint32          newtimeline;
666                         bool            parsed;
667
668                         parsed = ReadEndOfStreamingResult(res, &stream->startpos, &newtimeline);
669                         PQclear(res);
670                         if (!parsed)
671                                 goto error;
672
673                         /* Sanity check the values the server gave us */
674                         if (newtimeline <= stream->timeline)
675                         {
676                                 fprintf(stderr,
677                                                 _("%s: server reported unexpected next timeline %u, following timeline %u\n"),
678                                                 progname, newtimeline, stream->timeline);
679                                 goto error;
680                         }
681                         if (stream->startpos > stoppos)
682                         {
683                                 fprintf(stderr,
684                                                 _("%s: server stopped streaming timeline %u at %X/%X, but reported next timeline %u to begin at %X/%X\n"),
685                                                 progname,
686                                 stream->timeline, (uint32) (stoppos >> 32), (uint32) stoppos,
687                                                 newtimeline, (uint32) (stream->startpos >> 32), (uint32) stream->startpos);
688                                 goto error;
689                         }
690
691                         /* Read the final result, which should be CommandComplete. */
692                         res = PQgetResult(conn);
693                         if (PQresultStatus(res) != PGRES_COMMAND_OK)
694                         {
695                                 fprintf(stderr,
696                                    _("%s: unexpected termination of replication stream: %s"),
697                                                 progname, PQresultErrorMessage(res));
698                                 PQclear(res);
699                                 goto error;
700                         }
701                         PQclear(res);
702
703                         /*
704                          * Loop back to start streaming from the new timeline. Always
705                          * start streaming at the beginning of a segment.
706                          */
707                         stream->timeline = newtimeline;
708                         stream->startpos = stream->startpos - (stream->startpos % XLOG_SEG_SIZE);
709                         continue;
710                 }
711                 else if (PQresultStatus(res) == PGRES_COMMAND_OK)
712                 {
713                         PQclear(res);
714
715                         /*
716                          * End of replication (ie. controlled shut down of the server).
717                          *
718                          * Check if the callback thinks it's OK to stop here. If not,
719                          * complain.
720                          */
721                         if (stream->stream_stop(stoppos, stream->timeline, false))
722                                 return true;
723                         else
724                         {
725                                 fprintf(stderr, _("%s: replication stream was terminated before stop point\n"),
726                                                 progname);
727                                 goto error;
728                         }
729                 }
730                 else
731                 {
732                         /* Server returned an error. */
733                         fprintf(stderr,
734                                         _("%s: unexpected termination of replication stream: %s"),
735                                         progname, PQresultErrorMessage(res));
736                         PQclear(res);
737                         goto error;
738                 }
739         }
740
741 error:
742         if (walfile != -1 && close(walfile) != 0)
743                 fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
744                                 progname, current_walfile_name, strerror(errno));
745         walfile = -1;
746         return false;
747 }
748
749 /*
750  * Helper function to parse the result set returned by server after streaming
751  * has finished. On failure, prints an error to stderr and returns false.
752  */
753 static bool
754 ReadEndOfStreamingResult(PGresult *res, XLogRecPtr *startpos, uint32 *timeline)
755 {
756         uint32          startpos_xlogid,
757                                 startpos_xrecoff;
758
759         /*----------
760          * The result set consists of one row and two columns, e.g:
761          *
762          *      next_tli | next_tli_startpos
763          * ----------+-------------------
764          *                 4 | 0/9949AE0
765          *
766          * next_tli is the timeline ID of the next timeline after the one that
767          * just finished streaming. next_tli_startpos is the XLOG position where
768          * the server switched to it.
769          *----------
770          */
771         if (PQnfields(res) < 2 || PQntuples(res) != 1)
772         {
773                 fprintf(stderr,
774                                 _("%s: unexpected result set after end-of-timeline: got %d rows and %d fields, expected %d rows and %d fields\n"),
775                                 progname, PQntuples(res), PQnfields(res), 1, 2);
776                 return false;
777         }
778
779         *timeline = atoi(PQgetvalue(res, 0, 0));
780         if (sscanf(PQgetvalue(res, 0, 1), "%X/%X", &startpos_xlogid,
781                            &startpos_xrecoff) != 2)
782         {
783                 fprintf(stderr,
784                         _("%s: could not parse next timeline's starting point \"%s\"\n"),
785                                 progname, PQgetvalue(res, 0, 1));
786                 return false;
787         }
788         *startpos = ((uint64) startpos_xlogid << 32) | startpos_xrecoff;
789
790         return true;
791 }
792
793 /*
794  * The main loop of ReceiveXlogStream. Handles the COPY stream after
795  * initiating streaming with the START_STREAMING command.
796  *
797  * If the COPY ends (not necessarily successfully) due a message from the
798  * server, returns a PGresult and sets *stoppos to the last byte written.
799  * On any other sort of error, returns NULL.
800  */
801 static PGresult *
802 HandleCopyStream(PGconn *conn, StreamCtl *stream,
803                                  XLogRecPtr *stoppos)
804 {
805         char       *copybuf = NULL;
806         int64           last_status = -1;
807         XLogRecPtr      blockpos = stream->startpos;
808
809         still_sending = true;
810
811         while (1)
812         {
813                 int                     r;
814                 int64           now;
815                 long            sleeptime;
816
817                 /*
818                  * Check if we should continue streaming, or abort at this point.
819                  */
820                 if (!CheckCopyStreamStop(conn, stream, blockpos, stoppos))
821                         goto error;
822
823                 now = feGetCurrentTimestamp();
824
825                 /*
826                  * If synchronous option is true, issue sync command as soon as there
827                  * are WAL data which has not been flushed yet.
828                  */
829                 if (stream->synchronous && lastFlushPosition < blockpos && walfile != -1)
830                 {
831                         if (fsync(walfile) != 0)
832                         {
833                                 fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
834                                                 progname, current_walfile_name, strerror(errno));
835                                 goto error;
836                         }
837                         lastFlushPosition = blockpos;
838
839                         /*
840                          * Send feedback so that the server sees the latest WAL locations
841                          * immediately.
842                          */
843                         if (!sendFeedback(conn, blockpos, now, false))
844                                 goto error;
845                         last_status = now;
846                 }
847
848                 /*
849                  * Potentially send a status message to the master
850                  */
851                 if (still_sending && stream->standby_message_timeout > 0 &&
852                         feTimestampDifferenceExceeds(last_status, now,
853                                                                                  stream->standby_message_timeout))
854                 {
855                         /* Time to send feedback! */
856                         if (!sendFeedback(conn, blockpos, now, false))
857                                 goto error;
858                         last_status = now;
859                 }
860
861                 /*
862                  * Calculate how long send/receive loops should sleep
863                  */
864                 sleeptime = CalculateCopyStreamSleeptime(now, stream->standby_message_timeout,
865                                                                                                  last_status);
866
867                 r = CopyStreamReceive(conn, sleeptime, &copybuf);
868                 while (r != 0)
869                 {
870                         if (r == -1)
871                                 goto error;
872                         if (r == -2)
873                         {
874                                 PGresult   *res = HandleEndOfCopyStream(conn, stream, copybuf, blockpos, stoppos);
875
876                                 if (res == NULL)
877                                         goto error;
878                                 else
879                                         return res;
880                         }
881
882                         /* Check the message type. */
883                         if (copybuf[0] == 'k')
884                         {
885                                 if (!ProcessKeepaliveMsg(conn, copybuf, r, blockpos,
886                                                                                  &last_status))
887                                         goto error;
888                         }
889                         else if (copybuf[0] == 'w')
890                         {
891                                 if (!ProcessXLogDataMsg(conn, stream, copybuf, r, &blockpos))
892                                         goto error;
893
894                                 /*
895                                  * Check if we should continue streaming, or abort at this
896                                  * point.
897                                  */
898                                 if (!CheckCopyStreamStop(conn, stream, blockpos, stoppos))
899                                         goto error;
900                         }
901                         else
902                         {
903                                 fprintf(stderr, _("%s: unrecognized streaming header: \"%c\"\n"),
904                                                 progname, copybuf[0]);
905                                 goto error;
906                         }
907
908                         /*
909                          * Process the received data, and any subsequent data we can read
910                          * without blocking.
911                          */
912                         r = CopyStreamReceive(conn, 0, &copybuf);
913                 }
914         }
915
916 error:
917         if (copybuf != NULL)
918                 PQfreemem(copybuf);
919         return NULL;
920 }
921
922 /*
923  * Wait until we can read CopyData message, or timeout.
924  *
925  * Returns 1 if data has become available for reading, 0 if timed out
926  * or interrupted by signal, and -1 on an error.
927  */
928 static int
929 CopyStreamPoll(PGconn *conn, long timeout_ms)
930 {
931         int                     ret;
932         fd_set          input_mask;
933         struct timeval timeout;
934         struct timeval *timeoutptr;
935
936         if (PQsocket(conn) < 0)
937         {
938                 fprintf(stderr, _("%s: invalid socket: %s"), progname,
939                                 PQerrorMessage(conn));
940                 return -1;
941         }
942
943         FD_ZERO(&input_mask);
944         FD_SET(PQsocket(conn), &input_mask);
945
946         if (timeout_ms < 0)
947                 timeoutptr = NULL;
948         else
949         {
950                 timeout.tv_sec = timeout_ms / 1000L;
951                 timeout.tv_usec = (timeout_ms % 1000L) * 1000L;
952                 timeoutptr = &timeout;
953         }
954
955         ret = select(PQsocket(conn) + 1, &input_mask, NULL, NULL, timeoutptr);
956         if (ret == 0 || (ret < 0 && errno == EINTR))
957                 return 0;                               /* Got a timeout or signal */
958         else if (ret < 0)
959         {
960                 fprintf(stderr, _("%s: select() failed: %s\n"),
961                                 progname, strerror(errno));
962                 return -1;
963         }
964
965         return 1;
966 }
967
968 /*
969  * Receive CopyData message available from XLOG stream, blocking for
970  * maximum of 'timeout' ms.
971  *
972  * If data was received, returns the length of the data. *buffer is set to
973  * point to a buffer holding the received message. The buffer is only valid
974  * until the next CopyStreamReceive call.
975  *
976  * 0 if no data was available within timeout, or wait was interrupted
977  * by signal. -1 on error. -2 if the server ended the COPY.
978  */
979 static int
980 CopyStreamReceive(PGconn *conn, long timeout, char **buffer)
981 {
982         char       *copybuf = NULL;
983         int                     rawlen;
984
985         if (*buffer != NULL)
986                 PQfreemem(*buffer);
987         *buffer = NULL;
988
989         /* Try to receive a CopyData message */
990         rawlen = PQgetCopyData(conn, &copybuf, 1);
991         if (rawlen == 0)
992         {
993                 /*
994                  * No data available. Wait for some to appear, but not longer than the
995                  * specified timeout, so that we can ping the server.
996                  */
997                 if (timeout != 0)
998                 {
999                         int                     ret;
1000
1001                         ret = CopyStreamPoll(conn, timeout);
1002                         if (ret <= 0)
1003                                 return ret;
1004                 }
1005
1006                 /* Else there is actually data on the socket */
1007                 if (PQconsumeInput(conn) == 0)
1008                 {
1009                         fprintf(stderr,
1010                                         _("%s: could not receive data from WAL stream: %s"),
1011                                         progname, PQerrorMessage(conn));
1012                         return -1;
1013                 }
1014
1015                 /* Now that we've consumed some input, try again */
1016                 rawlen = PQgetCopyData(conn, &copybuf, 1);
1017                 if (rawlen == 0)
1018                         return 0;
1019         }
1020         if (rawlen == -1)                       /* end-of-streaming or error */
1021                 return -2;
1022         if (rawlen == -2)
1023         {
1024                 fprintf(stderr, _("%s: could not read COPY data: %s"),
1025                                 progname, PQerrorMessage(conn));
1026                 return -1;
1027         }
1028
1029         /* Return received messages to caller */
1030         *buffer = copybuf;
1031         return rawlen;
1032 }
1033
1034 /*
1035  * Process the keepalive message.
1036  */
1037 static bool
1038 ProcessKeepaliveMsg(PGconn *conn, char *copybuf, int len,
1039                                         XLogRecPtr blockpos, int64 *last_status)
1040 {
1041         int                     pos;
1042         bool            replyRequested;
1043         int64           now;
1044
1045         /*
1046          * Parse the keepalive message, enclosed in the CopyData message. We just
1047          * check if the server requested a reply, and ignore the rest.
1048          */
1049         pos = 1;                                        /* skip msgtype 'k' */
1050         pos += 8;                                       /* skip walEnd */
1051         pos += 8;                                       /* skip sendTime */
1052
1053         if (len < pos + 1)
1054         {
1055                 fprintf(stderr, _("%s: streaming header too small: %d\n"),
1056                                 progname, len);
1057                 return false;
1058         }
1059         replyRequested = copybuf[pos];
1060
1061         /* If the server requested an immediate reply, send one. */
1062         if (replyRequested && still_sending)
1063         {
1064                 if (reportFlushPosition && lastFlushPosition < blockpos &&
1065                         walfile != -1)
1066                 {
1067                         /*
1068                          * If a valid flush location needs to be reported, flush the
1069                          * current WAL file so that the latest flush location is sent back
1070                          * to the server. This is necessary to see whether the last WAL
1071                          * data has been successfully replicated or not, at the normal
1072                          * shutdown of the server.
1073                          */
1074                         if (fsync(walfile) != 0)
1075                         {
1076                                 fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
1077                                                 progname, current_walfile_name, strerror(errno));
1078                                 return false;
1079                         }
1080                         lastFlushPosition = blockpos;
1081                 }
1082
1083                 now = feGetCurrentTimestamp();
1084                 if (!sendFeedback(conn, blockpos, now, false))
1085                         return false;
1086                 *last_status = now;
1087         }
1088
1089         return true;
1090 }
1091
1092 /*
1093  * Process XLogData message.
1094  */
1095 static bool
1096 ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len,
1097                                    XLogRecPtr *blockpos)
1098 {
1099         int                     xlogoff;
1100         int                     bytes_left;
1101         int                     bytes_written;
1102         int                     hdr_len;
1103
1104         /*
1105          * Once we've decided we don't want to receive any more, just ignore any
1106          * subsequent XLogData messages.
1107          */
1108         if (!(still_sending))
1109                 return true;
1110
1111         /*
1112          * Read the header of the XLogData message, enclosed in the CopyData
1113          * message. We only need the WAL location field (dataStart), the rest of
1114          * the header is ignored.
1115          */
1116         hdr_len = 1;                            /* msgtype 'w' */
1117         hdr_len += 8;                           /* dataStart */
1118         hdr_len += 8;                           /* walEnd */
1119         hdr_len += 8;                           /* sendTime */
1120         if (len < hdr_len)
1121         {
1122                 fprintf(stderr, _("%s: streaming header too small: %d\n"),
1123                                 progname, len);
1124                 return false;
1125         }
1126         *blockpos = fe_recvint64(&copybuf[1]);
1127
1128         /* Extract WAL location for this block */
1129         xlogoff = *blockpos % XLOG_SEG_SIZE;
1130
1131         /*
1132          * Verify that the initial location in the stream matches where we think
1133          * we are.
1134          */
1135         if (walfile == -1)
1136         {
1137                 /* No file open yet */
1138                 if (xlogoff != 0)
1139                 {
1140                         fprintf(stderr,
1141                                         _("%s: received transaction log record for offset %u with no file open\n"),
1142                                         progname, xlogoff);
1143                         return false;
1144                 }
1145         }
1146         else
1147         {
1148                 /* More data in existing segment */
1149                 /* XXX: store seek value don't reseek all the time */
1150                 if (lseek(walfile, 0, SEEK_CUR) != xlogoff)
1151                 {
1152                         fprintf(stderr,
1153                                         _("%s: got WAL data offset %08x, expected %08x\n"),
1154                                         progname, xlogoff, (int) lseek(walfile, 0, SEEK_CUR));
1155                         return false;
1156                 }
1157         }
1158
1159         bytes_left = len - hdr_len;
1160         bytes_written = 0;
1161
1162         while (bytes_left)
1163         {
1164                 int                     bytes_to_write;
1165
1166                 /*
1167                  * If crossing a WAL boundary, only write up until we reach
1168                  * XLOG_SEG_SIZE.
1169                  */
1170                 if (xlogoff + bytes_left > XLOG_SEG_SIZE)
1171                         bytes_to_write = XLOG_SEG_SIZE - xlogoff;
1172                 else
1173                         bytes_to_write = bytes_left;
1174
1175                 if (walfile == -1)
1176                 {
1177                         if (!open_walfile(stream, *blockpos))
1178                         {
1179                                 /* Error logged by open_walfile */
1180                                 return false;
1181                         }
1182                 }
1183
1184                 if (write(walfile,
1185                                   copybuf + hdr_len + bytes_written,
1186                                   bytes_to_write) != bytes_to_write)
1187                 {
1188                         fprintf(stderr,
1189                                   _("%s: could not write %u bytes to WAL file \"%s\": %s\n"),
1190                                         progname, bytes_to_write, current_walfile_name,
1191                                         strerror(errno));
1192                         return false;
1193                 }
1194
1195                 /* Write was successful, advance our position */
1196                 bytes_written += bytes_to_write;
1197                 bytes_left -= bytes_to_write;
1198                 *blockpos += bytes_to_write;
1199                 xlogoff += bytes_to_write;
1200
1201                 /* Did we reach the end of a WAL segment? */
1202                 if (*blockpos % XLOG_SEG_SIZE == 0)
1203                 {
1204                         if (!close_walfile(stream, *blockpos))
1205                                 /* Error message written in close_walfile() */
1206                                 return false;
1207
1208                         xlogoff = 0;
1209
1210                         if (still_sending && stream->stream_stop(*blockpos, stream->timeline, true))
1211                         {
1212                                 if (PQputCopyEnd(conn, NULL) <= 0 || PQflush(conn))
1213                                 {
1214                                         fprintf(stderr, _("%s: could not send copy-end packet: %s"),
1215                                                         progname, PQerrorMessage(conn));
1216                                         return false;
1217                                 }
1218                                 still_sending = false;
1219                                 return true;    /* ignore the rest of this XLogData packet */
1220                         }
1221                 }
1222         }
1223         /* No more data left to write, receive next copy packet */
1224
1225         return true;
1226 }
1227
1228 /*
1229  * Handle end of the copy stream.
1230  */
1231 static PGresult *
1232 HandleEndOfCopyStream(PGconn *conn, StreamCtl *stream, char *copybuf,
1233                                           XLogRecPtr blockpos, XLogRecPtr *stoppos)
1234 {
1235         PGresult   *res = PQgetResult(conn);
1236
1237         /*
1238          * The server closed its end of the copy stream.  If we haven't closed
1239          * ours already, we need to do so now, unless the server threw an error,
1240          * in which case we don't.
1241          */
1242         if (still_sending)
1243         {
1244                 if (!close_walfile(stream, blockpos))
1245                 {
1246                         /* Error message written in close_walfile() */
1247                         PQclear(res);
1248                         return NULL;
1249                 }
1250                 if (PQresultStatus(res) == PGRES_COPY_IN)
1251                 {
1252                         if (PQputCopyEnd(conn, NULL) <= 0 || PQflush(conn))
1253                         {
1254                                 fprintf(stderr,
1255                                                 _("%s: could not send copy-end packet: %s"),
1256                                                 progname, PQerrorMessage(conn));
1257                                 PQclear(res);
1258                                 return NULL;
1259                         }
1260                         res = PQgetResult(conn);
1261                 }
1262                 still_sending = false;
1263         }
1264         if (copybuf != NULL)
1265                 PQfreemem(copybuf);
1266         *stoppos = blockpos;
1267         return res;
1268 }
1269
1270 /*
1271  * Check if we should continue streaming, or abort at this point.
1272  */
1273 static bool
1274 CheckCopyStreamStop(PGconn *conn, StreamCtl *stream, XLogRecPtr blockpos,
1275                                         XLogRecPtr *stoppos)
1276 {
1277         if (still_sending && stream->stream_stop(blockpos, stream->timeline, false))
1278         {
1279                 if (!close_walfile(stream, blockpos))
1280                 {
1281                         /* Potential error message is written by close_walfile */
1282                         return false;
1283                 }
1284                 if (PQputCopyEnd(conn, NULL) <= 0 || PQflush(conn))
1285                 {
1286                         fprintf(stderr, _("%s: could not send copy-end packet: %s"),
1287                                         progname, PQerrorMessage(conn));
1288                         return false;
1289                 }
1290                 still_sending = false;
1291         }
1292
1293         return true;
1294 }
1295
1296 /*
1297  * Calculate how long send/receive loops should sleep
1298  */
1299 static long
1300 CalculateCopyStreamSleeptime(int64 now, int standby_message_timeout,
1301                                                          int64 last_status)
1302 {
1303         int64           status_targettime = 0;
1304         long            sleeptime;
1305
1306         if (standby_message_timeout && still_sending)
1307                 status_targettime = last_status +
1308                         (standby_message_timeout - 1) * ((int64) 1000);
1309
1310         if (status_targettime > 0)
1311         {
1312                 long            secs;
1313                 int                     usecs;
1314
1315                 feTimestampDifference(now,
1316                                                           status_targettime,
1317                                                           &secs,
1318                                                           &usecs);
1319                 /* Always sleep at least 1 sec */
1320                 if (secs <= 0)
1321                 {
1322                         secs = 1;
1323                         usecs = 0;
1324                 }
1325
1326                 sleeptime = secs * 1000 + usecs / 1000;
1327         }
1328         else
1329                 sleeptime = -1;
1330
1331         return sleeptime;
1332 }