]> granicus.if.org Git - postgresql/blob - src/backend/replication/basebackup.c
Fix inconsistencies in the code
[postgresql] / src / backend / replication / basebackup.c
1 /*-------------------------------------------------------------------------
2  *
3  * basebackup.c
4  *        code for taking a base backup and streaming it to a standby
5  *
6  * Portions Copyright (c) 2010-2019, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *        src/backend/replication/basebackup.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include <sys/stat.h>
16 #include <unistd.h>
17 #include <time.h>
18
19 #include "access/xlog_internal.h"       /* for pg_start/stop_backup */
20 #include "catalog/pg_type.h"
21 #include "common/file_perm.h"
22 #include "lib/stringinfo.h"
23 #include "libpq/libpq.h"
24 #include "libpq/pqformat.h"
25 #include "miscadmin.h"
26 #include "nodes/pg_list.h"
27 #include "pgtar.h"
28 #include "pgstat.h"
29 #include "port.h"
30 #include "postmaster/syslogger.h"
31 #include "replication/basebackup.h"
32 #include "replication/walsender.h"
33 #include "replication/walsender_private.h"
34 #include "storage/bufpage.h"
35 #include "storage/checksum.h"
36 #include "storage/dsm_impl.h"
37 #include "storage/fd.h"
38 #include "storage/ipc.h"
39 #include "storage/reinit.h"
40 #include "utils/builtins.h"
41 #include "utils/ps_status.h"
42 #include "utils/relcache.h"
43 #include "utils/timestamp.h"
44
45
46 typedef struct
47 {
48         const char *label;
49         bool            progress;
50         bool            fastcheckpoint;
51         bool            nowait;
52         bool            includewal;
53         uint32          maxrate;
54         bool            sendtblspcmapfile;
55 } basebackup_options;
56
57
58 static int64 sendDir(const char *path, int basepathlen, bool sizeonly,
59                                          List *tablespaces, bool sendtblspclinks);
60 static bool sendFile(const char *readfilename, const char *tarfilename,
61                                          struct stat *statbuf, bool missing_ok, Oid dboid);
62 static void sendFileWithContent(const char *filename, const char *content);
63 static int64 _tarWriteHeader(const char *filename, const char *linktarget,
64                                                          struct stat *statbuf, bool sizeonly);
65 static int64 _tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf,
66                                                   bool sizeonly);
67 static void send_int8_string(StringInfoData *buf, int64 intval);
68 static void SendBackupHeader(List *tablespaces);
69 static void base_backup_cleanup(int code, Datum arg);
70 static void perform_base_backup(basebackup_options *opt);
71 static void parse_basebackup_options(List *options, basebackup_options *opt);
72 static void SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli);
73 static int      compareWalFileNames(const void *a, const void *b);
74 static void throttle(size_t increment);
75 static bool is_checksummed_file(const char *fullpath, const char *filename);
76
77 /* Was the backup currently in-progress initiated in recovery mode? */
78 static bool backup_started_in_recovery = false;
79
80 /* Relative path of temporary statistics directory */
81 static char *statrelpath = NULL;
82
83 /*
84  * Size of each block sent into the tar stream for larger files.
85  */
86 #define TAR_SEND_SIZE 32768
87
88 /*
89  * How frequently to throttle, as a fraction of the specified rate-second.
90  */
91 #define THROTTLING_FREQUENCY    8
92
93 /* The actual number of bytes, transfer of which may cause sleep. */
94 static uint64 throttling_sample;
95
96 /* Amount of data already transferred but not yet throttled.  */
97 static int64 throttling_counter;
98
99 /* The minimum time required to transfer throttling_sample bytes. */
100 static TimeOffset elapsed_min_unit;
101
102 /* The last check of the transfer rate. */
103 static TimestampTz throttled_last;
104
105 /* The starting XLOG position of the base backup. */
106 static XLogRecPtr startptr;
107
108 /* Total number of checksum failures during base backup. */
109 static long long int total_checksum_failures;
110
111 /* Do not verify checksums. */
112 static bool noverify_checksums = false;
113
114 /*
115  * The contents of these directories are removed or recreated during server
116  * start so they are not included in backups.  The directories themselves are
117  * kept and included as empty to preserve access permissions.
118  *
119  * Note: this list should be kept in sync with the filter lists in pg_rewind's
120  * filemap.c.
121  */
122 static const char *excludeDirContents[] =
123 {
124         /*
125          * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped even
126          * when stats_temp_directory is set because PGSS_TEXT_FILE is always
127          * created there.
128          */
129         PG_STAT_TMP_DIR,
130
131         /*
132          * It is generally not useful to backup the contents of this directory
133          * even if the intention is to restore to another master. See backup.sgml
134          * for a more detailed description.
135          */
136         "pg_replslot",
137
138         /* Contents removed on startup, see dsm_cleanup_for_mmap(). */
139         PG_DYNSHMEM_DIR,
140
141         /* Contents removed on startup, see AsyncShmemInit(). */
142         "pg_notify",
143
144         /*
145          * Old contents are loaded for possible debugging but are not required for
146          * normal operation, see OldSerXidInit().
147          */
148         "pg_serial",
149
150         /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
151         "pg_snapshots",
152
153         /* Contents zeroed on startup, see StartupSUBTRANS(). */
154         "pg_subtrans",
155
156         /* end of list */
157         NULL
158 };
159
160 /*
161  * List of files excluded from backups.
162  */
163 static const char *excludeFiles[] =
164 {
165         /* Skip auto conf temporary file. */
166         PG_AUTOCONF_FILENAME ".tmp",
167
168         /* Skip current log file temporary file */
169         LOG_METAINFO_DATAFILE_TMP,
170
171         /* Skip relation cache because it is rebuilt on startup */
172         RELCACHE_INIT_FILENAME,
173
174         /*
175          * If there's a backup_label or tablespace_map file, it belongs to a
176          * backup started by the user with pg_start_backup().  It is *not* correct
177          * for this backup.  Our backup_label/tablespace_map is injected into the
178          * tar separately.
179          */
180         BACKUP_LABEL_FILE,
181         TABLESPACE_MAP,
182
183         "postmaster.pid",
184         "postmaster.opts",
185
186         /* end of list */
187         NULL
188 };
189
/*
 * Names of files for which checksum validation is skipped.  The table is
 * NULL-terminated.
 *
 * Note: this list should be kept in sync with what pg_checksums.c
 * includes.
 */
static const char *const noChecksumFiles[] =
{
	"pg_control",
	"pg_filenode.map",
	"pg_internal.init",
	"PG_VERSION",

#ifdef EXEC_BACKEND
	/* These two entries exist only in EXEC_BACKEND builds. */
	"config_exec_params",
	"config_exec_params.new",
#endif

	/* end of list */
	NULL,
};
207
208
209 /*
210  * Called when ERROR or FATAL happens in perform_base_backup() after
211  * we have started the backup - make sure we end it!
212  */
213 static void
214 base_backup_cleanup(int code, Datum arg)
215 {
216         do_pg_abort_backup();
217 }
218
219 /*
220  * Actually do a base backup for the specified tablespaces.
221  *
222  * This is split out mainly to avoid complaints about "variable might be
223  * clobbered by longjmp" from stupider versions of gcc.
224  */
225 static void
226 perform_base_backup(basebackup_options *opt)
227 {
228         TimeLineID      starttli;
229         XLogRecPtr      endptr;
230         TimeLineID      endtli;
231         StringInfo      labelfile;
232         StringInfo      tblspc_map_file = NULL;
233         int                     datadirpathlen;
234         List       *tablespaces = NIL;
235
236         datadirpathlen = strlen(DataDir);
237
238         backup_started_in_recovery = RecoveryInProgress();
239
240         labelfile = makeStringInfo();
241         tblspc_map_file = makeStringInfo();
242
243         total_checksum_failures = 0;
244
245         startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
246                                                                   labelfile, &tablespaces,
247                                                                   tblspc_map_file,
248                                                                   opt->progress, opt->sendtblspcmapfile);
249
250         /*
251          * Once do_pg_start_backup has been called, ensure that any failure causes
252          * us to abort the backup so we don't "leak" a backup counter. For this
253          * reason, *all* functionality between do_pg_start_backup() and the end of
254          * do_pg_stop_backup() should be inside the error cleanup block!
255          */
256
257         PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
258         {
259                 ListCell   *lc;
260                 tablespaceinfo *ti;
261
262                 SendXlogRecPtrResult(startptr, starttli);
263
264                 /*
265                  * Calculate the relative path of temporary statistics directory in
266                  * order to skip the files which are located in that directory later.
267                  */
268                 if (is_absolute_path(pgstat_stat_directory) &&
269                         strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0)
270                         statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1);
271                 else if (strncmp(pgstat_stat_directory, "./", 2) != 0)
272                         statrelpath = psprintf("./%s", pgstat_stat_directory);
273                 else
274                         statrelpath = pgstat_stat_directory;
275
276                 /* Add a node for the base directory at the end */
277                 ti = palloc0(sizeof(tablespaceinfo));
278                 ti->size = opt->progress ? sendDir(".", 1, true, tablespaces, true) : -1;
279                 tablespaces = lappend(tablespaces, ti);
280
281                 /* Send tablespace header */
282                 SendBackupHeader(tablespaces);
283
284                 /* Setup and activate network throttling, if client requested it */
285                 if (opt->maxrate > 0)
286                 {
287                         throttling_sample =
288                                 (int64) opt->maxrate * (int64) 1024 / THROTTLING_FREQUENCY;
289
290                         /*
291                          * The minimum amount of time for throttling_sample bytes to be
292                          * transferred.
293                          */
294                         elapsed_min_unit = USECS_PER_SEC / THROTTLING_FREQUENCY;
295
296                         /* Enable throttling. */
297                         throttling_counter = 0;
298
299                         /* The 'real data' starts now (header was ignored). */
300                         throttled_last = GetCurrentTimestamp();
301                 }
302                 else
303                 {
304                         /* Disable throttling. */
305                         throttling_counter = -1;
306                 }
307
308                 /* Send off our tablespaces one by one */
309                 foreach(lc, tablespaces)
310                 {
311                         tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
312                         StringInfoData buf;
313
314                         /* Send CopyOutResponse message */
315                         pq_beginmessage(&buf, 'H');
316                         pq_sendbyte(&buf, 0);   /* overall format */
317                         pq_sendint16(&buf, 0);  /* natts */
318                         pq_endmessage(&buf);
319
320                         if (ti->path == NULL)
321                         {
322                                 struct stat statbuf;
323
324                                 /* In the main tar, include the backup_label first... */
325                                 sendFileWithContent(BACKUP_LABEL_FILE, labelfile->data);
326
327                                 /*
328                                  * Send tablespace_map file if required and then the bulk of
329                                  * the files.
330                                  */
331                                 if (tblspc_map_file && opt->sendtblspcmapfile)
332                                 {
333                                         sendFileWithContent(TABLESPACE_MAP, tblspc_map_file->data);
334                                         sendDir(".", 1, false, tablespaces, false);
335                                 }
336                                 else
337                                         sendDir(".", 1, false, tablespaces, true);
338
339                                 /* ... and pg_control after everything else. */
340                                 if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
341                                         ereport(ERROR,
342                                                         (errcode_for_file_access(),
343                                                          errmsg("could not stat file \"%s\": %m",
344                                                                         XLOG_CONTROL_FILE)));
345                                 sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false, InvalidOid);
346                         }
347                         else
348                                 sendTablespace(ti->path, false);
349
350                         /*
351                          * If we're including WAL, and this is the main data directory we
352                          * don't terminate the tar stream here. Instead, we will append
353                          * the xlog files below and terminate it then. This is safe since
354                          * the main data directory is always sent *last*.
355                          */
356                         if (opt->includewal && ti->path == NULL)
357                         {
358                                 Assert(lnext(lc) == NULL);
359                         }
360                         else
361                                 pq_putemptymessage('c');        /* CopyDone */
362                 }
363
364                 endptr = do_pg_stop_backup(labelfile->data, !opt->nowait, &endtli);
365         }
366         PG_END_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
367
368
369         if (opt->includewal)
370         {
371                 /*
372                  * We've left the last tar file "open", so we can now append the
373                  * required WAL files to it.
374                  */
375                 char            pathbuf[MAXPGPATH];
376                 XLogSegNo       segno;
377                 XLogSegNo       startsegno;
378                 XLogSegNo       endsegno;
379                 struct stat statbuf;
380                 List       *historyFileList = NIL;
381                 List       *walFileList = NIL;
382                 char      **walFiles;
383                 int                     nWalFiles;
384                 char            firstoff[MAXFNAMELEN];
385                 char            lastoff[MAXFNAMELEN];
386                 DIR                *dir;
387                 struct dirent *de;
388                 int                     i;
389                 ListCell   *lc;
390                 TimeLineID      tli;
391
392                 /*
393                  * I'd rather not worry about timelines here, so scan pg_wal and
394                  * include all WAL files in the range between 'startptr' and 'endptr',
395                  * regardless of the timeline the file is stamped with. If there are
396                  * some spurious WAL files belonging to timelines that don't belong in
397                  * this server's history, they will be included too. Normally there
398                  * shouldn't be such files, but if there are, there's little harm in
399                  * including them.
400                  */
401                 XLByteToSeg(startptr, startsegno, wal_segment_size);
402                 XLogFileName(firstoff, ThisTimeLineID, startsegno, wal_segment_size);
403                 XLByteToPrevSeg(endptr, endsegno, wal_segment_size);
404                 XLogFileName(lastoff, ThisTimeLineID, endsegno, wal_segment_size);
405
406                 dir = AllocateDir("pg_wal");
407                 while ((de = ReadDir(dir, "pg_wal")) != NULL)
408                 {
409                         /* Does it look like a WAL segment, and is it in the range? */
410                         if (IsXLogFileName(de->d_name) &&
411                                 strcmp(de->d_name + 8, firstoff + 8) >= 0 &&
412                                 strcmp(de->d_name + 8, lastoff + 8) <= 0)
413                         {
414                                 walFileList = lappend(walFileList, pstrdup(de->d_name));
415                         }
416                         /* Does it look like a timeline history file? */
417                         else if (IsTLHistoryFileName(de->d_name))
418                         {
419                                 historyFileList = lappend(historyFileList, pstrdup(de->d_name));
420                         }
421                 }
422                 FreeDir(dir);
423
424                 /*
425                  * Before we go any further, check that none of the WAL segments we
426                  * need were removed.
427                  */
428                 CheckXLogRemoved(startsegno, ThisTimeLineID);
429
430                 /*
431                  * Put the WAL filenames into an array, and sort. We send the files in
432                  * order from oldest to newest, to reduce the chance that a file is
433                  * recycled before we get a chance to send it over.
434                  */
435                 nWalFiles = list_length(walFileList);
436                 walFiles = palloc(nWalFiles * sizeof(char *));
437                 i = 0;
438                 foreach(lc, walFileList)
439                 {
440                         walFiles[i++] = lfirst(lc);
441                 }
442                 qsort(walFiles, nWalFiles, sizeof(char *), compareWalFileNames);
443
444                 /*
445                  * There must be at least one xlog file in the pg_wal directory, since
446                  * we are doing backup-including-xlog.
447                  */
448                 if (nWalFiles < 1)
449                         ereport(ERROR,
450                                         (errmsg("could not find any WAL files")));
451
452                 /*
453                  * Sanity check: the first and last segment should cover startptr and
454                  * endptr, with no gaps in between.
455                  */
456                 XLogFromFileName(walFiles[0], &tli, &segno, wal_segment_size);
457                 if (segno != startsegno)
458                 {
459                         char            startfname[MAXFNAMELEN];
460
461                         XLogFileName(startfname, ThisTimeLineID, startsegno,
462                                                  wal_segment_size);
463                         ereport(ERROR,
464                                         (errmsg("could not find WAL file \"%s\"", startfname)));
465                 }
466                 for (i = 0; i < nWalFiles; i++)
467                 {
468                         XLogSegNo       currsegno = segno;
469                         XLogSegNo       nextsegno = segno + 1;
470
471                         XLogFromFileName(walFiles[i], &tli, &segno, wal_segment_size);
472                         if (!(nextsegno == segno || currsegno == segno))
473                         {
474                                 char            nextfname[MAXFNAMELEN];
475
476                                 XLogFileName(nextfname, ThisTimeLineID, nextsegno,
477                                                          wal_segment_size);
478                                 ereport(ERROR,
479                                                 (errmsg("could not find WAL file \"%s\"", nextfname)));
480                         }
481                 }
482                 if (segno != endsegno)
483                 {
484                         char            endfname[MAXFNAMELEN];
485
486                         XLogFileName(endfname, ThisTimeLineID, endsegno, wal_segment_size);
487                         ereport(ERROR,
488                                         (errmsg("could not find WAL file \"%s\"", endfname)));
489                 }
490
491                 /* Ok, we have everything we need. Send the WAL files. */
492                 for (i = 0; i < nWalFiles; i++)
493                 {
494                         FILE       *fp;
495                         char            buf[TAR_SEND_SIZE];
496                         size_t          cnt;
497                         pgoff_t         len = 0;
498
499                         snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFiles[i]);
500                         XLogFromFileName(walFiles[i], &tli, &segno, wal_segment_size);
501
502                         fp = AllocateFile(pathbuf, "rb");
503                         if (fp == NULL)
504                         {
505                                 int                     save_errno = errno;
506
507                                 /*
508                                  * Most likely reason for this is that the file was already
509                                  * removed by a checkpoint, so check for that to get a better
510                                  * error message.
511                                  */
512                                 CheckXLogRemoved(segno, tli);
513
514                                 errno = save_errno;
515                                 ereport(ERROR,
516                                                 (errcode_for_file_access(),
517                                                  errmsg("could not open file \"%s\": %m", pathbuf)));
518                         }
519
520                         if (fstat(fileno(fp), &statbuf) != 0)
521                                 ereport(ERROR,
522                                                 (errcode_for_file_access(),
523                                                  errmsg("could not stat file \"%s\": %m",
524                                                                 pathbuf)));
525                         if (statbuf.st_size != wal_segment_size)
526                         {
527                                 CheckXLogRemoved(segno, tli);
528                                 ereport(ERROR,
529                                                 (errcode_for_file_access(),
530                                                  errmsg("unexpected WAL file size \"%s\"", walFiles[i])));
531                         }
532
533                         /* send the WAL file itself */
534                         _tarWriteHeader(pathbuf, NULL, &statbuf, false);
535
536                         while ((cnt = fread(buf, 1,
537                                                                 Min(sizeof(buf), wal_segment_size - len),
538                                                                 fp)) > 0)
539                         {
540                                 CheckXLogRemoved(segno, tli);
541                                 /* Send the chunk as a CopyData message */
542                                 if (pq_putmessage('d', buf, cnt))
543                                         ereport(ERROR,
544                                                         (errmsg("base backup could not send data, aborting backup")));
545
546                                 len += cnt;
547                                 throttle(cnt);
548
549                                 if (len == wal_segment_size)
550                                         break;
551                         }
552
553                         if (len != wal_segment_size)
554                         {
555                                 CheckXLogRemoved(segno, tli);
556                                 ereport(ERROR,
557                                                 (errcode_for_file_access(),
558                                                  errmsg("unexpected WAL file size \"%s\"", walFiles[i])));
559                         }
560
561                         /* wal_segment_size is a multiple of 512, so no need for padding */
562
563                         FreeFile(fp);
564
565                         /*
566                          * Mark file as archived, otherwise files can get archived again
567                          * after promotion of a new node. This is in line with
568                          * walreceiver.c always doing an XLogArchiveForceDone() after a
569                          * complete segment.
570                          */
571                         StatusFilePath(pathbuf, walFiles[i], ".done");
572                         sendFileWithContent(pathbuf, "");
573                 }
574
575                 /*
576                  * Send timeline history files too. Only the latest timeline history
577                  * file is required for recovery, and even that only if there happens
578                  * to be a timeline switch in the first WAL segment that contains the
579                  * checkpoint record, or if we're taking a base backup from a standby
580                  * server and the target timeline changes while the backup is taken.
581                  * But they are small and highly useful for debugging purposes, so
582                  * better include them all, always.
583                  */
584                 foreach(lc, historyFileList)
585                 {
586                         char       *fname = lfirst(lc);
587
588                         snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname);
589
590                         if (lstat(pathbuf, &statbuf) != 0)
591                                 ereport(ERROR,
592                                                 (errcode_for_file_access(),
593                                                  errmsg("could not stat file \"%s\": %m", pathbuf)));
594
595                         sendFile(pathbuf, pathbuf, &statbuf, false, InvalidOid);
596
597                         /* unconditionally mark file as archived */
598                         StatusFilePath(pathbuf, fname, ".done");
599                         sendFileWithContent(pathbuf, "");
600                 }
601
602                 /* Send CopyDone message for the last tar file */
603                 pq_putemptymessage('c');
604         }
605         SendXlogRecPtrResult(endptr, endtli);
606
607         if (total_checksum_failures)
608         {
609                 if (total_checksum_failures > 1)
610                         ereport(WARNING,
611                                         (errmsg("%lld total checksum verification failures", total_checksum_failures)));
612
613                 ereport(ERROR,
614                                 (errcode(ERRCODE_DATA_CORRUPTED),
615                                  errmsg("checksum verification failure during base backup")));
616         }
617
618 }
619
/*
 * qsort comparison function, to compare log/seg portion of WAL segment
 * filenames, ignoring the timeline portion.
 *
 * 'a' and 'b' each point to an array slot holding a "char *" file name.
 * Both names must be longer than the timeline portion that is skipped.
 * Returns the strcmp() result for the remaining parts of the two names.
 */
static int
compareWalFileNames(const void *a, const void *b)
{
	/* Number of leading characters that make up the timeline portion. */
	enum
	{
		TLI_PREFIX_LEN = 8
	};

	/* Read the slots through const-qualified lvalues; qsort's data is const. */
	const char *fna = *(char *const *) a;
	const char *fnb = *(char *const *) b;

	return strcmp(fna + TLI_PREFIX_LEN, fnb + TLI_PREFIX_LEN);
}
632
633 /*
634  * Parse the base backup options passed down by the parser
635  */
636 static void
637 parse_basebackup_options(List *options, basebackup_options *opt)
638 {
639         ListCell   *lopt;
640         bool            o_label = false;
641         bool            o_progress = false;
642         bool            o_fast = false;
643         bool            o_nowait = false;
644         bool            o_wal = false;
645         bool            o_maxrate = false;
646         bool            o_tablespace_map = false;
647         bool            o_noverify_checksums = false;
648
649         MemSet(opt, 0, sizeof(*opt));
650         foreach(lopt, options)
651         {
652                 DefElem    *defel = (DefElem *) lfirst(lopt);
653
654                 if (strcmp(defel->defname, "label") == 0)
655                 {
656                         if (o_label)
657                                 ereport(ERROR,
658                                                 (errcode(ERRCODE_SYNTAX_ERROR),
659                                                  errmsg("duplicate option \"%s\"", defel->defname)));
660                         opt->label = strVal(defel->arg);
661                         o_label = true;
662                 }
663                 else if (strcmp(defel->defname, "progress") == 0)
664                 {
665                         if (o_progress)
666                                 ereport(ERROR,
667                                                 (errcode(ERRCODE_SYNTAX_ERROR),
668                                                  errmsg("duplicate option \"%s\"", defel->defname)));
669                         opt->progress = true;
670                         o_progress = true;
671                 }
672                 else if (strcmp(defel->defname, "fast") == 0)
673                 {
674                         if (o_fast)
675                                 ereport(ERROR,
676                                                 (errcode(ERRCODE_SYNTAX_ERROR),
677                                                  errmsg("duplicate option \"%s\"", defel->defname)));
678                         opt->fastcheckpoint = true;
679                         o_fast = true;
680                 }
681                 else if (strcmp(defel->defname, "nowait") == 0)
682                 {
683                         if (o_nowait)
684                                 ereport(ERROR,
685                                                 (errcode(ERRCODE_SYNTAX_ERROR),
686                                                  errmsg("duplicate option \"%s\"", defel->defname)));
687                         opt->nowait = true;
688                         o_nowait = true;
689                 }
690                 else if (strcmp(defel->defname, "wal") == 0)
691                 {
692                         if (o_wal)
693                                 ereport(ERROR,
694                                                 (errcode(ERRCODE_SYNTAX_ERROR),
695                                                  errmsg("duplicate option \"%s\"", defel->defname)));
696                         opt->includewal = true;
697                         o_wal = true;
698                 }
699                 else if (strcmp(defel->defname, "max_rate") == 0)
700                 {
701                         long            maxrate;
702
703                         if (o_maxrate)
704                                 ereport(ERROR,
705                                                 (errcode(ERRCODE_SYNTAX_ERROR),
706                                                  errmsg("duplicate option \"%s\"", defel->defname)));
707
708                         maxrate = intVal(defel->arg);
709                         if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER)
710                                 ereport(ERROR,
711                                                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
712                                                  errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
713                                                                 (int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER)));
714
715                         opt->maxrate = (uint32) maxrate;
716                         o_maxrate = true;
717                 }
718                 else if (strcmp(defel->defname, "tablespace_map") == 0)
719                 {
720                         if (o_tablespace_map)
721                                 ereport(ERROR,
722                                                 (errcode(ERRCODE_SYNTAX_ERROR),
723                                                  errmsg("duplicate option \"%s\"", defel->defname)));
724                         opt->sendtblspcmapfile = true;
725                         o_tablespace_map = true;
726                 }
727                 else if (strcmp(defel->defname, "noverify_checksums") == 0)
728                 {
729                         if (o_noverify_checksums)
730                                 ereport(ERROR,
731                                                 (errcode(ERRCODE_SYNTAX_ERROR),
732                                                  errmsg("duplicate option \"%s\"", defel->defname)));
733                         noverify_checksums = true;
734                         o_noverify_checksums = true;
735                 }
736                 else
737                         elog(ERROR, "option \"%s\" not recognized",
738                                  defel->defname);
739         }
740         if (opt->label == NULL)
741                 opt->label = "base backup";
742 }
743
744
745 /*
746  * SendBaseBackup() - send a complete base backup.
747  *
748  * The function will put the system into backup mode like pg_start_backup()
749  * does, so that the backup is consistent even though we read directly from
750  * the filesystem, bypassing the buffer cache.
751  */
752 void
753 SendBaseBackup(BaseBackupCmd *cmd)
754 {
755         basebackup_options opt;
756
757         parse_basebackup_options(cmd->options, &opt);
758
759         WalSndSetState(WALSNDSTATE_BACKUP);
760
761         if (update_process_title)
762         {
763                 char            activitymsg[50];
764
765                 snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"",
766                                  opt.label);
767                 set_ps_display(activitymsg, false);
768         }
769
770         perform_base_backup(&opt);
771 }
772
773 static void
774 send_int8_string(StringInfoData *buf, int64 intval)
775 {
776         char            is[32];
777
778         sprintf(is, INT64_FORMAT, intval);
779         pq_sendint32(buf, strlen(is));
780         pq_sendbytes(buf, is, strlen(is));
781 }
782
783 static void
784 SendBackupHeader(List *tablespaces)
785 {
786         StringInfoData buf;
787         ListCell   *lc;
788
789         /* Construct and send the directory information */
790         pq_beginmessage(&buf, 'T'); /* RowDescription */
791         pq_sendint16(&buf, 3);          /* 3 fields */
792
793         /* First field - spcoid */
794         pq_sendstring(&buf, "spcoid");
795         pq_sendint32(&buf, 0);          /* table oid */
796         pq_sendint16(&buf, 0);          /* attnum */
797         pq_sendint32(&buf, OIDOID); /* type oid */
798         pq_sendint16(&buf, 4);          /* typlen */
799         pq_sendint32(&buf, 0);          /* typmod */
800         pq_sendint16(&buf, 0);          /* format code */
801
802         /* Second field - spcpath */
803         pq_sendstring(&buf, "spclocation");
804         pq_sendint32(&buf, 0);
805         pq_sendint16(&buf, 0);
806         pq_sendint32(&buf, TEXTOID);
807         pq_sendint16(&buf, -1);
808         pq_sendint32(&buf, 0);
809         pq_sendint16(&buf, 0);
810
811         /* Third field - size */
812         pq_sendstring(&buf, "size");
813         pq_sendint32(&buf, 0);
814         pq_sendint16(&buf, 0);
815         pq_sendint32(&buf, INT8OID);
816         pq_sendint16(&buf, 8);
817         pq_sendint32(&buf, 0);
818         pq_sendint16(&buf, 0);
819         pq_endmessage(&buf);
820
821         foreach(lc, tablespaces)
822         {
823                 tablespaceinfo *ti = lfirst(lc);
824
825                 /* Send one datarow message */
826                 pq_beginmessage(&buf, 'D');
827                 pq_sendint16(&buf, 3);  /* number of columns */
828                 if (ti->path == NULL)
829                 {
830                         pq_sendint32(&buf, -1); /* Length = -1 ==> NULL */
831                         pq_sendint32(&buf, -1);
832                 }
833                 else
834                 {
835                         Size            len;
836
837                         len = strlen(ti->oid);
838                         pq_sendint32(&buf, len);
839                         pq_sendbytes(&buf, ti->oid, len);
840
841                         len = strlen(ti->path);
842                         pq_sendint32(&buf, len);
843                         pq_sendbytes(&buf, ti->path, len);
844                 }
845                 if (ti->size >= 0)
846                         send_int8_string(&buf, ti->size / 1024);
847                 else
848                         pq_sendint32(&buf, -1); /* NULL */
849
850                 pq_endmessage(&buf);
851         }
852
853         /* Send a CommandComplete message */
854         pq_puttextmessage('C', "SELECT");
855 }
856
857 /*
858  * Send a single resultset containing just a single
859  * XLogRecPtr record (in text format)
860  */
861 static void
862 SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli)
863 {
864         StringInfoData buf;
865         char            str[MAXFNAMELEN];
866         Size            len;
867
868         pq_beginmessage(&buf, 'T'); /* RowDescription */
869         pq_sendint16(&buf, 2);          /* 2 fields */
870
871         /* Field headers */
872         pq_sendstring(&buf, "recptr");
873         pq_sendint32(&buf, 0);          /* table oid */
874         pq_sendint16(&buf, 0);          /* attnum */
875         pq_sendint32(&buf, TEXTOID);    /* type oid */
876         pq_sendint16(&buf, -1);
877         pq_sendint32(&buf, 0);
878         pq_sendint16(&buf, 0);
879
880         pq_sendstring(&buf, "tli");
881         pq_sendint32(&buf, 0);          /* table oid */
882         pq_sendint16(&buf, 0);          /* attnum */
883
884         /*
885          * int8 may seem like a surprising data type for this, but in theory int4
886          * would not be wide enough for this, as TimeLineID is unsigned.
887          */
888         pq_sendint32(&buf, INT8OID);    /* type oid */
889         pq_sendint16(&buf, -1);
890         pq_sendint32(&buf, 0);
891         pq_sendint16(&buf, 0);
892         pq_endmessage(&buf);
893
894         /* Data row */
895         pq_beginmessage(&buf, 'D');
896         pq_sendint16(&buf, 2);          /* number of columns */
897
898         len = snprintf(str, sizeof(str),
899                                    "%X/%X", (uint32) (ptr >> 32), (uint32) ptr);
900         pq_sendint32(&buf, len);
901         pq_sendbytes(&buf, str, len);
902
903         len = snprintf(str, sizeof(str), "%u", tli);
904         pq_sendint32(&buf, len);
905         pq_sendbytes(&buf, str, len);
906
907         pq_endmessage(&buf);
908
909         /* Send a CommandComplete message */
910         pq_puttextmessage('C', "SELECT");
911 }
912
913 /*
914  * Inject a file with given name and content in the output tar stream.
915  */
916 static void
917 sendFileWithContent(const char *filename, const char *content)
918 {
919         struct stat statbuf;
920         int                     pad,
921                                 len;
922
923         len = strlen(content);
924
925         /*
926          * Construct a stat struct for the backup_label file we're injecting in
927          * the tar.
928          */
929         /* Windows doesn't have the concept of uid and gid */
930 #ifdef WIN32
931         statbuf.st_uid = 0;
932         statbuf.st_gid = 0;
933 #else
934         statbuf.st_uid = geteuid();
935         statbuf.st_gid = getegid();
936 #endif
937         statbuf.st_mtime = time(NULL);
938         statbuf.st_mode = pg_file_create_mode;
939         statbuf.st_size = len;
940
941         _tarWriteHeader(filename, NULL, &statbuf, false);
942         /* Send the contents as a CopyData message */
943         pq_putmessage('d', content, len);
944
945         /* Pad to 512 byte boundary, per tar format requirements */
946         pad = ((len + 511) & ~511) - len;
947         if (pad > 0)
948         {
949                 char            buf[512];
950
951                 MemSet(buf, 0, pad);
952                 pq_putmessage('d', buf, pad);
953         }
954 }
955
956 /*
957  * Include the tablespace directory pointed to by 'path' in the output tar
958  * stream.  If 'sizeonly' is true, we just calculate a total length and return
959  * it, without actually sending anything.
960  *
961  * Only used to send auxiliary tablespaces, not PGDATA.
962  */
963 int64
964 sendTablespace(char *path, bool sizeonly)
965 {
966         int64           size;
967         char            pathbuf[MAXPGPATH];
968         struct stat statbuf;
969
970         /*
971          * 'path' points to the tablespace location, but we only want to include
972          * the version directory in it that belongs to us.
973          */
974         snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path,
975                          TABLESPACE_VERSION_DIRECTORY);
976
977         /*
978          * Store a directory entry in the tar file so we get the permissions
979          * right.
980          */
981         if (lstat(pathbuf, &statbuf) != 0)
982         {
983                 if (errno != ENOENT)
984                         ereport(ERROR,
985                                         (errcode_for_file_access(),
986                                          errmsg("could not stat file or directory \"%s\": %m",
987                                                         pathbuf)));
988
989                 /* If the tablespace went away while scanning, it's no error. */
990                 return 0;
991         }
992
993         size = _tarWriteHeader(TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf,
994                                                    sizeonly);
995
996         /* Send all the files in the tablespace version directory */
997         size += sendDir(pathbuf, strlen(path), sizeonly, NIL, true);
998
999         return size;
1000 }
1001
1002 /*
1003  * Include all files from the given directory in the output tar stream. If
1004  * 'sizeonly' is true, we just calculate a total length and return it, without
1005  * actually sending anything.
1006  *
1007  * Omit any directory in the tablespaces list, to avoid backing up
1008  * tablespaces twice when they were created inside PGDATA.
1009  *
1010  * If sendtblspclinks is true, we need to include symlink
1011  * information in the tar file. If not, we can skip that
1012  * as it will be sent separately in the tablespace_map file.
1013  */
1014 static int64
1015 sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
1016                 bool sendtblspclinks)
1017 {
1018         DIR                *dir;
1019         struct dirent *de;
1020         char            pathbuf[MAXPGPATH * 2];
1021         struct stat statbuf;
1022         int64           size = 0;
1023         const char *lastDir;            /* Split last dir from parent path. */
1024         bool            isDbDir = false;        /* Does this directory contain relations? */
1025
1026         /*
1027          * Determine if the current path is a database directory that can contain
1028          * relations.
1029          *
1030          * Start by finding the location of the delimiter between the parent path
1031          * and the current path.
1032          */
1033         lastDir = last_dir_separator(path);
1034
1035         /* Does this path look like a database path (i.e. all digits)? */
1036         if (lastDir != NULL &&
1037                 strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1))
1038         {
1039                 /* Part of path that contains the parent directory. */
1040                 int                     parentPathLen = lastDir - path;
1041
1042                 /*
1043                  * Mark path as a database directory if the parent path is either
1044                  * $PGDATA/base or a tablespace version path.
1045                  */
1046                 if (strncmp(path, "./base", parentPathLen) == 0 ||
1047                         (parentPathLen >= (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) &&
1048                          strncmp(lastDir - (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1),
1049                                          TABLESPACE_VERSION_DIRECTORY,
1050                                          sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) == 0))
1051                         isDbDir = true;
1052         }
1053
1054         dir = AllocateDir(path);
1055         while ((de = ReadDir(dir, path)) != NULL)
1056         {
1057                 int                     excludeIdx;
1058                 bool            excludeFound;
1059                 ForkNumber      relForkNum; /* Type of fork if file is a relation */
1060                 int                     relOidChars;    /* Chars in filename that are the rel oid */
1061
1062                 /* Skip special stuff */
1063                 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
1064                         continue;
1065
1066                 /* Skip temporary files */
1067                 if (strncmp(de->d_name,
1068                                         PG_TEMP_FILE_PREFIX,
1069                                         strlen(PG_TEMP_FILE_PREFIX)) == 0)
1070                         continue;
1071
1072                 /*
1073                  * Check if the postmaster has signaled us to exit, and abort with an
1074                  * error in that case. The error handler further up will call
1075                  * do_pg_abort_backup() for us. Also check that if the backup was
1076                  * started while still in recovery, the server wasn't promoted.
1077                  * do_pg_stop_backup() will check that too, but it's better to stop
1078                  * the backup early than continue to the end and fail there.
1079                  */
1080                 CHECK_FOR_INTERRUPTS();
1081                 if (RecoveryInProgress() != backup_started_in_recovery)
1082                         ereport(ERROR,
1083                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1084                                          errmsg("the standby was promoted during online backup"),
1085                                          errhint("This means that the backup being taken is corrupt "
1086                                                          "and should not be used. "
1087                                                          "Try taking another online backup.")));
1088
1089                 /* Scan for files that should be excluded */
1090                 excludeFound = false;
1091                 for (excludeIdx = 0; excludeFiles[excludeIdx] != NULL; excludeIdx++)
1092                 {
1093                         if (strcmp(de->d_name, excludeFiles[excludeIdx]) == 0)
1094                         {
1095                                 elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name);
1096                                 excludeFound = true;
1097                                 break;
1098                         }
1099                 }
1100
1101                 if (excludeFound)
1102                         continue;
1103
1104                 /* Exclude all forks for unlogged tables except the init fork */
1105                 if (isDbDir &&
1106                         parse_filename_for_nontemp_relation(de->d_name, &relOidChars,
1107                                                                                                 &relForkNum))
1108                 {
1109                         /* Never exclude init forks */
1110                         if (relForkNum != INIT_FORKNUM)
1111                         {
1112                                 char            initForkFile[MAXPGPATH];
1113                                 char            relOid[OIDCHARS + 1];
1114
1115                                 /*
1116                                  * If any other type of fork, check if there is an init fork
1117                                  * with the same OID. If so, the file can be excluded.
1118                                  */
1119                                 memcpy(relOid, de->d_name, relOidChars);
1120                                 relOid[relOidChars] = '\0';
1121                                 snprintf(initForkFile, sizeof(initForkFile), "%s/%s_init",
1122                                                  path, relOid);
1123
1124                                 if (lstat(initForkFile, &statbuf) == 0)
1125                                 {
1126                                         elog(DEBUG2,
1127                                                  "unlogged relation file \"%s\" excluded from backup",
1128                                                  de->d_name);
1129
1130                                         continue;
1131                                 }
1132                         }
1133                 }
1134
1135                 /* Exclude temporary relations */
1136                 if (isDbDir && looks_like_temp_rel_name(de->d_name))
1137                 {
1138                         elog(DEBUG2,
1139                                  "temporary relation file \"%s\" excluded from backup",
1140                                  de->d_name);
1141
1142                         continue;
1143                 }
1144
1145                 snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name);
1146
1147                 /* Skip pg_control here to back up it last */
1148                 if (strcmp(pathbuf, "./global/pg_control") == 0)
1149                         continue;
1150
1151                 if (lstat(pathbuf, &statbuf) != 0)
1152                 {
1153                         if (errno != ENOENT)
1154                                 ereport(ERROR,
1155                                                 (errcode_for_file_access(),
1156                                                  errmsg("could not stat file or directory \"%s\": %m",
1157                                                                 pathbuf)));
1158
1159                         /* If the file went away while scanning, it's not an error. */
1160                         continue;
1161                 }
1162
1163                 /* Scan for directories whose contents should be excluded */
1164                 excludeFound = false;
1165                 for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
1166                 {
1167                         if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0)
1168                         {
1169                                 elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name);
1170                                 size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
1171                                 excludeFound = true;
1172                                 break;
1173                         }
1174                 }
1175
1176                 if (excludeFound)
1177                         continue;
1178
1179                 /*
1180                  * Exclude contents of directory specified by statrelpath if not set
1181                  * to the default (pg_stat_tmp) which is caught in the loop above.
1182                  */
1183                 if (statrelpath != NULL && strcmp(pathbuf, statrelpath) == 0)
1184                 {
1185                         elog(DEBUG1, "contents of directory \"%s\" excluded from backup", statrelpath);
1186                         size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
1187                         continue;
1188                 }
1189
1190                 /*
1191                  * We can skip pg_wal, the WAL segments need to be fetched from the
1192                  * WAL archive anyway. But include it as an empty directory anyway, so
1193                  * we get permissions right.
1194                  */
1195                 if (strcmp(pathbuf, "./pg_wal") == 0)
1196                 {
1197                         /* If pg_wal is a symlink, write it as a directory anyway */
1198                         size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
1199
1200                         /*
1201                          * Also send archive_status directory (by hackishly reusing
1202                          * statbuf from above ...).
1203                          */
1204                         size += _tarWriteHeader("./pg_wal/archive_status", NULL, &statbuf,
1205                                                                         sizeonly);
1206
1207                         continue;                       /* don't recurse into pg_wal */
1208                 }
1209
1210                 /* Allow symbolic links in pg_tblspc only */
1211                 if (strcmp(path, "./pg_tblspc") == 0 &&
1212 #ifndef WIN32
1213                         S_ISLNK(statbuf.st_mode)
1214 #else
1215                         pgwin32_is_junction(pathbuf)
1216 #endif
1217                         )
1218                 {
1219 #if defined(HAVE_READLINK) || defined(WIN32)
1220                         char            linkpath[MAXPGPATH];
1221                         int                     rllen;
1222
1223                         rllen = readlink(pathbuf, linkpath, sizeof(linkpath));
1224                         if (rllen < 0)
1225                                 ereport(ERROR,
1226                                                 (errcode_for_file_access(),
1227                                                  errmsg("could not read symbolic link \"%s\": %m",
1228                                                                 pathbuf)));
1229                         if (rllen >= sizeof(linkpath))
1230                                 ereport(ERROR,
1231                                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1232                                                  errmsg("symbolic link \"%s\" target is too long",
1233                                                                 pathbuf)));
1234                         linkpath[rllen] = '\0';
1235
1236                         size += _tarWriteHeader(pathbuf + basepathlen + 1, linkpath,
1237                                                                         &statbuf, sizeonly);
1238 #else
1239
1240                         /*
1241                          * If the platform does not have symbolic links, it should not be
1242                          * possible to have tablespaces - clearly somebody else created
1243                          * them. Warn about it and ignore.
1244                          */
1245                         ereport(WARNING,
1246                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1247                                          errmsg("tablespaces are not supported on this platform")));
1248                         continue;
1249 #endif                                                  /* HAVE_READLINK */
1250                 }
1251                 else if (S_ISDIR(statbuf.st_mode))
1252                 {
1253                         bool            skip_this_dir = false;
1254                         ListCell   *lc;
1255
1256                         /*
1257                          * Store a directory entry in the tar file so we can get the
1258                          * permissions right.
1259                          */
1260                         size += _tarWriteHeader(pathbuf + basepathlen + 1, NULL, &statbuf,
1261                                                                         sizeonly);
1262
1263                         /*
1264                          * Call ourselves recursively for a directory, unless it happens
1265                          * to be a separate tablespace located within PGDATA.
1266                          */
1267                         foreach(lc, tablespaces)
1268                         {
1269                                 tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
1270
1271                                 /*
1272                                  * ti->rpath is the tablespace relative path within PGDATA, or
1273                                  * NULL if the tablespace has been properly located somewhere
1274                                  * else.
1275                                  *
1276                                  * Skip past the leading "./" in pathbuf when comparing.
1277                                  */
1278                                 if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0)
1279                                 {
1280                                         skip_this_dir = true;
1281                                         break;
1282                                 }
1283                         }
1284
1285                         /*
1286                          * skip sending directories inside pg_tblspc, if not required.
1287                          */
1288                         if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks)
1289                                 skip_this_dir = true;
1290
1291                         if (!skip_this_dir)
1292                                 size += sendDir(pathbuf, basepathlen, sizeonly, tablespaces, sendtblspclinks);
1293                 }
1294                 else if (S_ISREG(statbuf.st_mode))
1295                 {
1296                         bool            sent = false;
1297
1298                         if (!sizeonly)
1299                                 sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf,
1300                                                                 true, isDbDir ? pg_atoi(lastDir + 1, sizeof(Oid), 0) : InvalidOid);
1301
1302                         if (sent || sizeonly)
1303                         {
1304                                 /* Add size, rounded up to 512byte block */
1305                                 size += ((statbuf.st_size + 511) & ~511);
1306                                 size += 512;    /* Size of the header of the file */
1307                         }
1308                 }
1309                 else
1310                         ereport(WARNING,
1311                                         (errmsg("skipping special file \"%s\"", pathbuf)));
1312         }
1313         FreeDir(dir);
1314         return size;
1315 }
1316
1317 /*
1318  * Check if a file should have its checksum validated.
1319  * We validate checksums on files in regular tablespaces
1320  * (including global and default) only, and in those there
1321  * are some files that are explicitly excluded.
1322  */
1323 static bool
1324 is_checksummed_file(const char *fullpath, const char *filename)
1325 {
1326         const char *const *f;
1327
1328         /* Check that the file is in a tablespace */
1329         if (strncmp(fullpath, "./global/", 9) == 0 ||
1330                 strncmp(fullpath, "./base/", 7) == 0 ||
1331                 strncmp(fullpath, "/", 1) == 0)
1332         {
1333                 /* Compare file against noChecksumFiles skiplist */
1334                 for (f = noChecksumFiles; *f; f++)
1335                         if (strcmp(*f, filename) == 0)
1336                                 return false;
1337
1338                 return true;
1339         }
1340         else
1341                 return false;
1342 }
1343
1344 /*****
1345  * Functions for handling tar file format
1346  *
1347  * Copied from pg_dump, but modified to work with libpq for sending
1348  */
1349
1350
1351 /*
1352  * Given the member, write the TAR header & send the file.
1353  *
1354  * If 'missing_ok' is true, will not throw an error if the file is not found.
1355  *
1356  * If dboid is anything other than InvalidOid then any checksum failures detected
1357  * will get reported to the stats collector.
1358  *
1359  * Returns true if the file was successfully sent, false if 'missing_ok',
1360  * and the file did not exist.
1361  */
1362 static bool
1363 sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf,
1364                  bool missing_ok, Oid dboid)
1365 {
1366         FILE       *fp;
1367         BlockNumber blkno = 0;
1368         bool            block_retry = false;
1369         char            buf[TAR_SEND_SIZE];
1370         uint16          checksum;
1371         int                     checksum_failures = 0;
1372         off_t           cnt;
1373         int                     i;
1374         pgoff_t         len = 0;
1375         char       *page;
1376         size_t          pad;
1377         PageHeader      phdr;
1378         int                     segmentno = 0;
1379         char       *segmentpath;
1380         bool            verify_checksum = false;
1381
1382         fp = AllocateFile(readfilename, "rb");
1383         if (fp == NULL)
1384         {
1385                 if (errno == ENOENT && missing_ok)
1386                         return false;
1387                 ereport(ERROR,
1388                                 (errcode_for_file_access(),
1389                                  errmsg("could not open file \"%s\": %m", readfilename)));
1390         }
1391
1392         _tarWriteHeader(tarfilename, NULL, statbuf, false);
1393
1394         if (!noverify_checksums && DataChecksumsEnabled())
1395         {
1396                 char       *filename;
1397
1398                 /*
1399                  * Get the filename (excluding path).  As last_dir_separator()
1400                  * includes the last directory separator, we chop that off by
1401                  * incrementing the pointer.
1402                  */
1403                 filename = last_dir_separator(readfilename) + 1;
1404
1405                 if (is_checksummed_file(readfilename, filename))
1406                 {
1407                         verify_checksum = true;
1408
1409                         /*
1410                          * Cut off at the segment boundary (".") to get the segment number
1411                          * in order to mix it into the checksum.
1412                          */
1413                         segmentpath = strstr(filename, ".");
1414                         if (segmentpath != NULL)
1415                         {
1416                                 segmentno = atoi(segmentpath + 1);
1417                                 if (segmentno == 0)
1418                                         ereport(ERROR,
1419                                                         (errmsg("invalid segment number %d in file \"%s\"",
1420                                                                         segmentno, filename)));
1421                         }
1422                 }
1423         }
1424
1425         while ((cnt = fread(buf, 1, Min(sizeof(buf), statbuf->st_size - len), fp)) > 0)
1426         {
1427                 /*
1428                  * The checksums are verified at block level, so we iterate over the
1429                  * buffer in chunks of BLCKSZ, after making sure that
1430                  * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of
1431                  * BLCKSZ bytes.
1432                  */
1433                 Assert(TAR_SEND_SIZE % BLCKSZ == 0);
1434
1435                 if (verify_checksum && (cnt % BLCKSZ != 0))
1436                 {
1437                         ereport(WARNING,
1438                                         (errmsg("cannot verify checksum in file \"%s\", block "
1439                                                         "%d: read buffer size %d and page size %d "
1440                                                         "differ",
1441                                                         readfilename, blkno, (int) cnt, BLCKSZ)));
1442                         verify_checksum = false;
1443                 }
1444
1445                 if (verify_checksum)
1446                 {
1447                         for (i = 0; i < cnt / BLCKSZ; i++)
1448                         {
1449                                 page = buf + BLCKSZ * i;
1450
1451                                 /*
1452                                  * Only check pages which have not been modified since the
1453                                  * start of the base backup. Otherwise, they might have been
1454                                  * written only halfway and the checksum would not be valid.
1455                                  * However, replaying WAL would reinstate the correct page in
1456                                  * this case. We also skip completely new pages, since they
1457                                  * don't have a checksum yet.
1458                                  */
1459                                 if (!PageIsNew(page) && PageGetLSN(page) < startptr)
1460                                 {
1461                                         checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
1462                                         phdr = (PageHeader) page;
1463                                         if (phdr->pd_checksum != checksum)
1464                                         {
1465                                                 /*
1466                                                  * Retry the block on the first failure.  It's
1467                                                  * possible that we read the first 4K page of the
1468                                                  * block just before postgres updated the entire block
1469                                                  * so it ends up looking torn to us.  We only need to
1470                                                  * retry once because the LSN should be updated to
1471                                                  * something we can ignore on the next pass.  If the
1472                                                  * error happens again then it is a true validation
1473                                                  * failure.
1474                                                  */
1475                                                 if (block_retry == false)
1476                                                 {
1477                                                         /* Reread the failed block */
1478                                                         if (fseek(fp, -(cnt - BLCKSZ * i), SEEK_CUR) == -1)
1479                                                         {
1480                                                                 ereport(ERROR,
1481                                                                                 (errcode_for_file_access(),
1482                                                                                  errmsg("could not fseek in file \"%s\": %m",
1483                                                                                                 readfilename)));
1484                                                         }
1485
1486                                                         if (fread(buf + BLCKSZ * i, 1, BLCKSZ, fp) != BLCKSZ)
1487                                                         {
1488                                                                 ereport(ERROR,
1489                                                                                 (errcode_for_file_access(),
1490                                                                                  errmsg("could not reread block %d of file \"%s\": %m",
1491                                                                                                 blkno, readfilename)));
1492                                                         }
1493
1494                                                         if (fseek(fp, cnt - BLCKSZ * i - BLCKSZ, SEEK_CUR) == -1)
1495                                                         {
1496                                                                 ereport(ERROR,
1497                                                                                 (errcode_for_file_access(),
1498                                                                                  errmsg("could not fseek in file \"%s\": %m",
1499                                                                                                 readfilename)));
1500                                                         }
1501
1502                                                         /* Set flag so we know a retry was attempted */
1503                                                         block_retry = true;
1504
1505                                                         /* Reset loop to validate the block again */
1506                                                         i--;
1507                                                         continue;
1508                                                 }
1509
1510                                                 checksum_failures++;
1511
1512                                                 if (checksum_failures <= 5)
1513                                                         ereport(WARNING,
1514                                                                         (errmsg("checksum verification failed in "
1515                                                                                         "file \"%s\", block %d: calculated "
1516                                                                                         "%X but expected %X",
1517                                                                                         readfilename, blkno, checksum,
1518                                                                                         phdr->pd_checksum)));
1519                                                 if (checksum_failures == 5)
1520                                                         ereport(WARNING,
1521                                                                         (errmsg("further checksum verification "
1522                                                                                         "failures in file \"%s\" will not "
1523                                                                                         "be reported", readfilename)));
1524                                         }
1525                                 }
1526                                 block_retry = false;
1527                                 blkno++;
1528                         }
1529                 }
1530
1531                 /* Send the chunk as a CopyData message */
1532                 if (pq_putmessage('d', buf, cnt))
1533                         ereport(ERROR,
1534                                         (errmsg("base backup could not send data, aborting backup")));
1535
1536                 len += cnt;
1537                 throttle(cnt);
1538
1539                 if (len >= statbuf->st_size)
1540                 {
1541                         /*
1542                          * Reached end of file. The file could be longer, if it was
1543                          * extended while we were sending it, but for a base backup we can
1544                          * ignore such extended data. It will be restored from WAL.
1545                          */
1546                         break;
1547                 }
1548         }
1549
1550         /* If the file was truncated while we were sending it, pad it with zeros */
1551         if (len < statbuf->st_size)
1552         {
1553                 MemSet(buf, 0, sizeof(buf));
1554                 while (len < statbuf->st_size)
1555                 {
1556                         cnt = Min(sizeof(buf), statbuf->st_size - len);
1557                         pq_putmessage('d', buf, cnt);
1558                         len += cnt;
1559                         throttle(cnt);
1560                 }
1561         }
1562
1563         /*
1564          * Pad to 512 byte boundary, per tar format requirements. (This small
1565          * piece of data is probably not worth throttling.)
1566          */
1567         pad = ((len + 511) & ~511) - len;
1568         if (pad > 0)
1569         {
1570                 MemSet(buf, 0, pad);
1571                 pq_putmessage('d', buf, pad);
1572         }
1573
1574         FreeFile(fp);
1575
1576         if (checksum_failures > 1)
1577         {
1578                 ereport(WARNING,
1579                                 (errmsg("file \"%s\" has a total of %d checksum verification "
1580                                                 "failures", readfilename, checksum_failures)));
1581
1582                 pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
1583         }
1584
1585         total_checksum_failures += checksum_failures;
1586
1587         return true;
1588 }
1589
1590
1591 static int64
1592 _tarWriteHeader(const char *filename, const char *linktarget,
1593                                 struct stat *statbuf, bool sizeonly)
1594 {
1595         char            h[512];
1596         enum tarError rc;
1597
1598         if (!sizeonly)
1599         {
1600                 rc = tarCreateHeader(h, filename, linktarget, statbuf->st_size,
1601                                                          statbuf->st_mode, statbuf->st_uid, statbuf->st_gid,
1602                                                          statbuf->st_mtime);
1603
1604                 switch (rc)
1605                 {
1606                         case TAR_OK:
1607                                 break;
1608                         case TAR_NAME_TOO_LONG:
1609                                 ereport(ERROR,
1610                                                 (errmsg("file name too long for tar format: \"%s\"",
1611                                                                 filename)));
1612                                 break;
1613                         case TAR_SYMLINK_TOO_LONG:
1614                                 ereport(ERROR,
1615                                                 (errmsg("symbolic link target too long for tar format: "
1616                                                                 "file name \"%s\", target \"%s\"",
1617                                                                 filename, linktarget)));
1618                                 break;
1619                         default:
1620                                 elog(ERROR, "unrecognized tar error: %d", rc);
1621                 }
1622
1623                 pq_putmessage('d', h, sizeof(h));
1624         }
1625
1626         return sizeof(h);
1627 }
1628
1629 /*
1630  * Write tar header for a directory.  If the entry in statbuf is a link then
1631  * write it as a directory anyway.
1632  */
1633 static int64
1634 _tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf,
1635                          bool sizeonly)
1636 {
1637         /* If symlink, write it as a directory anyway */
1638 #ifndef WIN32
1639         if (S_ISLNK(statbuf->st_mode))
1640 #else
1641         if (pgwin32_is_junction(pathbuf))
1642 #endif
1643                 statbuf->st_mode = S_IFDIR | pg_dir_create_mode;
1644
1645         return _tarWriteHeader(pathbuf + basepathlen + 1, NULL, statbuf, sizeonly);
1646 }
1647
1648 /*
1649  * Increment the network transfer counter by the given number of bytes,
1650  * and sleep if necessary to comply with the requested network transfer
1651  * rate.
1652  */
1653 static void
1654 throttle(size_t increment)
1655 {
1656         TimeOffset      elapsed_min;
1657
1658         if (throttling_counter < 0)
1659                 return;
1660
1661         throttling_counter += increment;
1662         if (throttling_counter < throttling_sample)
1663                 return;
1664
1665         /* How much time should have elapsed at minimum? */
1666         elapsed_min = elapsed_min_unit *
1667                 (throttling_counter / throttling_sample);
1668
1669         /*
1670          * Since the latch could be set repeatedly because of concurrently WAL
1671          * activity, sleep in a loop to ensure enough time has passed.
1672          */
1673         for (;;)
1674         {
1675                 TimeOffset      elapsed,
1676                                         sleep;
1677                 int                     wait_result;
1678
1679                 /* Time elapsed since the last measurement (and possible wake up). */
1680                 elapsed = GetCurrentTimestamp() - throttled_last;
1681
1682                 /* sleep if the transfer is faster than it should be */
1683                 sleep = elapsed_min - elapsed;
1684                 if (sleep <= 0)
1685                         break;
1686
1687                 ResetLatch(MyLatch);
1688
1689                 /* We're eating a potentially set latch, so check for interrupts */
1690                 CHECK_FOR_INTERRUPTS();
1691
1692                 /*
1693                  * (TAR_SEND_SIZE / throttling_sample * elapsed_min_unit) should be
1694                  * the maximum time to sleep. Thus the cast to long is safe.
1695                  */
1696                 wait_result = WaitLatch(MyLatch,
1697                                                                 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1698                                                                 (long) (sleep / 1000),
1699                                                                 WAIT_EVENT_BASE_BACKUP_THROTTLE);
1700
1701                 if (wait_result & WL_LATCH_SET)
1702                         CHECK_FOR_INTERRUPTS();
1703
1704                 /* Done waiting? */
1705                 if (wait_result & WL_TIMEOUT)
1706                         break;
1707         }
1708
1709         /*
1710          * As we work with integers, only whole multiple of throttling_sample was
1711          * processed. The rest will be done during the next call of this function.
1712          */
1713         throttling_counter %= throttling_sample;
1714
1715         /*
1716          * Time interval for the remaining amount and possible next increments
1717          * starts now.
1718          */
1719         throttled_last = GetCurrentTimestamp();
1720 }