1 /*-------------------------------------------------------------------------
4 * code for taking a base backup and streaming it to a standby
6 * Portions Copyright (c) 2010-2019, PostgreSQL Global Development Group
9 * src/backend/replication/basebackup.c
11 *-------------------------------------------------------------------------
19 #include "access/xlog_internal.h" /* for pg_start/stop_backup */
20 #include "catalog/pg_type.h"
21 #include "common/file_perm.h"
22 #include "lib/stringinfo.h"
23 #include "libpq/libpq.h"
24 #include "libpq/pqformat.h"
25 #include "miscadmin.h"
26 #include "nodes/pg_list.h"
30 #include "postmaster/syslogger.h"
31 #include "replication/basebackup.h"
32 #include "replication/walsender.h"
33 #include "replication/walsender_private.h"
34 #include "storage/bufpage.h"
35 #include "storage/checksum.h"
36 #include "storage/dsm_impl.h"
37 #include "storage/fd.h"
38 #include "storage/ipc.h"
39 #include "storage/reinit.h"
40 #include "utils/builtins.h"
41 #include "utils/ps_status.h"
42 #include "utils/relcache.h"
43 #include "utils/timestamp.h"
54 bool sendtblspcmapfile;
58 static int64 sendDir(const char *path, int basepathlen, bool sizeonly,
59 List *tablespaces, bool sendtblspclinks);
60 static bool sendFile(const char *readfilename, const char *tarfilename,
61 struct stat *statbuf, bool missing_ok, Oid dboid);
62 static void sendFileWithContent(const char *filename, const char *content);
63 static int64 _tarWriteHeader(const char *filename, const char *linktarget,
64 struct stat *statbuf, bool sizeonly);
65 static int64 _tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf,
67 static void send_int8_string(StringInfoData *buf, int64 intval);
68 static void SendBackupHeader(List *tablespaces);
69 static void base_backup_cleanup(int code, Datum arg);
70 static void perform_base_backup(basebackup_options *opt);
71 static void parse_basebackup_options(List *options, basebackup_options *opt);
72 static void SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli);
73 static int compareWalFileNames(const void *a, const void *b);
74 static void throttle(size_t increment);
75 static bool is_checksummed_file(const char *fullpath, const char *filename);
77 /* Was the backup currently in-progress initiated in recovery mode? */
78 static bool backup_started_in_recovery = false;
80 /* Relative path of temporary statistics directory */
81 static char *statrelpath = NULL;
84 * Size of each block sent into the tar stream for larger files.
86 #define TAR_SEND_SIZE 32768
89 * How frequently to throttle, as a fraction of the specified rate-second.
91 #define THROTTLING_FREQUENCY 8
93 /* The actual number of bytes, transfer of which may cause sleep. */
94 static uint64 throttling_sample;
96 /* Amount of data already transferred but not yet throttled. */
97 static int64 throttling_counter;
99 /* The minimum time required to transfer throttling_sample bytes. */
100 static TimeOffset elapsed_min_unit;
102 /* The last check of the transfer rate. */
103 static TimestampTz throttled_last;
105 /* The starting XLOG position of the base backup. */
106 static XLogRecPtr startptr;
108 /* Total number of checksum failures during base backup. */
109 static long long int total_checksum_failures;
111 /* Do not verify checksums. */
112 static bool noverify_checksums = false;
115 * The contents of these directories are removed or recreated during server
116 * start so they are not included in backups. The directories themselves are
117 * kept and included as empty to preserve access permissions.
119 * Note: this list should be kept in sync with the filter lists in pg_rewind's
122 static const char *excludeDirContents[] =
125 * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped even
126 * when stats_temp_directory is set because PGSS_TEXT_FILE is always
132 * It is generally not useful to backup the contents of this directory
133 * even if the intention is to restore to another master. See backup.sgml
134 * for a more detailed description.
138 /* Contents removed on startup, see dsm_cleanup_for_mmap(). */
141 /* Contents removed on startup, see AsyncShmemInit(). */
145 * Old contents are loaded for possible debugging but are not required for
146 * normal operation, see OldSerXidInit().
150 /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
153 /* Contents zeroed on startup, see StartupSUBTRANS(). */
161 * List of files excluded from backups.
163 static const char *excludeFiles[] =
165 /* Skip auto conf temporary file. */
166 PG_AUTOCONF_FILENAME ".tmp",
168 /* Skip current log file temporary file */
169 LOG_METAINFO_DATAFILE_TMP,
171 /* Skip relation cache because it is rebuilt on startup */
172 RELCACHE_INIT_FILENAME,
175 * If there's a backup_label or tablespace_map file, it belongs to a
176 * backup started by the user with pg_start_backup(). It is *not* correct
177 * for this backup. Our backup_label/tablespace_map is injected into the
191 * List of files excluded from checksum validation.
193 * Note: this list should be kept in sync with what pg_checksums.c
196 static const char *const noChecksumFiles[] = {
202 "config_exec_params",
203 "config_exec_params.new",
210 * Called when ERROR or FATAL happens in perform_base_backup() after
211 * we have started the backup - make sure we end it!
214 base_backup_cleanup(int code, Datum arg)
216 do_pg_abort_backup();
220 * Actually do a base backup for the specified tablespaces.
222 * This is split out mainly to avoid complaints about "variable might be
223 * clobbered by longjmp" from stupider versions of gcc.
226 perform_base_backup(basebackup_options *opt)
231 StringInfo labelfile;
232 StringInfo tblspc_map_file = NULL;
234 List *tablespaces = NIL;
236 datadirpathlen = strlen(DataDir);
238 backup_started_in_recovery = RecoveryInProgress();
240 labelfile = makeStringInfo();
241 tblspc_map_file = makeStringInfo();
243 total_checksum_failures = 0;
245 startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
246 labelfile, &tablespaces,
248 opt->progress, opt->sendtblspcmapfile);
251 * Once do_pg_start_backup has been called, ensure that any failure causes
252 * us to abort the backup so we don't "leak" a backup counter. For this
253 * reason, *all* functionality between do_pg_start_backup() and the end of
254 * do_pg_stop_backup() should be inside the error cleanup block!
257 PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
262 SendXlogRecPtrResult(startptr, starttli);
265 * Calculate the relative path of temporary statistics directory in
266 * order to skip the files which are located in that directory later.
268 if (is_absolute_path(pgstat_stat_directory) &&
269 strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0)
270 statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1);
271 else if (strncmp(pgstat_stat_directory, "./", 2) != 0)
272 statrelpath = psprintf("./%s", pgstat_stat_directory);
274 statrelpath = pgstat_stat_directory;
276 /* Add a node for the base directory at the end */
277 ti = palloc0(sizeof(tablespaceinfo));
278 ti->size = opt->progress ? sendDir(".", 1, true, tablespaces, true) : -1;
279 tablespaces = lappend(tablespaces, ti);
281 /* Send tablespace header */
282 SendBackupHeader(tablespaces);
284 /* Setup and activate network throttling, if client requested it */
285 if (opt->maxrate > 0)
288 (int64) opt->maxrate * (int64) 1024 / THROTTLING_FREQUENCY;
291 * The minimum amount of time for throttling_sample bytes to be
294 elapsed_min_unit = USECS_PER_SEC / THROTTLING_FREQUENCY;
296 /* Enable throttling. */
297 throttling_counter = 0;
299 /* The 'real data' starts now (header was ignored). */
300 throttled_last = GetCurrentTimestamp();
304 /* Disable throttling. */
305 throttling_counter = -1;
308 /* Send off our tablespaces one by one */
309 foreach(lc, tablespaces)
311 tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
314 /* Send CopyOutResponse message */
315 pq_beginmessage(&buf, 'H');
316 pq_sendbyte(&buf, 0); /* overall format */
317 pq_sendint16(&buf, 0); /* natts */
320 if (ti->path == NULL)
324 /* In the main tar, include the backup_label first... */
325 sendFileWithContent(BACKUP_LABEL_FILE, labelfile->data);
328 * Send tablespace_map file if required and then the bulk of
331 if (tblspc_map_file && opt->sendtblspcmapfile)
333 sendFileWithContent(TABLESPACE_MAP, tblspc_map_file->data);
334 sendDir(".", 1, false, tablespaces, false);
337 sendDir(".", 1, false, tablespaces, true);
339 /* ... and pg_control after everything else. */
340 if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
342 (errcode_for_file_access(),
343 errmsg("could not stat file \"%s\": %m",
344 XLOG_CONTROL_FILE)));
345 sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false, InvalidOid);
348 sendTablespace(ti->path, false);
351 * If we're including WAL, and this is the main data directory we
352 * don't terminate the tar stream here. Instead, we will append
353 * the xlog files below and terminate it then. This is safe since
354 * the main data directory is always sent *last*.
356 if (opt->includewal && ti->path == NULL)
358 Assert(lnext(lc) == NULL);
361 pq_putemptymessage('c'); /* CopyDone */
364 endptr = do_pg_stop_backup(labelfile->data, !opt->nowait, &endtli);
366 PG_END_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
372 * We've left the last tar file "open", so we can now append the
373 * required WAL files to it.
375 char pathbuf[MAXPGPATH];
377 XLogSegNo startsegno;
380 List *historyFileList = NIL;
381 List *walFileList = NIL;
384 char firstoff[MAXFNAMELEN];
385 char lastoff[MAXFNAMELEN];
393 * I'd rather not worry about timelines here, so scan pg_wal and
394 * include all WAL files in the range between 'startptr' and 'endptr',
395 * regardless of the timeline the file is stamped with. If there are
396 * some spurious WAL files belonging to timelines that don't belong in
397 * this server's history, they will be included too. Normally there
398 * shouldn't be such files, but if there are, there's little harm in
401 XLByteToSeg(startptr, startsegno, wal_segment_size);
402 XLogFileName(firstoff, ThisTimeLineID, startsegno, wal_segment_size);
403 XLByteToPrevSeg(endptr, endsegno, wal_segment_size);
404 XLogFileName(lastoff, ThisTimeLineID, endsegno, wal_segment_size);
406 dir = AllocateDir("pg_wal");
407 while ((de = ReadDir(dir, "pg_wal")) != NULL)
409 /* Does it look like a WAL segment, and is it in the range? */
410 if (IsXLogFileName(de->d_name) &&
411 strcmp(de->d_name + 8, firstoff + 8) >= 0 &&
412 strcmp(de->d_name + 8, lastoff + 8) <= 0)
414 walFileList = lappend(walFileList, pstrdup(de->d_name));
416 /* Does it look like a timeline history file? */
417 else if (IsTLHistoryFileName(de->d_name))
419 historyFileList = lappend(historyFileList, pstrdup(de->d_name));
425 * Before we go any further, check that none of the WAL segments we
428 CheckXLogRemoved(startsegno, ThisTimeLineID);
431 * Put the WAL filenames into an array, and sort. We send the files in
432 * order from oldest to newest, to reduce the chance that a file is
433 * recycled before we get a chance to send it over.
435 nWalFiles = list_length(walFileList);
436 walFiles = palloc(nWalFiles * sizeof(char *));
438 foreach(lc, walFileList)
440 walFiles[i++] = lfirst(lc);
442 qsort(walFiles, nWalFiles, sizeof(char *), compareWalFileNames);
445 * There must be at least one xlog file in the pg_wal directory, since
446 * we are doing backup-including-xlog.
450 (errmsg("could not find any WAL files")));
453 * Sanity check: the first and last segment should cover startptr and
454 * endptr, with no gaps in between.
456 XLogFromFileName(walFiles[0], &tli, &segno, wal_segment_size);
457 if (segno != startsegno)
459 char startfname[MAXFNAMELEN];
461 XLogFileName(startfname, ThisTimeLineID, startsegno,
464 (errmsg("could not find WAL file \"%s\"", startfname)));
466 for (i = 0; i < nWalFiles; i++)
468 XLogSegNo currsegno = segno;
469 XLogSegNo nextsegno = segno + 1;
471 XLogFromFileName(walFiles[i], &tli, &segno, wal_segment_size);
472 if (!(nextsegno == segno || currsegno == segno))
474 char nextfname[MAXFNAMELEN];
476 XLogFileName(nextfname, ThisTimeLineID, nextsegno,
479 (errmsg("could not find WAL file \"%s\"", nextfname)));
482 if (segno != endsegno)
484 char endfname[MAXFNAMELEN];
486 XLogFileName(endfname, ThisTimeLineID, endsegno, wal_segment_size);
488 (errmsg("could not find WAL file \"%s\"", endfname)));
491 /* Ok, we have everything we need. Send the WAL files. */
492 for (i = 0; i < nWalFiles; i++)
495 char buf[TAR_SEND_SIZE];
499 snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFiles[i]);
500 XLogFromFileName(walFiles[i], &tli, &segno, wal_segment_size);
502 fp = AllocateFile(pathbuf, "rb");
505 int save_errno = errno;
508 * Most likely reason for this is that the file was already
509 * removed by a checkpoint, so check for that to get a better
512 CheckXLogRemoved(segno, tli);
516 (errcode_for_file_access(),
517 errmsg("could not open file \"%s\": %m", pathbuf)));
520 if (fstat(fileno(fp), &statbuf) != 0)
522 (errcode_for_file_access(),
523 errmsg("could not stat file \"%s\": %m",
525 if (statbuf.st_size != wal_segment_size)
527 CheckXLogRemoved(segno, tli);
529 (errcode_for_file_access(),
530 errmsg("unexpected WAL file size \"%s\"", walFiles[i])));
533 /* send the WAL file itself */
534 _tarWriteHeader(pathbuf, NULL, &statbuf, false);
536 while ((cnt = fread(buf, 1,
537 Min(sizeof(buf), wal_segment_size - len),
540 CheckXLogRemoved(segno, tli);
541 /* Send the chunk as a CopyData message */
542 if (pq_putmessage('d', buf, cnt))
544 (errmsg("base backup could not send data, aborting backup")));
549 if (len == wal_segment_size)
553 if (len != wal_segment_size)
555 CheckXLogRemoved(segno, tli);
557 (errcode_for_file_access(),
558 errmsg("unexpected WAL file size \"%s\"", walFiles[i])));
561 /* wal_segment_size is a multiple of 512, so no need for padding */
566 * Mark file as archived, otherwise files can get archived again
567 * after promotion of a new node. This is in line with
568 * walreceiver.c always doing an XLogArchiveForceDone() after a
571 StatusFilePath(pathbuf, walFiles[i], ".done");
572 sendFileWithContent(pathbuf, "");
576 * Send timeline history files too. Only the latest timeline history
577 * file is required for recovery, and even that only if there happens
578 * to be a timeline switch in the first WAL segment that contains the
579 * checkpoint record, or if we're taking a base backup from a standby
580 * server and the target timeline changes while the backup is taken.
581 * But they are small and highly useful for debugging purposes, so
582 * better include them all, always.
584 foreach(lc, historyFileList)
586 char *fname = lfirst(lc);
588 snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname);
590 if (lstat(pathbuf, &statbuf) != 0)
592 (errcode_for_file_access(),
593 errmsg("could not stat file \"%s\": %m", pathbuf)));
595 sendFile(pathbuf, pathbuf, &statbuf, false, InvalidOid);
597 /* unconditionally mark file as archived */
598 StatusFilePath(pathbuf, fname, ".done");
599 sendFileWithContent(pathbuf, "");
602 /* Send CopyDone message for the last tar file */
603 pq_putemptymessage('c');
605 SendXlogRecPtrResult(endptr, endtli);
607 if (total_checksum_failures)
609 if (total_checksum_failures > 1)
611 (errmsg("%lld total checksum verification failures", total_checksum_failures)));
614 (errcode(ERRCODE_DATA_CORRUPTED),
615 errmsg("checksum verification failure during base backup")));
/*
 * qsort comparison function, to compare log/seg portion of WAL segment
 * filenames, ignoring the timeline portion.
 *
 * 'a' and 'b' each point to one element of an array of C-string pointers.
 * The first 8 characters of each name are skipped before comparing, so every
 * name must be longer than that.  Returns the strcmp() ordering of the
 * remainders.
 */
static int
compareWalFileNames(const void *a, const void *b)
{
	/*
	 * qsort hands us pointers-to-const; read the elements through a
	 * const-qualified lvalue instead of casting the qualifier away.
	 */
	const char *fna = *(char *const *) a;
	const char *fnb = *(char *const *) b;

	return strcmp(fna + 8, fnb + 8);
}
634 * Parse the base backup options passed down by the parser
637 parse_basebackup_options(List *options, basebackup_options *opt)
640 bool o_label = false;
641 bool o_progress = false;
643 bool o_nowait = false;
645 bool o_maxrate = false;
646 bool o_tablespace_map = false;
647 bool o_noverify_checksums = false;
649 MemSet(opt, 0, sizeof(*opt));
650 foreach(lopt, options)
652 DefElem *defel = (DefElem *) lfirst(lopt);
654 if (strcmp(defel->defname, "label") == 0)
658 (errcode(ERRCODE_SYNTAX_ERROR),
659 errmsg("duplicate option \"%s\"", defel->defname)));
660 opt->label = strVal(defel->arg);
663 else if (strcmp(defel->defname, "progress") == 0)
667 (errcode(ERRCODE_SYNTAX_ERROR),
668 errmsg("duplicate option \"%s\"", defel->defname)));
669 opt->progress = true;
672 else if (strcmp(defel->defname, "fast") == 0)
676 (errcode(ERRCODE_SYNTAX_ERROR),
677 errmsg("duplicate option \"%s\"", defel->defname)));
678 opt->fastcheckpoint = true;
681 else if (strcmp(defel->defname, "nowait") == 0)
685 (errcode(ERRCODE_SYNTAX_ERROR),
686 errmsg("duplicate option \"%s\"", defel->defname)));
690 else if (strcmp(defel->defname, "wal") == 0)
694 (errcode(ERRCODE_SYNTAX_ERROR),
695 errmsg("duplicate option \"%s\"", defel->defname)));
696 opt->includewal = true;
699 else if (strcmp(defel->defname, "max_rate") == 0)
705 (errcode(ERRCODE_SYNTAX_ERROR),
706 errmsg("duplicate option \"%s\"", defel->defname)));
708 maxrate = intVal(defel->arg);
709 if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER)
711 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
712 errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
713 (int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER)));
715 opt->maxrate = (uint32) maxrate;
718 else if (strcmp(defel->defname, "tablespace_map") == 0)
720 if (o_tablespace_map)
722 (errcode(ERRCODE_SYNTAX_ERROR),
723 errmsg("duplicate option \"%s\"", defel->defname)));
724 opt->sendtblspcmapfile = true;
725 o_tablespace_map = true;
727 else if (strcmp(defel->defname, "noverify_checksums") == 0)
729 if (o_noverify_checksums)
731 (errcode(ERRCODE_SYNTAX_ERROR),
732 errmsg("duplicate option \"%s\"", defel->defname)));
733 noverify_checksums = true;
734 o_noverify_checksums = true;
737 elog(ERROR, "option \"%s\" not recognized",
740 if (opt->label == NULL)
741 opt->label = "base backup";
746 * SendBaseBackup() - send a complete base backup.
748 * The function will put the system into backup mode like pg_start_backup()
749 * does, so that the backup is consistent even though we read directly from
750 * the filesystem, bypassing the buffer cache.
753 SendBaseBackup(BaseBackupCmd *cmd)
755 basebackup_options opt;
757 parse_basebackup_options(cmd->options, &opt);
759 WalSndSetState(WALSNDSTATE_BACKUP);
761 if (update_process_title)
763 char activitymsg[50];
765 snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"",
767 set_ps_display(activitymsg, false);
770 perform_base_backup(&opt);
774 send_int8_string(StringInfoData *buf, int64 intval)
778 sprintf(is, INT64_FORMAT, intval);
779 pq_sendint32(buf, strlen(is));
780 pq_sendbytes(buf, is, strlen(is));
784 SendBackupHeader(List *tablespaces)
789 /* Construct and send the directory information */
790 pq_beginmessage(&buf, 'T'); /* RowDescription */
791 pq_sendint16(&buf, 3); /* 3 fields */
793 /* First field - spcoid */
794 pq_sendstring(&buf, "spcoid");
795 pq_sendint32(&buf, 0); /* table oid */
796 pq_sendint16(&buf, 0); /* attnum */
797 pq_sendint32(&buf, OIDOID); /* type oid */
798 pq_sendint16(&buf, 4); /* typlen */
799 pq_sendint32(&buf, 0); /* typmod */
800 pq_sendint16(&buf, 0); /* format code */
802 /* Second field - spclocation */
803 pq_sendstring(&buf, "spclocation");
804 pq_sendint32(&buf, 0);
805 pq_sendint16(&buf, 0);
806 pq_sendint32(&buf, TEXTOID);
807 pq_sendint16(&buf, -1);
808 pq_sendint32(&buf, 0);
809 pq_sendint16(&buf, 0);
811 /* Third field - size */
812 pq_sendstring(&buf, "size");
813 pq_sendint32(&buf, 0);
814 pq_sendint16(&buf, 0);
815 pq_sendint32(&buf, INT8OID);
816 pq_sendint16(&buf, 8);
817 pq_sendint32(&buf, 0);
818 pq_sendint16(&buf, 0);
821 foreach(lc, tablespaces)
823 tablespaceinfo *ti = lfirst(lc);
825 /* Send one datarow message */
826 pq_beginmessage(&buf, 'D');
827 pq_sendint16(&buf, 3); /* number of columns */
828 if (ti->path == NULL)
830 pq_sendint32(&buf, -1); /* Length = -1 ==> NULL */
831 pq_sendint32(&buf, -1);
837 len = strlen(ti->oid);
838 pq_sendint32(&buf, len);
839 pq_sendbytes(&buf, ti->oid, len);
841 len = strlen(ti->path);
842 pq_sendint32(&buf, len);
843 pq_sendbytes(&buf, ti->path, len);
846 send_int8_string(&buf, ti->size / 1024);
848 pq_sendint32(&buf, -1); /* NULL */
853 /* Send a CommandComplete message */
854 pq_puttextmessage('C', "SELECT");
858 * Send a single resultset containing just a single
859 * XLogRecPtr record (in text format)
862 SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli)
865 char str[MAXFNAMELEN];
868 pq_beginmessage(&buf, 'T'); /* RowDescription */
869 pq_sendint16(&buf, 2); /* 2 fields */
872 pq_sendstring(&buf, "recptr");
873 pq_sendint32(&buf, 0); /* table oid */
874 pq_sendint16(&buf, 0); /* attnum */
875 pq_sendint32(&buf, TEXTOID); /* type oid */
876 pq_sendint16(&buf, -1);
877 pq_sendint32(&buf, 0);
878 pq_sendint16(&buf, 0);
880 pq_sendstring(&buf, "tli");
881 pq_sendint32(&buf, 0); /* table oid */
882 pq_sendint16(&buf, 0); /* attnum */
885 * int8 may seem like a surprising data type for this, but in theory int4
886 * would not be wide enough for this, as TimeLineID is unsigned.
888 pq_sendint32(&buf, INT8OID); /* type oid */
889 pq_sendint16(&buf, -1);
890 pq_sendint32(&buf, 0);
891 pq_sendint16(&buf, 0);
895 pq_beginmessage(&buf, 'D');
896 pq_sendint16(&buf, 2); /* number of columns */
898 len = snprintf(str, sizeof(str),
899 "%X/%X", (uint32) (ptr >> 32), (uint32) ptr);
900 pq_sendint32(&buf, len);
901 pq_sendbytes(&buf, str, len);
903 len = snprintf(str, sizeof(str), "%u", tli);
904 pq_sendint32(&buf, len);
905 pq_sendbytes(&buf, str, len);
909 /* Send a CommandComplete message */
910 pq_puttextmessage('C', "SELECT");
914 * Inject a file with given name and content in the output tar stream.
917 sendFileWithContent(const char *filename, const char *content)
923 len = strlen(content);
926 * Construct a stat struct for the backup_label file we're injecting in
929 /* Windows doesn't have the concept of uid and gid */
934 statbuf.st_uid = geteuid();
935 statbuf.st_gid = getegid();
937 statbuf.st_mtime = time(NULL);
938 statbuf.st_mode = pg_file_create_mode;
939 statbuf.st_size = len;
941 _tarWriteHeader(filename, NULL, &statbuf, false);
942 /* Send the contents as a CopyData message */
943 pq_putmessage('d', content, len);
945 /* Pad to 512 byte boundary, per tar format requirements */
946 pad = ((len + 511) & ~511) - len;
952 pq_putmessage('d', buf, pad);
957 * Include the tablespace directory pointed to by 'path' in the output tar
958 * stream. If 'sizeonly' is true, we just calculate a total length and return
959 * it, without actually sending anything.
961 * Only used to send auxiliary tablespaces, not PGDATA.
964 sendTablespace(char *path, bool sizeonly)
967 char pathbuf[MAXPGPATH];
971 * 'path' points to the tablespace location, but we only want to include
972 * the version directory in it that belongs to us.
974 snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path,
975 TABLESPACE_VERSION_DIRECTORY);
978 * Store a directory entry in the tar file so we get the permissions
981 if (lstat(pathbuf, &statbuf) != 0)
985 (errcode_for_file_access(),
986 errmsg("could not stat file or directory \"%s\": %m",
989 /* If the tablespace went away while scanning, it's no error. */
993 size = _tarWriteHeader(TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf,
996 /* Send all the files in the tablespace version directory */
997 size += sendDir(pathbuf, strlen(path), sizeonly, NIL, true);
1003 * Include all files from the given directory in the output tar stream. If
1004 * 'sizeonly' is true, we just calculate a total length and return it, without
1005 * actually sending anything.
1007 * Omit any directory in the tablespaces list, to avoid backing up
1008 * tablespaces twice when they were created inside PGDATA.
1010 * If sendtblspclinks is true, we need to include symlink
1011 * information in the tar file. If not, we can skip that
1012 * as it will be sent separately in the tablespace_map file.
1015 sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
1016 bool sendtblspclinks)
1020 char pathbuf[MAXPGPATH * 2];
1021 struct stat statbuf;
1023 const char *lastDir; /* Split last dir from parent path. */
1024 bool isDbDir = false; /* Does this directory contain relations? */
1027 * Determine if the current path is a database directory that can contain
1030 * Start by finding the location of the delimiter between the parent path
1031 * and the current path.
1033 lastDir = last_dir_separator(path);
1035 /* Does this path look like a database path (i.e. all digits)? */
1036 if (lastDir != NULL &&
1037 strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1))
1039 /* Part of path that contains the parent directory. */
1040 int parentPathLen = lastDir - path;
1043 * Mark path as a database directory if the parent path is either
1044 * $PGDATA/base or a tablespace version path.
1046 if (strncmp(path, "./base", parentPathLen) == 0 ||
1047 (parentPathLen >= (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) &&
1048 strncmp(lastDir - (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1),
1049 TABLESPACE_VERSION_DIRECTORY,
1050 sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) == 0))
1054 dir = AllocateDir(path);
1055 while ((de = ReadDir(dir, path)) != NULL)
1059 ForkNumber relForkNum; /* Type of fork if file is a relation */
1060 int relOidChars; /* Chars in filename that are the rel oid */
1062 /* Skip special stuff */
1063 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
1066 /* Skip temporary files */
1067 if (strncmp(de->d_name,
1068 PG_TEMP_FILE_PREFIX,
1069 strlen(PG_TEMP_FILE_PREFIX)) == 0)
1073 * Check if the postmaster has signaled us to exit, and abort with an
1074 * error in that case. The error handler further up will call
1075 * do_pg_abort_backup() for us. Also check that if the backup was
1076 * started while still in recovery, the server wasn't promoted.
1077 * do_pg_stop_backup() will check that too, but it's better to stop
1078 * the backup early than continue to the end and fail there.
1080 CHECK_FOR_INTERRUPTS();
1081 if (RecoveryInProgress() != backup_started_in_recovery)
1083 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1084 errmsg("the standby was promoted during online backup"),
1085 errhint("This means that the backup being taken is corrupt "
1086 "and should not be used. "
1087 "Try taking another online backup.")));
1089 /* Scan for files that should be excluded */
1090 excludeFound = false;
1091 for (excludeIdx = 0; excludeFiles[excludeIdx] != NULL; excludeIdx++)
1093 if (strcmp(de->d_name, excludeFiles[excludeIdx]) == 0)
1095 elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name);
1096 excludeFound = true;
1104 /* Exclude all forks for unlogged tables except the init fork */
1106 parse_filename_for_nontemp_relation(de->d_name, &relOidChars,
1109 /* Never exclude init forks */
1110 if (relForkNum != INIT_FORKNUM)
1112 char initForkFile[MAXPGPATH];
1113 char relOid[OIDCHARS + 1];
1116 * If any other type of fork, check if there is an init fork
1117 * with the same OID. If so, the file can be excluded.
1119 memcpy(relOid, de->d_name, relOidChars);
1120 relOid[relOidChars] = '\0';
1121 snprintf(initForkFile, sizeof(initForkFile), "%s/%s_init",
1124 if (lstat(initForkFile, &statbuf) == 0)
1127 "unlogged relation file \"%s\" excluded from backup",
1135 /* Exclude temporary relations */
1136 if (isDbDir && looks_like_temp_rel_name(de->d_name))
1139 "temporary relation file \"%s\" excluded from backup",
1145 snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name);
1147 /* Skip pg_control here so that it can be backed up last */
1148 if (strcmp(pathbuf, "./global/pg_control") == 0)
1151 if (lstat(pathbuf, &statbuf) != 0)
1153 if (errno != ENOENT)
1155 (errcode_for_file_access(),
1156 errmsg("could not stat file or directory \"%s\": %m",
1159 /* If the file went away while scanning, it's not an error. */
1163 /* Scan for directories whose contents should be excluded */
1164 excludeFound = false;
1165 for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
1167 if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0)
1169 elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name);
1170 size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
1171 excludeFound = true;
1180 * Exclude contents of directory specified by statrelpath if not set
1181 * to the default (pg_stat_tmp) which is caught in the loop above.
1183 if (statrelpath != NULL && strcmp(pathbuf, statrelpath) == 0)
1185 elog(DEBUG1, "contents of directory \"%s\" excluded from backup", statrelpath);
1186 size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
1191 * We can skip pg_wal, the WAL segments need to be fetched from the
1192 * WAL archive anyway. But include it as an empty directory anyway, so
1193 * we get permissions right.
1195 if (strcmp(pathbuf, "./pg_wal") == 0)
1197 /* If pg_wal is a symlink, write it as a directory anyway */
1198 size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
1201 * Also send archive_status directory (by hackishly reusing
1202 * statbuf from above ...).
1204 size += _tarWriteHeader("./pg_wal/archive_status", NULL, &statbuf,
1207 continue; /* don't recurse into pg_wal */
1210 /* Allow symbolic links in pg_tblspc only */
1211 if (strcmp(path, "./pg_tblspc") == 0 &&
1213 S_ISLNK(statbuf.st_mode)
1215 pgwin32_is_junction(pathbuf)
1219 #if defined(HAVE_READLINK) || defined(WIN32)
1220 char linkpath[MAXPGPATH];
1223 rllen = readlink(pathbuf, linkpath, sizeof(linkpath));
1226 (errcode_for_file_access(),
1227 errmsg("could not read symbolic link \"%s\": %m",
1229 if (rllen >= sizeof(linkpath))
1231 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1232 errmsg("symbolic link \"%s\" target is too long",
1234 linkpath[rllen] = '\0';
1236 size += _tarWriteHeader(pathbuf + basepathlen + 1, linkpath,
1237 &statbuf, sizeonly);
1241 * If the platform does not have symbolic links, it should not be
1242 * possible to have tablespaces - clearly somebody else created
1243 * them. Warn about it and ignore.
1246 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1247 errmsg("tablespaces are not supported on this platform")));
1249 #endif /* HAVE_READLINK */
1251 else if (S_ISDIR(statbuf.st_mode))
1253 bool skip_this_dir = false;
1257 * Store a directory entry in the tar file so we can get the
1258 * permissions right.
1260 size += _tarWriteHeader(pathbuf + basepathlen + 1, NULL, &statbuf,
1264 * Call ourselves recursively for a directory, unless it happens
1265 * to be a separate tablespace located within PGDATA.
1267 foreach(lc, tablespaces)
1269 tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
1272 * ti->rpath is the tablespace relative path within PGDATA, or
1273 * NULL if the tablespace has been properly located somewhere
1276 * Skip past the leading "./" in pathbuf when comparing.
1278 if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0)
1280 skip_this_dir = true;
1286 * skip sending directories inside pg_tblspc, if not required.
1288 if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks)
1289 skip_this_dir = true;
1292 size += sendDir(pathbuf, basepathlen, sizeonly, tablespaces, sendtblspclinks);
1294 else if (S_ISREG(statbuf.st_mode))
1299 sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf,
1300 true, isDbDir ? pg_atoi(lastDir + 1, sizeof(Oid), 0) : InvalidOid);
1302 if (sent || sizeonly)
1304 /* Add size, rounded up to a 512-byte block */
1305 size += ((statbuf.st_size + 511) & ~511);
1306 size += 512; /* Size of the header of the file */
1311 (errmsg("skipping special file \"%s\"", pathbuf)));
1318 * Check if a file should have its checksum validated.
1319 * We validate checksums on files in regular tablespaces
1320 * (including global and default) only, and in those there
1321 * are some files that are explicitly excluded.
1324 is_checksummed_file(const char *fullpath, const char *filename)
1326 const char *const *f;
1328 /* Check that the file is in a tablespace */
1329 if (strncmp(fullpath, "./global/", 9) == 0 ||
1330 strncmp(fullpath, "./base/", 7) == 0 ||
1331 strncmp(fullpath, "/", 1) == 0)
1333 /* Compare file against noChecksumFiles skiplist */
1334 for (f = noChecksumFiles; *f; f++)
1335 if (strcmp(*f, filename) == 0)
1345 * Functions for handling tar file format
1347 * Copied from pg_dump, but modified to work with libpq for sending
1352 * Given the member, write the TAR header & send the file.
1354 * If 'missing_ok' is true, will not throw an error if the file is not found.
1356 * If dboid is anything other than InvalidOid then any checksum failures detected
1357 * will get reported to the stats collector.
1359 * Returns true if the file was successfully sent, false if 'missing_ok',
1360 * and the file did not exist.
1363 sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf,
1364 bool missing_ok, Oid dboid)
1367 BlockNumber blkno = 0;
1368 bool block_retry = false;
1369 char buf[TAR_SEND_SIZE];
1371 int checksum_failures = 0;
1380 bool verify_checksum = false;
1382 fp = AllocateFile(readfilename, "rb");
1385 if (errno == ENOENT && missing_ok)
1388 (errcode_for_file_access(),
1389 errmsg("could not open file \"%s\": %m", readfilename)));
1392 _tarWriteHeader(tarfilename, NULL, statbuf, false);
1394 if (!noverify_checksums && DataChecksumsEnabled())
1399 * Get the filename (excluding path). As last_dir_separator()
1400 * includes the last directory separator, we chop that off by
1401 * incrementing the pointer.
1403 filename = last_dir_separator(readfilename) + 1;
1405 if (is_checksummed_file(readfilename, filename))
1407 verify_checksum = true;
1410 * Cut off at the segment boundary (".") to get the segment number
1411 * in order to mix it into the checksum.
1413 segmentpath = strstr(filename, ".");
1414 if (segmentpath != NULL)
1416 segmentno = atoi(segmentpath + 1);
1419 (errmsg("invalid segment number %d in file \"%s\"",
1420 segmentno, filename)));
1425 while ((cnt = fread(buf, 1, Min(sizeof(buf), statbuf->st_size - len), fp)) > 0)
1428 * The checksums are verified at block level, so we iterate over the
1429 * buffer in chunks of BLCKSZ, after making sure that
1430 * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of
1433 Assert(TAR_SEND_SIZE % BLCKSZ == 0);
1435 if (verify_checksum && (cnt % BLCKSZ != 0))
1438 (errmsg("cannot verify checksum in file \"%s\", block "
1439 "%d: read buffer size %d and page size %d "
1441 readfilename, blkno, (int) cnt, BLCKSZ)));
1442 verify_checksum = false;
1445 if (verify_checksum)
1447 for (i = 0; i < cnt / BLCKSZ; i++)
1449 page = buf + BLCKSZ * i;
1452 * Only check pages which have not been modified since the
1453 * start of the base backup. Otherwise, they might have been
1454 * written only halfway and the checksum would not be valid.
1455 * However, replaying WAL would reinstate the correct page in
1456 * this case. We also skip completely new pages, since they
1457 * don't have a checksum yet.
1459 if (!PageIsNew(page) && PageGetLSN(page) < startptr)
1461 checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
1462 phdr = (PageHeader) page;
1463 if (phdr->pd_checksum != checksum)
1466 * Retry the block on the first failure. It's
1467 * possible that we read the first 4K page of the
1468 * block just before postgres updated the entire block
1469 * so it ends up looking torn to us. We only need to
1470 * retry once because the LSN should be updated to
1471 * something we can ignore on the next pass. If the
1472 * error happens again then it is a true validation
1475 if (block_retry == false)
1477 /* Reread the failed block */
1478 if (fseek(fp, -(cnt - BLCKSZ * i), SEEK_CUR) == -1)
1481 (errcode_for_file_access(),
1482 errmsg("could not fseek in file \"%s\": %m",
1486 if (fread(buf + BLCKSZ * i, 1, BLCKSZ, fp) != BLCKSZ)
1489 (errcode_for_file_access(),
1490 errmsg("could not reread block %d of file \"%s\": %m",
1491 blkno, readfilename)));
1494 if (fseek(fp, cnt - BLCKSZ * i - BLCKSZ, SEEK_CUR) == -1)
1497 (errcode_for_file_access(),
1498 errmsg("could not fseek in file \"%s\": %m",
1502 /* Set flag so we know a retry was attempted */
1505 /* Reset loop to validate the block again */
1510 checksum_failures++;
1512 if (checksum_failures <= 5)
1514 (errmsg("checksum verification failed in "
1515 "file \"%s\", block %d: calculated "
1516 "%X but expected %X",
1517 readfilename, blkno, checksum,
1518 phdr->pd_checksum)));
1519 if (checksum_failures == 5)
1521 (errmsg("further checksum verification "
1522 "failures in file \"%s\" will not "
1523 "be reported", readfilename)));
1526 block_retry = false;
1531 /* Send the chunk as a CopyData message */
1532 if (pq_putmessage('d', buf, cnt))
1534 (errmsg("base backup could not send data, aborting backup")));
1539 if (len >= statbuf->st_size)
1542 * Reached end of file. The file could be longer, if it was
1543 * extended while we were sending it, but for a base backup we can
1544 * ignore such extended data. It will be restored from WAL.
1550 /* If the file was truncated while we were sending it, pad it with zeros */
1551 if (len < statbuf->st_size)
1553 MemSet(buf, 0, sizeof(buf));
1554 while (len < statbuf->st_size)
1556 cnt = Min(sizeof(buf), statbuf->st_size - len);
1557 pq_putmessage('d', buf, cnt);
1564 * Pad to 512 byte boundary, per tar format requirements. (This small
1565 * piece of data is probably not worth throttling.)
1567 pad = ((len + 511) & ~511) - len;
1570 MemSet(buf, 0, pad);
1571 pq_putmessage('d', buf, pad);
1576 if (checksum_failures > 1)
1579 (errmsg("file \"%s\" has a total of %d checksum verification "
1580 "failures", readfilename, checksum_failures)));
1582 pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
1585 total_checksum_failures += checksum_failures;
1592 _tarWriteHeader(const char *filename, const char *linktarget,
1593 struct stat *statbuf, bool sizeonly)
1600 rc = tarCreateHeader(h, filename, linktarget, statbuf->st_size,
1601 statbuf->st_mode, statbuf->st_uid, statbuf->st_gid,
1608 case TAR_NAME_TOO_LONG:
1610 (errmsg("file name too long for tar format: \"%s\"",
1613 case TAR_SYMLINK_TOO_LONG:
1615 (errmsg("symbolic link target too long for tar format: "
1616 "file name \"%s\", target \"%s\"",
1617 filename, linktarget)));
1620 elog(ERROR, "unrecognized tar error: %d", rc);
1623 pq_putmessage('d', h, sizeof(h));
1630 * Write tar header for a directory. If the entry in statbuf is a link then
1631 * write it as a directory anyway.
1634 _tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf,
1637 /* If symlink, write it as a directory anyway */
1639 if (S_ISLNK(statbuf->st_mode))
1641 if (pgwin32_is_junction(pathbuf))
1643 statbuf->st_mode = S_IFDIR | pg_dir_create_mode;
1645 return _tarWriteHeader(pathbuf + basepathlen + 1, NULL, statbuf, sizeonly);
1649 * Increment the network transfer counter by the given number of bytes,
1650 * and sleep if necessary to comply with the requested network transfer
1654 throttle(size_t increment)
1656 TimeOffset elapsed_min;
1658 if (throttling_counter < 0)
1661 throttling_counter += increment;
1662 if (throttling_counter < throttling_sample)
1665 /* How much time should have elapsed at minimum? */
1666 elapsed_min = elapsed_min_unit *
1667 (throttling_counter / throttling_sample);
1670 * Since the latch could be set repeatedly because of concurrently WAL
1671 * activity, sleep in a loop to ensure enough time has passed.
1679 /* Time elapsed since the last measurement (and possible wake up). */
1680 elapsed = GetCurrentTimestamp() - throttled_last;
1682 /* sleep if the transfer is faster than it should be */
1683 sleep = elapsed_min - elapsed;
1687 ResetLatch(MyLatch);
1689 /* We're eating a potentially set latch, so check for interrupts */
1690 CHECK_FOR_INTERRUPTS();
1693 * (TAR_SEND_SIZE / throttling_sample * elapsed_min_unit) should be
1694 * the maximum time to sleep. Thus the cast to long is safe.
1696 wait_result = WaitLatch(MyLatch,
1697 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1698 (long) (sleep / 1000),
1699 WAIT_EVENT_BASE_BACKUP_THROTTLE);
1701 if (wait_result & WL_LATCH_SET)
1702 CHECK_FOR_INTERRUPTS();
1705 if (wait_result & WL_TIMEOUT)
1710 * As we work with integers, only whole multiple of throttling_sample was
1711 * processed. The rest will be done during the next call of this function.
1713 throttling_counter %= throttling_sample;
1716 * Time interval for the remaining amount and possible next increments
1719 throttled_last = GetCurrentTimestamp();