From f04216341dd1cc235e975f93ac806d9d3729a344 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 13 Jun 2013 22:35:56 -0400 Subject: [PATCH] Refactor checksumming code to make it easier to use externally. pg_filedump and other external utility programs are likely to want to be able to check Postgres page checksums. To avoid messy duplication of code, move the checksumming functionality into an exported header file, much as we did awhile back for the CRC code. In passing, get rid of an unportable assumption that a static char[] array will be word-aligned, and do some other minor code beautification. --- src/backend/storage/buffer/bufmgr.c | 6 +- src/backend/storage/page/bufpage.c | 99 ++++--------- src/backend/storage/page/checksum.c | 146 +------------------- src/include/storage/checksum.h | 9 +- src/include/storage/checksum_impl.h | 207 ++++++++++++++++++++++++++++ 5 files changed, 251 insertions(+), 216 deletions(-) create mode 100644 src/include/storage/checksum_impl.h diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 43eb7d59f4..c6b033cf41 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1982,9 +1982,13 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) * have been able to write it while we were busy with log flushing because * we have the io_in_progress lock. */ - bufBlock = BufHdrGetBlock(buf); + /* + * Update page checksum if desired. Since we have only shared lock on the + * buffer, other processes might be updating hint bits in it, so we must + * copy the page to private storage if we do checksumming. + */ bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); if (track_io_timing) diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index a5594bde64..36b88c5729 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -17,13 +17,12 @@ #include "access/htup_details.h" #include "access/xlog.h" #include "storage/checksum.h" +#include "utils/memutils.h" -bool ignore_checksum_failure = false; -static char pageCopyData[BLCKSZ]; /* for checksum calculation */ -static Page pageCopy = pageCopyData; +/* GUC variable */ +bool ignore_checksum_failure = false; -static uint16 PageCalcChecksum16(Page page, BlockNumber blkno); /* ---------------------------------------------------------------- * Page support functions @@ -94,7 +93,7 @@ PageIsVerified(Page page, BlockNumber blkno) { if (DataChecksumsEnabled()) { - checksum = PageCalcChecksum16(page, blkno); + checksum = pg_checksum_page((char *) page, blkno); if (checksum != p->pd_checksum) checksum_failure = true; @@ -885,13 +884,16 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) pfree(itemidbase); } + /* - * Set checksum for page in shared buffers. + * Set checksum for a page in shared buffers. * * If checksums are disabled, or if the page is not initialized, just return - * the input. Otherwise, we must make a copy of the page before calculating the - * checksum, to prevent concurrent modifications (e.g. setting hint bits) from - * making the final checksum invalid. + * the input. Otherwise, we must make a copy of the page before calculating + * the checksum, to prevent concurrent modifications (e.g. setting hint bits) + * from making the final checksum invalid. It doesn't matter if we include or + * exclude hints during the copy, as long as we write a valid page and + * associated checksum. * * Returns a pointer to the block-sized data that needs to be written. Uses * statically-allocated memory, so the caller must immediately write the @@ -900,79 +902,38 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) char * PageSetChecksumCopy(Page page, BlockNumber blkno) { + static char *pageCopy = NULL; + + /* If we don't need a checksum, just return the passed-in data */ if (PageIsNew(page) || !DataChecksumsEnabled()) return (char *) page; /* - * We make a copy iff we need to calculate a checksum because other - * backends may set hint bits on this page while we write, which would - * mean the checksum differs from the page contents. It doesn't matter if - * we include or exclude hints during the copy, as long as we write a - * valid page and associated checksum. + * We allocate the copy space once and use it over on each subsequent + * call. The point of palloc'ing here, rather than having a static char + * array, is first to ensure adequate alignment for the checksumming code + * and second to avoid wasting space in processes that never call this. */ - memcpy((char *) pageCopy, (char *) page, BLCKSZ); - PageSetChecksumInplace(pageCopy, blkno); - return (char *) pageCopy; + if (pageCopy == NULL) + pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ); + + memcpy(pageCopy, (char *) page, BLCKSZ); + ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); + return pageCopy; } /* - * Set checksum for page in private memory. + * Set checksum for a page in private memory. * - * This is a simpler version of PageSetChecksumCopy(). The more explicit API - * allows us to more easily see if we're making the correct call and reduces - * the amount of additional code specific to page verification. + * This must only be used when we know that no other process can be modifying + * the page buffer. */ void PageSetChecksumInplace(Page page, BlockNumber blkno) { - if (PageIsNew(page)) + /* If we don't need a checksum, just return */ + if (PageIsNew(page) || !DataChecksumsEnabled()) return; - if (DataChecksumsEnabled()) - { - PageHeader p = (PageHeader) page; - - p->pd_checksum = PageCalcChecksum16(page, blkno); - } - - return; -} - -/* - * Calculate checksum for a PostgreSQL Page. This includes the block number (to - * detect the case when a page is somehow moved to a different location), the - * page header (excluding the checksum itself), and the page data. - * - * Note that if the checksum validation fails we cannot tell the difference - * between a transposed block and failure from direct on-block corruption, - * though that is better than just ignoring transposed blocks altogether. - */ -static uint16 -PageCalcChecksum16(Page page, BlockNumber blkno) -{ - PageHeader phdr = (PageHeader) page; - uint16 save_checksum; - uint32 checksum; - - /* only calculate the checksum for properly-initialized pages */ - Assert(!PageIsNew(page)); - - /* - * Save pd_checksum and set it to zero, so that the checksum calculation - * isn't affected by the checksum stored on the page. We do this to allow - * optimization of the checksum calculation on the whole block in one go. - */ - save_checksum = phdr->pd_checksum; - phdr->pd_checksum = 0; - checksum = checksum_block(page, BLCKSZ); - phdr->pd_checksum = save_checksum; - - /* mix in the block number to detect transposed pages */ - checksum ^= blkno; - - /* - * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of - * one. That avoids checksums of zero, which seems like a good idea. - */ - return (checksum % 65535) + 1; + ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno); } diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c index 41c8ae784d..f72b70de88 100644 --- a/src/backend/storage/page/checksum.c +++ b/src/backend/storage/page/checksum.c @@ -6,156 +6,18 @@ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * * IDENTIFICATION * src/backend/storage/page/checksum.c * *------------------------------------------------------------------------- - * - * Checksum algorithm - * - * The algorithm used to checksum pages is chosen for very fast calculation. - * Workloads where the database working set fits into OS file cache but not - * into shared buffers can read in pages at a very fast pace and the checksum - * algorithm itself can become the largest bottleneck. - * - * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand - * for Fowler/Noll/Vo) The primitive of a plain FNV-1a hash folds in data 1 - * byte at a time according to the formula: - * - * hash = (hash ^ value) * FNV_PRIME - * - * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/ - * - * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of - * high bits - high order bits in input data only affect high order bits in - * output data. To resolve this we xor in the value prior to multiplication - * shifted right by 17 bits. The number 17 was chosen because it doesn't - * have common denominator with set bit positions in FNV_PRIME and empirically - * provides the fastest mixing for high order bits of final iterations quickly - * avalanche into lower positions. For performance reasons we choose to combine - * 4 bytes at a time. The actual hash formula used as the basis is: - * - * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17) - * - * The main bottleneck in this calculation is the multiplication latency. To - * hide the latency and to make use of SIMD parallelism multiple hash values - * are calculated in parallel. The page is treated as a 32 column two - * dimensional array of 32 bit values. Each column is aggregated separately - * into a partial checksum. Each partial checksum uses a different initial - * value (offset basis in FNV terminology). The initial values actually used - * were chosen randomly, as the values themselves don't matter as much as that - * they are different and don't match anything in real data. After initializing - * partial checksums each value in the column is aggregated according to the - * above formula. Finally two more iterations of the formula are performed with - * value 0 to mix the bits of the last value added. - * - * The partial checksums are then folded together using xor to form a single - * 32-bit checksum. The caller can safely reduce the value to 16 bits - * using modulo 2^16-1. That will cause a very slight bias towards lower - * values but this is not significant for the performance of the - * checksum. - * - * The algorithm choice was based on what instructions are available in SIMD - * instruction sets. This meant that a fast and good algorithm needed to use - * multiplication as the main mixing operator. The simplest multiplication - * based checksum primitive is the one used by FNV. The prime used is chosen - * for good dispersion of values. It has no known simple patterns that result - * in collisions. Test of 5-bit differentials of the primitive over 64bit keys - * reveals no differentials with 3 or more values out of 100000 random keys - * colliding. Avalanche test shows that only high order bits of the last word - * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes, - * overwriting page from random position to end with 0 bytes, and overwriting - * random segments of page with 0x00, 0xFF and random data all show optimal - * 2e-16 false positive rate within margin of error. - * - * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer - * multiplication instruction. As of 2013 the corresponding instruction is - * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). - * Vectorization requires a compiler to do the vectorization for us. For recent - * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough - * to achieve vectorization. - * - * The optimal amount of parallelism to use depends on CPU specific instruction - * latency, SIMD instruction width, throughput and the amount of registers - * available to hold intermediate state. Generally, more parallelism is better - * up to the point that state doesn't fit in registers and extra load-store - * instructions are needed to swap values in/out. The number chosen is a fixed - * part of the algorithm because changing the parallelism changes the checksum - * result. - * - * The parallelism number 32 was chosen based on the fact that it is the - * largest state that fits into architecturally visible x86 SSE registers while - * leaving some free registers for intermediate values. For future processors - * with 256bit vector registers this will leave some performance on the table. - * When vectorization is not available it might be beneficial to restructure - * the computation to calculate a subset of the columns at a time and perform - * multiple passes to avoid register spilling. This optimization opportunity - * is not used. Current coding also assumes that the compiler has the ability - * to unroll the inner loop to avoid loop overhead and minimize register - * spilling. For less sophisticated compilers it might be beneficial to manually - * unroll the inner loop. */ #include "postgres.h" #include "storage/checksum.h" -/* number of checksums to calculate in parallel */ -#define N_SUMS 32 -/* prime multiplier of FNV-1a hash */ -#define FNV_PRIME 16777619 - -/* - * Base offsets to initialize each of the parallel FNV hashes into a - * different initial state. - */ -static const uint32 checksumBaseOffsets[N_SUMS] = { - 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, - 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C, - 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, - 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB, - 0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, - 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4, - 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, - 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756 -}; - /* - * Calculate one round of the checksum. + * The actual code is in storage/checksum_impl.h. This is done so that + * external programs can incorporate the checksum code by #include'ing + * that file from the exported Postgres headers. (Compare our CRC code.) */ -#define CHECKSUM_COMP(checksum, value) do {\ - uint32 __tmp = (checksum) ^ (value);\ - (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17);\ -} while (0) - -uint32 -checksum_block(char *data, uint32 size) -{ - uint32 sums[N_SUMS]; - uint32 (*dataArr)[N_SUMS] = (uint32 (*)[N_SUMS]) data; - uint32 result = 0; - int i, - j; - - /* ensure that the size is compatible with the algorithm */ - Assert((size % (sizeof(uint32) * N_SUMS)) == 0); - - /* initialize partial checksums to their corresponding offsets */ - memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); - - /* main checksum calculation */ - for (i = 0; i < size / sizeof(uint32) / N_SUMS; i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], dataArr[i][j]); - - /* finally add in two rounds of zeroes for additional mixing */ - for (i = 0; i < 2; i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], 0); - - /* xor fold partial checksums together */ - for (i = 0; i < N_SUMS; i++) - result ^= sums[i]; - - return result; -} +#include "storage/checksum_impl.h" diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index e41fd9804b..9077c218ff 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@ -3,7 +3,6 @@ * checksum.h * Checksum implementation for data pages. * - * * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -14,10 +13,12 @@ #ifndef CHECKSUM_H #define CHECKSUM_H +#include "storage/block.h" + /* - * Fowler-Noll-Vo 1a block checksum algorithm. The data argument should be - * aligned on a 4-byte boundary. + * Compute the checksum for a Postgres page. The page must be aligned on a + * 4-byte boundary. */ -extern uint32 checksum_block(char *data, uint32 size); +extern uint16 pg_checksum_page(char *page, BlockNumber blkno); #endif /* CHECKSUM_H */ diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h new file mode 100644 index 0000000000..ce1b124fa5 --- /dev/null +++ b/src/include/storage/checksum_impl.h @@ -0,0 +1,207 @@ +/*------------------------------------------------------------------------- + * + * checksum_impl.h + * Checksum implementation for data pages. + * + * This file exists for the benefit of external programs that may wish to + * check Postgres page checksums. They can #include this to get the code + * referenced by storage/checksum.h. (Note: you may need to redefine + * Assert() as empty to compile this successfully externally.) + * + * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/checksum_impl.h + * + *------------------------------------------------------------------------- + */ + +/* + * The algorithm used to checksum pages is chosen for very fast calculation. + * Workloads where the database working set fits into OS file cache but not + * into shared buffers can read in pages at a very fast pace and the checksum + * algorithm itself can become the largest bottleneck. + * + * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand + * for Fowler/Noll/Vo). The primitive of a plain FNV-1a hash folds in data 1 + * byte at a time according to the formula: + * + * hash = (hash ^ value) * FNV_PRIME + * + * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/ + * + * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of + * high bits - high order bits in input data only affect high order bits in + * output data. To resolve this we xor in the value prior to multiplication + * shifted right by 17 bits. The number 17 was chosen because it doesn't + * have common denominator with set bit positions in FNV_PRIME and empirically + * provides the fastest mixing for high order bits of final iterations quickly + * avalanche into lower positions. For performance reasons we choose to combine + * 4 bytes at a time. The actual hash formula used as the basis is: + * + * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17) + * + * The main bottleneck in this calculation is the multiplication latency. To + * hide the latency and to make use of SIMD parallelism multiple hash values + * are calculated in parallel. The page is treated as a 32 column two + * dimensional array of 32 bit values. Each column is aggregated separately + * into a partial checksum. Each partial checksum uses a different initial + * value (offset basis in FNV terminology). The initial values actually used + * were chosen randomly, as the values themselves don't matter as much as that + * they are different and don't match anything in real data. After initializing + * partial checksums each value in the column is aggregated according to the + * above formula. Finally two more iterations of the formula are performed with + * value 0 to mix the bits of the last value added. + * + * The partial checksums are then folded together using xor to form a single + * 32-bit checksum. The caller can safely reduce the value to 16 bits + * using modulo 2^16-1. That will cause a very slight bias towards lower + * values but this is not significant for the performance of the + * checksum. + * + * The algorithm choice was based on what instructions are available in SIMD + * instruction sets. This meant that a fast and good algorithm needed to use + * multiplication as the main mixing operator. The simplest multiplication + * based checksum primitive is the one used by FNV. The prime used is chosen + * for good dispersion of values. It has no known simple patterns that result + * in collisions. Test of 5-bit differentials of the primitive over 64bit keys + * reveals no differentials with 3 or more values out of 100000 random keys + * colliding. Avalanche test shows that only high order bits of the last word + * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes, + * overwriting page from random position to end with 0 bytes, and overwriting + * random segments of page with 0x00, 0xFF and random data all show optimal + * 2e-16 false positive rate within margin of error. + * + * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer + * multiplication instruction. As of 2013 the corresponding instruction is + * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). + * Vectorization requires a compiler to do the vectorization for us. For recent + * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough + * to achieve vectorization. + * + * The optimal amount of parallelism to use depends on CPU specific instruction + * latency, SIMD instruction width, throughput and the amount of registers + * available to hold intermediate state. Generally, more parallelism is better + * up to the point that state doesn't fit in registers and extra load-store + * instructions are needed to swap values in/out. The number chosen is a fixed + * part of the algorithm because changing the parallelism changes the checksum + * result. + * + * The parallelism number 32 was chosen based on the fact that it is the + * largest state that fits into architecturally visible x86 SSE registers while + * leaving some free registers for intermediate values. For future processors + * with 256bit vector registers this will leave some performance on the table. + * When vectorization is not available it might be beneficial to restructure + * the computation to calculate a subset of the columns at a time and perform + * multiple passes to avoid register spilling. This optimization opportunity + * is not used. Current coding also assumes that the compiler has the ability + * to unroll the inner loop to avoid loop overhead and minimize register + * spilling. For less sophisticated compilers it might be beneficial to + * manually unroll the inner loop. + */ + +#include "storage/bufpage.h" + +/* number of checksums to calculate in parallel */ +#define N_SUMS 32 +/* prime multiplier of FNV-1a hash */ +#define FNV_PRIME 16777619 + +/* + * Base offsets to initialize each of the parallel FNV hashes into a + * different initial state. + */ +static const uint32 checksumBaseOffsets[N_SUMS] = { + 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, + 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C, + 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, + 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB, + 0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, + 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4, + 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, + 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756 +}; + +/* + * Calculate one round of the checksum. + */ +#define CHECKSUM_COMP(checksum, value) \ +do { \ + uint32 __tmp = (checksum) ^ (value); \ + (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \ +} while (0) + +/* + * Block checksum algorithm. The data argument must be aligned on a 4-byte + * boundary. + */ +static uint32 +pg_checksum_block(char *data, uint32 size) +{ + uint32 sums[N_SUMS]; + uint32 (*dataArr)[N_SUMS] = (uint32 (*)[N_SUMS]) data; + uint32 result = 0; + int i, + j; + + /* ensure that the size is compatible with the algorithm */ + Assert((size % (sizeof(uint32) * N_SUMS)) == 0); + + /* initialize partial checksums to their corresponding offsets */ + memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); + + /* main checksum calculation */ + for (i = 0; i < size / sizeof(uint32) / N_SUMS; i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], dataArr[i][j]); + + /* finally add in two rounds of zeroes for additional mixing */ + for (i = 0; i < 2; i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], 0); + + /* xor fold partial checksums together */ + for (i = 0; i < N_SUMS; i++) + result ^= sums[i]; + + return result; +} + +/* + * Compute the checksum for a Postgres page. The page must be aligned on a + * 4-byte boundary. + * + * The checksum includes the block number (to detect the case where a page is + * somehow moved to a different location), the page header (excluding the + * checksum itself), and the page data. + */ +uint16 +pg_checksum_page(char *page, BlockNumber blkno) +{ + PageHeader phdr = (PageHeader) page; + uint16 save_checksum; + uint32 checksum; + + /* We only calculate the checksum for properly-initialized pages */ + Assert(!PageIsNew(page)); + + /* + * Save pd_checksum and temporarily set it to zero, so that the checksum + * calculation isn't affected by the old checksum stored on the page. + * Restore it after, because actually updating the checksum is NOT part of + * the API of this function. + */ + save_checksum = phdr->pd_checksum; + phdr->pd_checksum = 0; + checksum = pg_checksum_block(page, BLCKSZ); + phdr->pd_checksum = save_checksum; + + /* Mix in the block number to detect transposed pages */ + checksum ^= blkno; + + /* + * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of + * one. That avoids checksums of zero, which seems like a good idea. + */ + return (checksum % 65535) + 1; +} -- 2.40.0