4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
31 * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
32 * recurrence relations:
40 * c = c + b (fletcher-4 only)
43 * d = d + c (fletcher-4 only)
47 * a_0 = b_0 = c_0 = d_0 = 0
49 * f_0 .. f_(n-1) are the input data.
51 * Using standard techniques, these translate into the following series:
56 * n /___| n - i n /___| n - i
61 * \ | i*(i+1) \ | i*(i+1)*(i+2)
62 * c = > ------- f d = > ------------- f
63 * n /___| 2 n - i n /___| 6 n - i
66 * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
67 * Since the additions are done mod (2^64), errors in the high bits may not
68 * be noticed. For this reason, fletcher-2 is deprecated.
70 * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
71 * A conservative estimate of how big the buffer can get before we overflow
72 * can be estimated using f_i = 0xffffffff for all i:
75 * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
80 * So blocks of up to 2k will not overflow. Our largest block size is
81 * 128k, which has 32k 4-byte words, so we can compute the largest possible
82 * accumulators, then divide by 2^64 to figure the max amount of overflow:
85 * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
86 * a/2^64;b/2^64;c/2^64;d/2^64
94 * So a and b cannot overflow. To make sure each bit of input has some
95 * effect on the contents of c and d, we can look at what the factors of
96 * the coefficients in the equations for c_n and d_n are. The number of 2s
97 * in the factors determines the lowest set bit in the multiplier. Running
98 * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
99 * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow
100 * the 64-bit accumulators, every bit of every f_i effects every accumulator,
101 * even for 128k blocks.
103 * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
104 * we could do our calculations mod (2^32 - 1) by adding in the carries
105 * periodically, and store the number of carries in the top 32-bits.
107 * --------------------
108 * Checksum Performance
109 * --------------------
111 * There are two interesting components to checksum performance: cached and
112 * uncached performance. With cached data, fletcher-2 is about four times
113 * faster than fletcher-4. With uncached data, the performance difference is
114 * negligible, since the cost of a cache fill dominates the processing time.
115 * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
116 * efficient pass over the data.
118 * In normal operation, the data which is being checksummed is in a buffer
119 * which has been filled either by:
121 * 1. a compression step, which will be mostly cached, or
122 * 2. a bcopy() or copyin(), which will be uncached (because the
123 * copy is cache-bypassing).
125 * For both cached and uncached data, both fletcher checksums are much faster
126 * than sha-256, and slower than 'off', which doesn't touch the data at all.
129 #include <sys/types.h>
130 #include <sys/sysmacros.h>
131 #include <sys/byteorder.h>
133 #include <sys/zio_checksum.h>
134 #include <sys/zfs_context.h>
135 #include <zfs_fletcher.h>
138 static void fletcher_4_scalar_init(zio_cksum_t *zcp);
139 static void fletcher_4_scalar_native(const void *buf, uint64_t size,
141 static void fletcher_4_scalar_byteswap(const void *buf, uint64_t size,
143 static boolean_t fletcher_4_scalar_valid(void);
145 static const fletcher_4_ops_t fletcher_4_scalar_ops = {
146 .init_native = fletcher_4_scalar_init,
147 .compute_native = fletcher_4_scalar_native,
148 .init_byteswap = fletcher_4_scalar_init,
149 .compute_byteswap = fletcher_4_scalar_byteswap,
150 .valid = fletcher_4_scalar_valid,
154 static fletcher_4_ops_t fletcher_4_fastest_impl = {
156 .valid = fletcher_4_scalar_valid
159 static const fletcher_4_ops_t *fletcher_4_impls[] = {
160 &fletcher_4_scalar_ops,
161 #if defined(HAVE_SSE2)
162 &fletcher_4_sse2_ops,
164 #if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
165 &fletcher_4_ssse3_ops,
167 #if defined(HAVE_AVX) && defined(HAVE_AVX2)
168 &fletcher_4_avx2_ops,
170 #if defined(__x86_64) && defined(HAVE_AVX512F)
171 &fletcher_4_avx512f_ops,
175 /* Hold all supported implementations */
176 static uint32_t fletcher_4_supp_impls_cnt = 0;
177 static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];
179 /* Select fletcher4 implementation */
180 #define IMPL_FASTEST (UINT32_MAX)
181 #define IMPL_CYCLE (UINT32_MAX - 1)
182 #define IMPL_SCALAR (0)
184 static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;
186 #define IMPL_READ(i) (*(volatile uint32_t *) &(i))
188 static struct fletcher_4_impl_selector {
189 const char *fis_name;
191 } fletcher_4_impl_selectors[] = {
192 #if !defined(_KERNEL)
193 { "cycle", IMPL_CYCLE },
195 { "fastest", IMPL_FASTEST },
196 { "scalar", IMPL_SCALAR }
199 static kstat_t *fletcher_4_kstat;
201 static struct fletcher_4_kstat {
204 } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
206 /* Indicate that benchmark has been completed */
207 static boolean_t fletcher_4_initialized = B_FALSE;
210 fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
212 const uint64_t *ip = buf;
213 const uint64_t *ipend = ip + (size / sizeof (uint64_t));
214 uint64_t a0, b0, a1, b1;
216 for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
223 ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
227 fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
229 const uint64_t *ip = buf;
230 const uint64_t *ipend = ip + (size / sizeof (uint64_t));
231 uint64_t a0, b0, a1, b1;
233 for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
234 a0 += BSWAP_64(ip[0]);
235 a1 += BSWAP_64(ip[1]);
240 ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
244 fletcher_4_scalar_init(zio_cksum_t *zcp)
246 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
250 fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
252 const uint32_t *ip = buf;
253 const uint32_t *ipend = ip + (size / sizeof (uint32_t));
261 for (; ip < ipend; ip++) {
268 ZIO_SET_CHECKSUM(zcp, a, b, c, d);
272 fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
274 const uint32_t *ip = buf;
275 const uint32_t *ipend = ip + (size / sizeof (uint32_t));
283 for (; ip < ipend; ip++) {
284 a += BSWAP_32(ip[0]);
290 ZIO_SET_CHECKSUM(zcp, a, b, c, d);
294 fletcher_4_scalar_valid(void)
300 fletcher_4_impl_set(const char *val)
303 uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
306 val_len = strlen(val);
307 while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
310 /* check mandatory implementations */
311 for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
312 const char *name = fletcher_4_impl_selectors[i].fis_name;
314 if (val_len == strlen(name) &&
315 strncmp(val, name, val_len) == 0) {
316 impl = fletcher_4_impl_selectors[i].fis_sel;
322 if (err != 0 && fletcher_4_initialized) {
323 /* check all supported implementations */
324 for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
325 const char *name = fletcher_4_supp_impls[i]->name;
327 if (val_len == strlen(name) &&
328 strncmp(val, name, val_len) == 0) {
337 atomic_swap_32(&fletcher_4_impl_chosen, impl);
344 static inline const fletcher_4_ops_t *
345 fletcher_4_impl_get(void)
347 fletcher_4_ops_t *ops = NULL;
348 const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
352 ASSERT(fletcher_4_initialized);
353 ops = &fletcher_4_fastest_impl;
355 #if !defined(_KERNEL)
357 ASSERT(fletcher_4_initialized);
358 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
360 static uint32_t cycle_count = 0;
361 uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
362 ops = fletcher_4_supp_impls[idx];
367 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
368 ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
370 ops = fletcher_4_supp_impls[impl];
374 ASSERT3P(ops, !=, NULL);
380 fletcher_4_incremental_native(const void *buf, uint64_t size,
383 ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
385 fletcher_4_scalar_native(buf, size, zcp);
389 fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
392 ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
394 fletcher_4_scalar_byteswap(buf, size, zcp);
398 fletcher_4_native_impl(const fletcher_4_ops_t *ops, const void *buf,
399 uint64_t size, zio_cksum_t *zcp)
401 ops->init_native(zcp);
402 ops->compute_native(buf, size, zcp);
403 if (ops->fini_native != NULL)
404 ops->fini_native(zcp);
408 fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
410 const fletcher_4_ops_t *ops;
411 uint64_t p2size = P2ALIGN(size, 64);
413 ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
416 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
417 } else if (p2size == 0) {
418 ops = &fletcher_4_scalar_ops;
419 fletcher_4_native_impl(ops, buf, size, zcp);
421 ops = fletcher_4_impl_get();
422 fletcher_4_native_impl(ops, buf, p2size, zcp);
425 fletcher_4_incremental_native((char *)buf + p2size,
431 fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
433 fletcher_4_native_impl(&fletcher_4_scalar_ops, buf, size, zcp);
437 fletcher_4_byteswap_impl(const fletcher_4_ops_t *ops, const void *buf,
438 uint64_t size, zio_cksum_t *zcp)
440 ops->init_byteswap(zcp);
441 ops->compute_byteswap(buf, size, zcp);
442 if (ops->fini_byteswap != NULL)
443 ops->fini_byteswap(zcp);
447 fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
449 const fletcher_4_ops_t *ops;
450 uint64_t p2size = P2ALIGN(size, 64);
452 ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
455 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
456 } else if (p2size == 0) {
457 ops = &fletcher_4_scalar_ops;
458 fletcher_4_byteswap_impl(ops, buf, size, zcp);
460 ops = fletcher_4_impl_get();
461 fletcher_4_byteswap_impl(ops, buf, p2size, zcp);
464 fletcher_4_incremental_byteswap((char *)buf + p2size,
470 fletcher_4_kstat_headers(char *buf, size_t size)
474 off += snprintf(buf + off, size, "%-17s", "implementation");
475 off += snprintf(buf + off, size - off, "%-15s", "native");
476 (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap");
482 fletcher_4_kstat_data(char *buf, size_t size, void *data)
484 struct fletcher_4_kstat *fastest_stat =
485 &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
486 struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *) data;
489 if (curr_stat == fastest_stat) {
490 off += snprintf(buf + off, size - off, "%-17s", "fastest");
491 off += snprintf(buf + off, size - off, "%-15s",
492 fletcher_4_supp_impls[fastest_stat->native]->name);
493 off += snprintf(buf + off, size - off, "%-15s\n",
494 fletcher_4_supp_impls[fastest_stat->byteswap]->name);
496 ptrdiff_t id = curr_stat - fletcher_4_stat_data;
498 off += snprintf(buf + off, size - off, "%-17s",
499 fletcher_4_supp_impls[id]->name);
500 off += snprintf(buf + off, size - off, "%-15llu",
501 (u_longlong_t) curr_stat->native);
502 off += snprintf(buf + off, size - off, "%-15llu\n",
503 (u_longlong_t) curr_stat->byteswap);
510 fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
512 if (n <= fletcher_4_supp_impls_cnt)
513 ksp->ks_private = (void *) (fletcher_4_stat_data + n);
515 ksp->ks_private = NULL;
517 return (ksp->ks_private);
520 #define FLETCHER_4_FASTEST_FN_COPY(type, src) \
522 fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \
523 fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \
524 fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
527 #define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */
530 fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
533 struct fletcher_4_kstat *fastest_stat =
534 &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
536 uint64_t run_bw, run_time_ns, best_run = 0;
538 uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);
540 zio_checksum_func_t *fletcher_4_test = native ? fletcher_4_native :
543 for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
544 struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
545 uint64_t run_count = 0;
547 /* temporary set an implementation */
548 fletcher_4_impl_chosen = i;
553 for (l = 0; l < 32; l++, run_count++)
554 fletcher_4_test(data, data_size, &zc);
556 run_time_ns = gethrtime() - start;
557 } while (run_time_ns < FLETCHER_4_BENCH_NS);
560 run_bw = data_size * run_count * NANOSEC;
561 run_bw /= run_time_ns; /* B/s */
564 stat->native = run_bw;
566 stat->byteswap = run_bw;
568 if (run_bw > best_run) {
572 fastest_stat->native = i;
573 FLETCHER_4_FASTEST_FN_COPY(native,
574 fletcher_4_supp_impls[i]);
576 fastest_stat->byteswap = i;
577 FLETCHER_4_FASTEST_FN_COPY(byteswap,
578 fletcher_4_supp_impls[i]);
583 /* restore original selection */
584 atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
588 fletcher_4_init(void)
590 static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
591 fletcher_4_ops_t *curr_impl;
595 /* move supported impl into fletcher_4_supp_impls */
596 for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
597 curr_impl = (fletcher_4_ops_t *) fletcher_4_impls[i];
599 if (curr_impl->valid && curr_impl->valid())
600 fletcher_4_supp_impls[c++] = curr_impl;
602 membar_producer(); /* complete fletcher_4_supp_impls[] init */
603 fletcher_4_supp_impls_cnt = c; /* number of supported impl */
605 #if !defined(_KERNEL)
606 /* Skip benchmarking and use last implementation as fastest */
607 memcpy(&fletcher_4_fastest_impl,
608 fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
609 sizeof (fletcher_4_fastest_impl));
610 fletcher_4_fastest_impl.name = "fastest";
613 fletcher_4_initialized = B_TRUE;
615 /* Use 'cycle' math selection method for userspace */
616 VERIFY0(fletcher_4_impl_set("cycle"));
619 /* Benchmark all supported implementations */
620 databuf = vmem_alloc(data_size, KM_SLEEP);
621 for (i = 0; i < data_size / sizeof (uint64_t); i++)
622 ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
624 fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
625 fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
627 vmem_free(databuf, data_size);
629 /* install kstats for all implementations */
630 fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
631 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
632 if (fletcher_4_kstat != NULL) {
633 fletcher_4_kstat->ks_data = NULL;
634 fletcher_4_kstat->ks_ndata = UINT32_MAX;
635 kstat_set_raw_ops(fletcher_4_kstat,
636 fletcher_4_kstat_headers,
637 fletcher_4_kstat_data,
638 fletcher_4_kstat_addr);
639 kstat_install(fletcher_4_kstat);
642 /* Finish initialization */
643 fletcher_4_initialized = B_TRUE;
647 fletcher_4_fini(void)
649 if (fletcher_4_kstat != NULL) {
650 kstat_delete(fletcher_4_kstat);
651 fletcher_4_kstat = NULL;
655 #if defined(_KERNEL) && defined(HAVE_SPL)
658 fletcher_4_param_get(char *buffer, struct kernel_param *unused)
660 const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
665 fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s ";
666 cnt += sprintf(buffer + cnt, fmt, "fastest");
668 /* list all supported implementations */
669 for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
670 fmt = (i == impl) ? "[%s] " : "%s ";
671 cnt += sprintf(buffer + cnt, fmt,
672 fletcher_4_supp_impls[i]->name);
679 fletcher_4_param_set(const char *val, struct kernel_param *unused)
681 return (fletcher_4_impl_set(val));
685 * Choose a fletcher 4 implementation in ZFS.
686 * Users can choose "cycle" to exercise all implementations, but this is
687 * for testing purpose therefore it can only be set in user space.
689 module_param_call(zfs_fletcher_4_impl,
690 fletcher_4_param_set, fletcher_4_param_get, NULL, 0644);
691 MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation.");
693 EXPORT_SYMBOL(fletcher_4_init);
694 EXPORT_SYMBOL(fletcher_4_fini);
695 EXPORT_SYMBOL(fletcher_2_native);
696 EXPORT_SYMBOL(fletcher_2_byteswap);
697 EXPORT_SYMBOL(fletcher_4_native);
698 EXPORT_SYMBOL(fletcher_4_native_varsize);
699 EXPORT_SYMBOL(fletcher_4_byteswap);
700 EXPORT_SYMBOL(fletcher_4_incremental_native);
701 EXPORT_SYMBOL(fletcher_4_incremental_byteswap);