granicus.if.org Git - zfs/blob - module/zcommon/zfs_fletcher.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
  25  */
  26
  27 /*
  28  * Fletcher Checksums
  29  * ------------------
  30  *
  31  * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
  32  * recurrence relations:
  33  *
  34  *      a  = a    + f
  35  *       i    i-1    i-1
  36  *
  37  *      b  = b    + a
  38  *       i    i-1    i
  39  *
  40  *      c  = c    + b           (fletcher-4 only)
  41  *       i    i-1    i
  42  *
  43  *      d  = d    + c           (fletcher-4 only)
  44  *       i    i-1    i
  45  *
  46  * Where
  47  *      a_0 = b_0 = c_0 = d_0 = 0
  48  * and
  49  *      f_0 .. f_(n-1) are the input data.
  50  *
  51  * Using standard techniques, these translate into the following series:
  52  *
  53  *           __n_                            __n_
  54  *           \   |                           \   |
  55  *      a  =  >     f                   b  =  >     i * f
  56  *       n   /___|   n - i               n   /___|       n - i
  57  *           i = 1                           i = 1
  58  *
  59  *
  60  *           __n_                            __n_
  61  *           \   |  i*(i+1)                  \   |  i*(i+1)*(i+2)
  62  *      c  =  >     ------- f           d  =  >     ------------- f
  63  *       n   /___|     2     n - i       n   /___|        6        n - i
  64  *           i = 1                           i = 1
  65  *
  66  * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
  67  * Since the additions are done mod (2^64), errors in the high bits may not
  68  * be noticed.  For this reason, fletcher-2 is deprecated.
  69  *
  70  * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
  71  * A conservative estimate of how big the buffer can get before we overflow
  72  * can be estimated using f_i = 0xffffffff for all i:
  73  *
  74  * % bc
  75  *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
  76  * 2264
  77  *  quit
  78  * %
  79  *
  80  * So blocks of up to 2k will not overflow.  Our largest block size is
  81  * 128k, which has 32k 4-byte words, so we can compute the largest possible
  82  * accumulators, then divide by 2^64 to figure the max amount of overflow:
  83  *
  84  * % bc
  85  *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
  86  *  a/2^64;b/2^64;c/2^64;d/2^64
  87  * 0
  88  * 0
  89  * 1365
  90  * 11186858
  91  *  quit
  92  * %
  93  *
  94  * So a and b cannot overflow.  To make sure each bit of input has some
  95  * effect on the contents of c and d, we can look at what the factors of
  96  * the coefficients in the equations for c_n and d_n are.  The number of 2s
  97  * in the factors determines the lowest set bit in the multiplier.  Running
  98  * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
  99  * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
  100  * the 64-bit accumulators, every bit of every f_i affects every accumulator,
 101  * even for 128k blocks.
 102  *
 103  * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
 104  * we could do our calculations mod (2^32 - 1) by adding in the carries
 105  * periodically, and store the number of carries in the top 32-bits.
 106  *
 107  * --------------------
 108  * Checksum Performance
 109  * --------------------
 110  *
 111  * There are two interesting components to checksum performance: cached and
 112  * uncached performance.  With cached data, fletcher-2 is about four times
 113  * faster than fletcher-4.  With uncached data, the performance difference is
 114  * negligible, since the cost of a cache fill dominates the processing time.
 115  * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 116  * efficient pass over the data.
 117  *
 118  * In normal operation, the data which is being checksummed is in a buffer
 119  * which has been filled either by:
 120  *
 121  *      1. a compression step, which will be mostly cached, or
 122  *      2. a bcopy() or copyin(), which will be uncached (because the
 123  *         copy is cache-bypassing).
 124  *
 125  * For both cached and uncached data, both fletcher checksums are much faster
 126  * than sha-256, and slower than 'off', which doesn't touch the data at all.
 127  */
 128
 129 #include <sys/types.h>
 130 #include <sys/sysmacros.h>
 131 #include <sys/byteorder.h>
 132 #include <sys/spa.h>
 133 #include <sys/zio_checksum.h>
 134 #include <sys/zfs_context.h>
 135 #include <zfs_fletcher.h>
 136
 137
 138 static void fletcher_4_scalar_init(zio_cksum_t *zcp);
 139 static void fletcher_4_scalar_native(const void *buf, uint64_t size,
 140     zio_cksum_t *zcp);
 141 static void fletcher_4_scalar_byteswap(const void *buf, uint64_t size,
 142     zio_cksum_t *zcp);
 143 static boolean_t fletcher_4_scalar_valid(void);
 144
 145 static const fletcher_4_ops_t fletcher_4_scalar_ops = {
 146         .init_native = fletcher_4_scalar_init,
 147         .compute_native = fletcher_4_scalar_native,
 148         .init_byteswap = fletcher_4_scalar_init,
 149         .compute_byteswap = fletcher_4_scalar_byteswap,
 150         .valid = fletcher_4_scalar_valid,
 151         .name = "scalar"
 152 };
 153
 154 static fletcher_4_ops_t fletcher_4_fastest_impl = {
 155         .name = "fastest",
 156         .valid = fletcher_4_scalar_valid
 157 };
 158
 159 static const fletcher_4_ops_t *fletcher_4_impls[] = {
 160         &fletcher_4_scalar_ops,
 161 #if defined(HAVE_SSE2)
 162         &fletcher_4_sse2_ops,
 163 #endif
 164 #if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
 165         &fletcher_4_ssse3_ops,
 166 #endif
 167 #if defined(HAVE_AVX) && defined(HAVE_AVX2)
 168         &fletcher_4_avx2_ops,
 169 #endif
 170 #if defined(__x86_64) && defined(HAVE_AVX512F)
 171         &fletcher_4_avx512f_ops,
 172 #endif
 173 };
 174
 175 /* Hold all supported implementations */
 176 static uint32_t fletcher_4_supp_impls_cnt = 0;
 177 static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];
 178
 179 /* Select fletcher4 implementation */
 180 #define IMPL_FASTEST    (UINT32_MAX)
 181 #define IMPL_CYCLE      (UINT32_MAX - 1)
 182 #define IMPL_SCALAR     (0)
 183
 184 static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;
 185
 186 #define IMPL_READ(i)    (*(volatile uint32_t *) &(i))
 187
 188 static struct fletcher_4_impl_selector {
 189         const char      *fis_name;
 190         uint32_t        fis_sel;
 191 } fletcher_4_impl_selectors[] = {
 192 #if !defined(_KERNEL)
 193         { "cycle",      IMPL_CYCLE },
 194 #endif
 195         { "fastest",    IMPL_FASTEST },
 196         { "scalar",     IMPL_SCALAR }
 197 };
 198
 199 static kstat_t *fletcher_4_kstat;
 200
 201 static struct fletcher_4_kstat {
 202         uint64_t native;
 203         uint64_t byteswap;
 204 } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
 205
 206 /* Indicate that benchmark has been completed */
 207 static boolean_t fletcher_4_initialized = B_FALSE;
 208
 209 void
 210 fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 211 {
 212         const uint64_t *ip = buf;
 213         const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 214         uint64_t a0, b0, a1, b1;
 215
 216         for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
 217                 a0 += ip[0];
 218                 a1 += ip[1];
 219                 b0 += a0;
 220                 b1 += a1;
 221         }
 222
 223         ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 224 }
 225
 226 void
 227 fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
 228 {
 229         const uint64_t *ip = buf;
 230         const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 231         uint64_t a0, b0, a1, b1;
 232
 233         for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
 234                 a0 += BSWAP_64(ip[0]);
 235                 a1 += BSWAP_64(ip[1]);
 236                 b0 += a0;
 237                 b1 += a1;
 238         }
 239
 240         ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 241 }
 242
 243 static void
 244 fletcher_4_scalar_init(zio_cksum_t *zcp)
 245 {
 246         ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 247 }
 248
 249 static void
 250 fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 251 {
 252         const uint32_t *ip = buf;
 253         const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 254         uint64_t a, b, c, d;
 255
 256         a = zcp->zc_word[0];
 257         b = zcp->zc_word[1];
 258         c = zcp->zc_word[2];
 259         d = zcp->zc_word[3];
 260
 261         for (; ip < ipend; ip++) {
 262                 a += ip[0];
 263                 b += a;
 264                 c += b;
 265                 d += c;
 266         }
 267
 268         ZIO_SET_CHECKSUM(zcp, a, b, c, d);
 269 }
 270
 271 static void
 272 fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
 273 {
 274         const uint32_t *ip = buf;
 275         const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 276         uint64_t a, b, c, d;
 277
 278         a = zcp->zc_word[0];
 279         b = zcp->zc_word[1];
 280         c = zcp->zc_word[2];
 281         d = zcp->zc_word[3];
 282
 283         for (; ip < ipend; ip++) {
 284                 a += BSWAP_32(ip[0]);
 285                 b += a;
 286                 c += b;
 287                 d += c;
 288         }
 289
 290         ZIO_SET_CHECKSUM(zcp, a, b, c, d);
 291 }
 292
 293 static boolean_t
 294 fletcher_4_scalar_valid(void)
 295 {
 296         return (B_TRUE);
 297 }
 298
 299 int
 300 fletcher_4_impl_set(const char *val)
 301 {
 302         int err = -EINVAL;
 303         uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 304         size_t i, val_len;
 305
 306         val_len = strlen(val);
 307         while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
 308                 val_len--;
 309
 310         /* check mandatory implementations */
 311         for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
 312                 const char *name = fletcher_4_impl_selectors[i].fis_name;
 313
 314                 if (val_len == strlen(name) &&
 315                     strncmp(val, name, val_len) == 0) {
 316                         impl = fletcher_4_impl_selectors[i].fis_sel;
 317                         err = 0;
 318                         break;
 319                 }
 320         }
 321
 322         if (err != 0 && fletcher_4_initialized) {
 323                 /* check all supported implementations */
 324                 for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
 325                         const char *name = fletcher_4_supp_impls[i]->name;
 326
 327                         if (val_len == strlen(name) &&
 328                             strncmp(val, name, val_len) == 0) {
 329                                 impl = i;
 330                                 err = 0;
 331                                 break;
 332                         }
 333                 }
 334         }
 335
 336         if (err == 0) {
 337                 atomic_swap_32(&fletcher_4_impl_chosen, impl);
 338                 membar_producer();
 339         }
 340
 341         return (err);
 342 }
 343
 344 static inline const fletcher_4_ops_t *
 345 fletcher_4_impl_get(void)
 346 {
 347         fletcher_4_ops_t *ops = NULL;
 348         const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 349
 350         switch (impl) {
 351         case IMPL_FASTEST:
 352                 ASSERT(fletcher_4_initialized);
 353                 ops = &fletcher_4_fastest_impl;
 354                 break;
 355 #if !defined(_KERNEL)
 356         case IMPL_CYCLE: {
 357                 ASSERT(fletcher_4_initialized);
 358                 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
 359
 360                 static uint32_t cycle_count = 0;
 361                 uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
 362                 ops = fletcher_4_supp_impls[idx];
 363         }
 364         break;
 365 #endif
 366         default:
 367                 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
 368                 ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
 369
 370                 ops = fletcher_4_supp_impls[impl];
 371                 break;
 372         }
 373
 374         ASSERT3P(ops, !=, NULL);
 375
 376         return (ops);
 377 }
 378
 379 void
 380 fletcher_4_incremental_native(const void *buf, uint64_t size,
 381     zio_cksum_t *zcp)
 382 {
 383         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 384
 385         fletcher_4_scalar_native(buf, size, zcp);
 386 }
 387
 388 void
 389 fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
 390     zio_cksum_t *zcp)
 391 {
 392         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 393
 394         fletcher_4_scalar_byteswap(buf, size, zcp);
 395 }
 396
 397 static inline void
 398 fletcher_4_native_impl(const fletcher_4_ops_t *ops, const void *buf,
 399         uint64_t size, zio_cksum_t *zcp)
 400 {
 401         ops->init_native(zcp);
 402         ops->compute_native(buf, size, zcp);
 403         if (ops->fini_native != NULL)
 404                 ops->fini_native(zcp);
 405 }
 406
 407 void
 408 fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 409 {
 410         const fletcher_4_ops_t *ops;
 411         uint64_t p2size = P2ALIGN(size, 64);
 412
 413         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 414
 415         if (size == 0) {
 416                 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 417         } else if (p2size == 0) {
 418                 ops = &fletcher_4_scalar_ops;
 419                 fletcher_4_native_impl(ops, buf, size, zcp);
 420         } else {
 421                 ops = fletcher_4_impl_get();
 422                 fletcher_4_native_impl(ops, buf, p2size, zcp);
 423
 424                 if (p2size < size)
 425                         fletcher_4_incremental_native((char *)buf + p2size,
 426                             size - p2size, zcp);
 427         }
 428 }
 429
 430 void
 431 fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
 432 {
 433         fletcher_4_native_impl(&fletcher_4_scalar_ops, buf, size, zcp);
 434 }
 435
 436 static inline void
 437 fletcher_4_byteswap_impl(const fletcher_4_ops_t *ops, const void *buf,
 438         uint64_t size, zio_cksum_t *zcp)
 439 {
 440         ops->init_byteswap(zcp);
 441         ops->compute_byteswap(buf, size, zcp);
 442         if (ops->fini_byteswap != NULL)
 443                 ops->fini_byteswap(zcp);
 444 }
 445
 446 void
 447 fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
 448 {
 449         const fletcher_4_ops_t *ops;
 450         uint64_t p2size = P2ALIGN(size, 64);
 451
 452         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 453
 454         if (size == 0) {
 455                 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 456         } else if (p2size == 0) {
 457                 ops = &fletcher_4_scalar_ops;
 458                 fletcher_4_byteswap_impl(ops, buf, size, zcp);
 459         } else {
 460                 ops = fletcher_4_impl_get();
 461                 fletcher_4_byteswap_impl(ops, buf, p2size, zcp);
 462
 463                 if (p2size < size)
 464                         fletcher_4_incremental_byteswap((char *)buf + p2size,
 465                             size - p2size, zcp);
 466         }
 467 }
 468
 469 static int
 470 fletcher_4_kstat_headers(char *buf, size_t size)
 471 {
 472         ssize_t off = 0;
 473
 474         off += snprintf(buf + off, size, "%-17s", "implementation");
 475         off += snprintf(buf + off, size - off, "%-15s", "native");
 476         (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap");
 477
 478         return (0);
 479 }
 480
 481 static int
 482 fletcher_4_kstat_data(char *buf, size_t size, void *data)
 483 {
 484         struct fletcher_4_kstat *fastest_stat =
 485             &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
 486         struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *) data;
 487         ssize_t off = 0;
 488
 489         if (curr_stat == fastest_stat) {
 490                 off += snprintf(buf + off, size - off, "%-17s", "fastest");
 491                 off += snprintf(buf + off, size - off, "%-15s",
 492                     fletcher_4_supp_impls[fastest_stat->native]->name);
 493                 off += snprintf(buf + off, size - off, "%-15s\n",
 494                     fletcher_4_supp_impls[fastest_stat->byteswap]->name);
 495         } else {
 496                 ptrdiff_t id = curr_stat - fletcher_4_stat_data;
 497
 498                 off += snprintf(buf + off, size - off, "%-17s",
 499                     fletcher_4_supp_impls[id]->name);
 500                 off += snprintf(buf + off, size - off, "%-15llu",
 501                             (u_longlong_t) curr_stat->native);
 502                 off += snprintf(buf + off, size - off, "%-15llu\n",
 503                             (u_longlong_t) curr_stat->byteswap);
 504         }
 505
 506         return (0);
 507 }
 508
 509 static void *
 510 fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
 511 {
 512         if (n <= fletcher_4_supp_impls_cnt)
 513                 ksp->ks_private = (void *) (fletcher_4_stat_data + n);
 514         else
 515                 ksp->ks_private = NULL;
 516
 517         return (ksp->ks_private);
 518 }
 519
 520 #define FLETCHER_4_FASTEST_FN_COPY(type, src)                             \
 521 {                                                                         \
 522         fletcher_4_fastest_impl.init_ ## type = src->init_ ## type;       \
 523         fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type;       \
 524         fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
 525 }
 526
 527 #define FLETCHER_4_BENCH_NS     (MSEC2NSEC(50))         /* 50ms */
 528
 529 static void
 530 fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 531 {
 532
 533         struct fletcher_4_kstat *fastest_stat =
 534             &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
 535         hrtime_t start;
 536         uint64_t run_bw, run_time_ns, best_run = 0;
 537         zio_cksum_t zc;
 538         uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);
 539
 540         zio_checksum_func_t *fletcher_4_test = native ? fletcher_4_native :
 541             fletcher_4_byteswap;
 542
 543         for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
 544                 struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
 545                 uint64_t run_count = 0;
 546
 547                 /* temporary set an implementation */
 548                 fletcher_4_impl_chosen = i;
 549
 550                 kpreempt_disable();
 551                 start = gethrtime();
 552                 do {
 553                         for (l = 0; l < 32; l++, run_count++)
 554                                 fletcher_4_test(data, data_size, &zc);
 555
 556                         run_time_ns = gethrtime() - start;
 557                 } while (run_time_ns < FLETCHER_4_BENCH_NS);
 558                 kpreempt_enable();
 559
 560                 run_bw = data_size * run_count * NANOSEC;
 561                 run_bw /= run_time_ns;  /* B/s */
 562
 563                 if (native)
 564                         stat->native = run_bw;
 565                 else
 566                         stat->byteswap = run_bw;
 567
 568                 if (run_bw > best_run) {
 569                         best_run = run_bw;
 570
 571                         if (native) {
 572                                 fastest_stat->native = i;
 573                                 FLETCHER_4_FASTEST_FN_COPY(native,
 574                                     fletcher_4_supp_impls[i]);
 575                         } else {
 576                                 fastest_stat->byteswap = i;
 577                                 FLETCHER_4_FASTEST_FN_COPY(byteswap,
 578                                     fletcher_4_supp_impls[i]);
 579                         }
 580                 }
 581         }
 582
 583         /* restore original selection */
 584         atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
 585 }
 586
 587 void
 588 fletcher_4_init(void)
 589 {
 590         static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
 591         fletcher_4_ops_t *curr_impl;
 592         char *databuf;
 593         int i, c;
 594
 595         /* move supported impl into fletcher_4_supp_impls */
 596         for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
 597                 curr_impl = (fletcher_4_ops_t *) fletcher_4_impls[i];
 598
 599                 if (curr_impl->valid && curr_impl->valid())
 600                         fletcher_4_supp_impls[c++] = curr_impl;
 601         }
 602         membar_producer();      /* complete fletcher_4_supp_impls[] init */
 603         fletcher_4_supp_impls_cnt = c;  /* number of supported impl */
 604
 605 #if !defined(_KERNEL)
 606         /* Skip benchmarking and use last implementation as fastest */
 607         memcpy(&fletcher_4_fastest_impl,
 608             fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
 609             sizeof (fletcher_4_fastest_impl));
 610         fletcher_4_fastest_impl.name = "fastest";
 611         membar_producer();
 612
 613         fletcher_4_initialized = B_TRUE;
 614
 615         /* Use 'cycle' math selection method for userspace */
 616         VERIFY0(fletcher_4_impl_set("cycle"));
 617         return;
 618 #endif
 619         /* Benchmark all supported implementations */
 620         databuf = vmem_alloc(data_size, KM_SLEEP);
 621         for (i = 0; i < data_size / sizeof (uint64_t); i++)
 622                 ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
 623
 624         fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
 625         fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
 626
 627         vmem_free(databuf, data_size);
 628
 629         /* install kstats for all implementations */
 630         fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
 631                 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 632         if (fletcher_4_kstat != NULL) {
 633                 fletcher_4_kstat->ks_data = NULL;
 634                 fletcher_4_kstat->ks_ndata = UINT32_MAX;
 635                 kstat_set_raw_ops(fletcher_4_kstat,
 636                     fletcher_4_kstat_headers,
 637                     fletcher_4_kstat_data,
 638                     fletcher_4_kstat_addr);
 639                 kstat_install(fletcher_4_kstat);
 640         }
 641
 642         /* Finish initialization */
 643         fletcher_4_initialized = B_TRUE;
 644 }
 645
 646 void
 647 fletcher_4_fini(void)
 648 {
 649         if (fletcher_4_kstat != NULL) {
 650                 kstat_delete(fletcher_4_kstat);
 651                 fletcher_4_kstat = NULL;
 652         }
 653 }
 654
 655 #if defined(_KERNEL) && defined(HAVE_SPL)
 656
 657 static int
 658 fletcher_4_param_get(char *buffer, struct kernel_param *unused)
 659 {
 660         const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 661         char *fmt;
 662         int i, cnt = 0;
 663
 664         /* list fastest */
 665         fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s ";
 666         cnt += sprintf(buffer + cnt, fmt, "fastest");
 667
 668         /* list all supported implementations */
 669         for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
 670                 fmt = (i == impl) ? "[%s] " : "%s ";
 671                 cnt += sprintf(buffer + cnt, fmt,
 672                     fletcher_4_supp_impls[i]->name);
 673         }
 674
 675         return (cnt);
 676 }
 677
 678 static int
 679 fletcher_4_param_set(const char *val, struct kernel_param *unused)
 680 {
 681         return (fletcher_4_impl_set(val));
 682 }
 683
 684 /*
 685  * Choose a fletcher 4 implementation in ZFS.
  686  * Users can choose "cycle" to exercise all implementations, but this is
  687  * intended for testing only; therefore it can only be set in user space.
 688  */
 689 module_param_call(zfs_fletcher_4_impl,
 690     fletcher_4_param_set, fletcher_4_param_get, NULL, 0644);
 691 MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation.");
 692
 693 EXPORT_SYMBOL(fletcher_4_init);
 694 EXPORT_SYMBOL(fletcher_4_fini);
 695 EXPORT_SYMBOL(fletcher_2_native);
 696 EXPORT_SYMBOL(fletcher_2_byteswap);
 697 EXPORT_SYMBOL(fletcher_4_native);
 698 EXPORT_SYMBOL(fletcher_4_native_varsize);
 699 EXPORT_SYMBOL(fletcher_4_byteswap);
 700 EXPORT_SYMBOL(fletcher_4_incremental_native);
 701 EXPORT_SYMBOL(fletcher_4_incremental_byteswap);
 702 #endif