granicus.if.org Git - zfs/blob - module/zcommon/zfs_fletcher.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*
  27  * Fletcher Checksums
  28  * ------------------
  29  *
  30  * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
  31  * recurrence relations:
  32  *
  33  *      a  = a    + f
  34  *       i    i-1    i-1
  35  *
  36  *      b  = b    + a
  37  *       i    i-1    i
  38  *
  39  *      c  = c    + b           (fletcher-4 only)
  40  *       i    i-1    i
  41  *
  42  *      d  = d    + c           (fletcher-4 only)
  43  *       i    i-1    i
  44  *
  45  * Where
  46  *      a_0 = b_0 = c_0 = d_0 = 0
  47  * and
  48  *      f_0 .. f_(n-1) are the input data.
  49  *
  50  * Using standard techniques, these translate into the following series:
  51  *
  52  *           __n_                            __n_
  53  *           \   |                           \   |
  54  *      a  =  >     f                   b  =  >     i * f
  55  *       n   /___|   n - i               n   /___|       n - i
  56  *           i = 1                           i = 1
  57  *
  58  *
  59  *           __n_                            __n_
  60  *           \   |  i*(i+1)                  \   |  i*(i+1)*(i+2)
  61  *      c  =  >     ------- f           d  =  >     ------------- f
  62  *       n   /___|     2     n - i       n   /___|        6        n - i
  63  *           i = 1                           i = 1
  64  *
  65  * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
  66  * Since the additions are done mod (2^64), errors in the high bits may not
  67  * be noticed.  For this reason, fletcher-2 is deprecated.
  68  *
  69  * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
  70  * A conservative estimate of how big the buffer can get before we overflow
  71  * can be estimated using f_i = 0xffffffff for all i:
  72  *
  73  * % bc
  74  *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
  75  * 2264
  76  *  quit
  77  * %
  78  *
  79  * So blocks of up to 2k will not overflow.  Our largest block size is
  80  * 128k, which has 32k 4-byte words, so we can compute the largest possible
  81  * accumulators, then divide by 2^64 to figure the max amount of overflow:
  82  *
  83  * % bc
  84  *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
  85  *  a/2^64;b/2^64;c/2^64;d/2^64
  86  * 0
  87  * 0
  88  * 1365
  89  * 11186858
  90  *  quit
  91  * %
  92  *
  93  * So a and b cannot overflow.  To make sure each bit of input has some
  94  * effect on the contents of c and d, we can look at what the factors of
  95  * the coefficients in the equations for c_n and d_n are.  The number of 2s
  96  * in the factors determines the lowest set bit in the multiplier.  Running
  97  * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
  98  * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
  99  * the 64-bit accumulators, every bit of every f_i effects every accumulator,
 100  * even for 128k blocks.
 101  *
 102  * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
 103  * we could do our calculations mod (2^32 - 1) by adding in the carries
 104  * periodically, and store the number of carries in the top 32-bits.
 105  *
 106  * --------------------
 107  * Checksum Performance
 108  * --------------------
 109  *
 110  * There are two interesting components to checksum performance: cached and
 111  * uncached performance.  With cached data, fletcher-2 is about four times
 112  * faster than fletcher-4.  With uncached data, the performance difference is
 113  * negligible, since the cost of a cache fill dominates the processing time.
 114  * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 115  * efficient pass over the data.
 116  *
 117  * In normal operation, the data which is being checksummed is in a buffer
 118  * which has been filled either by:
 119  *
 120  *      1. a compression step, which will be mostly cached, or
 121  *      2. a bcopy() or copyin(), which will be uncached (because the
 122  *         copy is cache-bypassing).
 123  *
 124  * For both cached and uncached data, both fletcher checksums are much faster
 125  * than sha-256, and slower than 'off', which doesn't touch the data at all.
 126  */
 127
 128 #include <sys/types.h>
 129 #include <sys/sysmacros.h>
 130 #include <sys/byteorder.h>
 131 #include <sys/spa.h>
 132 #include <sys/zfs_context.h>
 133 #include <zfs_fletcher.h>
 134
 135 static void fletcher_4_scalar_init(zio_cksum_t *zcp);
 136 static void fletcher_4_scalar(const void *buf, uint64_t size,
 137     zio_cksum_t *zcp);
 138 static void fletcher_4_scalar_byteswap(const void *buf, uint64_t size,
 139     zio_cksum_t *zcp);
 140 static boolean_t fletcher_4_scalar_valid(void);
 141
 142 static const fletcher_4_ops_t fletcher_4_scalar_ops = {
 143         .init = fletcher_4_scalar_init,
 144         .compute = fletcher_4_scalar,
 145         .compute_byteswap = fletcher_4_scalar_byteswap,
 146         .valid = fletcher_4_scalar_valid,
 147         .name = "scalar"
 148 };
 149
 150 static const fletcher_4_ops_t *fletcher_4_algos[] = {
 151         &fletcher_4_scalar_ops,
 152 #if defined(HAVE_AVX) && defined(HAVE_AVX2)
 153         &fletcher_4_avx2_ops,
 154 #endif
 155 };
 156
 157 static enum fletcher_selector {
 158         FLETCHER_FASTEST = 0,
 159         FLETCHER_SCALAR,
 160 #if defined(HAVE_AVX) && defined(HAVE_AVX2)
 161         FLETCHER_AVX2,
 162 #endif
 163         FLETCHER_CYCLE
 164 } fletcher_4_impl_chosen = FLETCHER_SCALAR;
 165
 166 static struct fletcher_4_impl_selector {
 167         const char              *fis_name;
 168         const fletcher_4_ops_t  *fis_ops;
 169 } fletcher_4_impl_selectors[] = {
 170         [ FLETCHER_FASTEST ]    = { "fastest", NULL },
 171         [ FLETCHER_SCALAR ]     = { "scalar", &fletcher_4_scalar_ops },
 172 #if defined(HAVE_AVX) && defined(HAVE_AVX2)
 173         [ FLETCHER_AVX2 ]       = { "avx2", &fletcher_4_avx2_ops },
 174 #endif
 175 #if !defined(_KERNEL)
 176         [ FLETCHER_CYCLE ]      = { "cycle", &fletcher_4_scalar_ops }
 177 #endif
 178 };
 179
 180 static kmutex_t fletcher_4_impl_lock;
 181
 182 static kstat_t *fletcher_4_kstat;
 183
 184 static kstat_named_t fletcher_4_kstat_data[ARRAY_SIZE(fletcher_4_algos)];
 185
 186 void
 187 fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 188 {
 189         const uint64_t *ip = buf;
 190         const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 191         uint64_t a0, b0, a1, b1;
 192
 193         for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
 194                 a0 += ip[0];
 195                 a1 += ip[1];
 196                 b0 += a0;
 197                 b1 += a1;
 198         }
 199
 200         ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 201 }
 202
 203 void
 204 fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
 205 {
 206         const uint64_t *ip = buf;
 207         const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 208         uint64_t a0, b0, a1, b1;
 209
 210         for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
 211                 a0 += BSWAP_64(ip[0]);
 212                 a1 += BSWAP_64(ip[1]);
 213                 b0 += a0;
 214                 b1 += a1;
 215         }
 216
 217         ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 218 }
 219
 220 static void fletcher_4_scalar_init(zio_cksum_t *zcp)
 221 {
 222         ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 223 }
 224
 225 static void
 226 fletcher_4_scalar(const void *buf, uint64_t size, zio_cksum_t *zcp)
 227 {
 228         const uint32_t *ip = buf;
 229         const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 230         uint64_t a, b, c, d;
 231
 232         a = zcp->zc_word[0];
 233         b = zcp->zc_word[1];
 234         c = zcp->zc_word[2];
 235         d = zcp->zc_word[3];
 236
 237         for (; ip < ipend; ip++) {
 238                 a += ip[0];
 239                 b += a;
 240                 c += b;
 241                 d += c;
 242         }
 243
 244         ZIO_SET_CHECKSUM(zcp, a, b, c, d);
 245 }
 246
 247 static void
 248 fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
 249 {
 250         const uint32_t *ip = buf;
 251         const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 252         uint64_t a, b, c, d;
 253
 254         a = zcp->zc_word[0];
 255         b = zcp->zc_word[1];
 256         c = zcp->zc_word[2];
 257         d = zcp->zc_word[3];
 258
 259         for (; ip < ipend; ip++) {
 260                 a += BSWAP_32(ip[0]);
 261                 b += a;
 262                 c += b;
 263                 d += c;
 264         }
 265
 266         ZIO_SET_CHECKSUM(zcp, a, b, c, d);
 267 }
 268
 269 static boolean_t
 270 fletcher_4_scalar_valid(void)
 271 {
 272         return (B_TRUE);
 273 }
 274
 275 int
 276 fletcher_4_impl_set(const char *val)
 277 {
 278         const fletcher_4_ops_t *ops;
 279         enum fletcher_selector idx;
 280         size_t val_len;
 281         unsigned i;
 282
 283         val_len = strlen(val);
 284         while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
 285                 val_len--;
 286
 287         for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
 288                 const char *name = fletcher_4_impl_selectors[i].fis_name;
 289
 290                 if (val_len == strlen(name) &&
 291                     strncmp(val, name, val_len) == 0) {
 292                         idx = i;
 293                         break;
 294                 }
 295         }
 296         if (i >= ARRAY_SIZE(fletcher_4_impl_selectors))
 297                 return (-EINVAL);
 298
 299         ops = fletcher_4_impl_selectors[idx].fis_ops;
 300         if (ops == NULL || !ops->valid())
 301                 return (-ENOTSUP);
 302
 303         mutex_enter(&fletcher_4_impl_lock);
 304         if (fletcher_4_impl_chosen != idx)
 305                 fletcher_4_impl_chosen = idx;
 306         mutex_exit(&fletcher_4_impl_lock);
 307
 308         return (0);
 309 }
 310
 311 static inline const fletcher_4_ops_t *
 312 fletcher_4_impl_get(void)
 313 {
 314 #if !defined(_KERNEL)
 315         if (fletcher_4_impl_chosen == FLETCHER_CYCLE) {
 316                 static volatile unsigned int cycle_count = 0;
 317                 const fletcher_4_ops_t *ops = NULL;
 318                 unsigned int index;
 319
 320                 while (1) {
 321                         index = atomic_inc_uint_nv(&cycle_count);
 322                         ops = fletcher_4_algos[
 323                             index % ARRAY_SIZE(fletcher_4_algos)];
 324                         if (ops->valid())
 325                                 break;
 326                 }
 327                 return (ops);
 328         }
 329 #endif
 330         membar_producer();
 331         return (fletcher_4_impl_selectors[fletcher_4_impl_chosen].fis_ops);
 332 }
 333
 334 void
 335 fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 336 {
 337         const fletcher_4_ops_t *ops;
 338
 339         if (IS_P2ALIGNED(size, 4 * sizeof (uint32_t)))
 340                 ops = fletcher_4_impl_get();
 341         else
 342                 ops = &fletcher_4_scalar_ops;
 343
 344         ops->init(zcp);
 345         ops->compute(buf, size, zcp);
 346         if (ops->fini != NULL)
 347                 ops->fini(zcp);
 348 }
 349
 350 void
 351 fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
 352 {
 353         const fletcher_4_ops_t *ops;
 354
 355         if (IS_P2ALIGNED(size, 4 * sizeof (uint32_t)))
 356                 ops = fletcher_4_impl_get();
 357         else
 358                 ops = &fletcher_4_scalar_ops;
 359
 360         ops->init(zcp);
 361         ops->compute_byteswap(buf, size, zcp);
 362         if (ops->fini != NULL)
 363                 ops->fini(zcp);
 364 }
 365
 366 void
 367 fletcher_4_incremental_native(const void *buf, uint64_t size,
 368     zio_cksum_t *zcp)
 369 {
 370         fletcher_4_scalar(buf, size, zcp);
 371 }
 372
 373 void
 374 fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
 375     zio_cksum_t *zcp)
 376 {
 377         fletcher_4_scalar_byteswap(buf, size, zcp);
 378 }
 379
 380 void
 381 fletcher_4_init(void)
 382 {
 383         const uint64_t const bench_ns = (50 * MICROSEC); /* 50ms */
 384         unsigned long best_run_count = 0;
 385         unsigned long best_run_index = 0;
 386         const unsigned data_size = 4096;
 387         char *databuf;
 388         int i;
 389
 390         databuf = kmem_alloc(data_size, KM_SLEEP);
 391         for (i = 0; i < ARRAY_SIZE(fletcher_4_algos); i++) {
 392                 const fletcher_4_ops_t *ops = fletcher_4_algos[i];
 393                 kstat_named_t *stat = &fletcher_4_kstat_data[i];
 394                 unsigned long run_count = 0;
 395                 hrtime_t start;
 396                 zio_cksum_t zc;
 397
 398                 strncpy(stat->name, ops->name, sizeof (stat->name) - 1);
 399                 stat->data_type = KSTAT_DATA_UINT64;
 400                 stat->value.ui64 = 0;
 401
 402                 if (!ops->valid())
 403                         continue;
 404
 405                 kpreempt_disable();
 406                 start = gethrtime();
 407                 ops->init(&zc);
 408                 do {
 409                         ops->compute(databuf, data_size, &zc);
 410                         run_count++;
 411                 } while (gethrtime() < start + bench_ns);
 412                 if (ops->fini != NULL)
 413                         ops->fini(&zc);
 414                 kpreempt_enable();
 415
 416                 if (run_count > best_run_count) {
 417                         best_run_count = run_count;
 418                         best_run_index = i;
 419                 }
 420
 421                 /*
 422                  * Due to high overhead of gethrtime(), the performance data
 423                  * here is inaccurate and much slower than it could be.
 424                  * It's fine for our use though because only relative speed
 425                  * is important.
 426                  */
 427                 stat->value.ui64 = data_size * run_count *
 428                     (NANOSEC / bench_ns) >> 20; /* by MB/s */
 429         }
 430         kmem_free(databuf, data_size);
 431
 432         fletcher_4_impl_selectors[FLETCHER_FASTEST].fis_ops =
 433             fletcher_4_algos[best_run_index];
 434
 435         mutex_init(&fletcher_4_impl_lock, NULL, MUTEX_DEFAULT, NULL);
 436         fletcher_4_impl_set("fastest");
 437
 438         fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench",
 439             "misc", KSTAT_TYPE_NAMED, ARRAY_SIZE(fletcher_4_algos),
 440             KSTAT_FLAG_VIRTUAL);
 441         if (fletcher_4_kstat != NULL) {
 442                 fletcher_4_kstat->ks_data = fletcher_4_kstat_data;
 443                 kstat_install(fletcher_4_kstat);
 444         }
 445 }
 446
 447 void
 448 fletcher_4_fini(void)
 449 {
 450         mutex_destroy(&fletcher_4_impl_lock);
 451         if (fletcher_4_kstat != NULL) {
 452                 kstat_delete(fletcher_4_kstat);
 453                 fletcher_4_kstat = NULL;
 454         }
 455 }
 456
 457 #if defined(_KERNEL) && defined(HAVE_SPL)
 458
 459 static int
 460 fletcher_4_param_get(char *buffer, struct kernel_param *unused)
 461 {
 462         int i, cnt = 0;
 463
 464         for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
 465                 const fletcher_4_ops_t *ops;
 466
 467                 ops = fletcher_4_impl_selectors[i].fis_ops;
 468                 if (!ops->valid())
 469                         continue;
 470
 471                 cnt += sprintf(buffer + cnt,
 472                     fletcher_4_impl_chosen == i ? "[%s] " : "%s ",
 473                     fletcher_4_impl_selectors[i].fis_name);
 474         }
 475
 476         return (cnt);
 477 }
 478
 479 static int
 480 fletcher_4_param_set(const char *val, struct kernel_param *unused)
 481 {
 482         return (fletcher_4_impl_set(val));
 483 }
 484
 485 /*
 486  * Choose a fletcher 4 implementation in ZFS.
 487  * Users can choose the "fastest" algorithm, or "scalar" and "avx2" which means
 488  * to compute fletcher 4 by CPU or vector instructions respectively.
 489  * Users can also choose "cycle" to exercise all implementions, but this is
 490  * for testing purpose therefore it can only be set in user space.
 491  */
 492 module_param_call(zfs_fletcher_4_impl,
 493     fletcher_4_param_set, fletcher_4_param_get, NULL, 0644);
 494 MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 algorithm");
 495
 496 EXPORT_SYMBOL(fletcher_4_init);
 497 EXPORT_SYMBOL(fletcher_4_fini);
 498 EXPORT_SYMBOL(fletcher_2_native);
 499 EXPORT_SYMBOL(fletcher_2_byteswap);
 500 EXPORT_SYMBOL(fletcher_4_native);
 501 EXPORT_SYMBOL(fletcher_4_byteswap);
 502 EXPORT_SYMBOL(fletcher_4_incremental_native);
 503 EXPORT_SYMBOL(fletcher_4_incremental_byteswap);
 504 #endif