4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
26 #include <sys/zfs_context.h>
28 #include <sys/vdev_impl.h>
30 #include <sys/zio_checksum.h>
31 #include <sys/fs/zfs.h>
32 #include <sys/fm/fs/zfs.h>
35 * Virtual device vector for RAID-Z.
37 * This vdev supports single, double, and triple parity. For single parity,
38 * we use a simple XOR of all the data columns. For double or triple parity,
39 * we use a special case of Reed-Solomon coding. This extends the
40 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
41 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
42 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
43 * former is also based. The latter is designed to provide higher performance
46 * Note that the Plank paper claimed to support arbitrary N+M, but was then
47 * amended six years later identifying a critical flaw that invalidates its
48 * claims. Nevertheless, the technique can be adapted to work for up to
49 * triple parity. For additional parity, the amendment "Note: Correction to
50 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
51 * is viable, but the additional complexity means that write performance will
54 * All of the methods above operate on a Galois field, defined over the
55 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
56 * can be expressed with a single byte. Briefly, the operations on the
57 * field are defined as follows:
59 * o addition (+) is represented by a bitwise XOR
60 * o subtraction (-) is therefore identical to addition: A + B = A - B
61 * o multiplication of A by 2 is defined by the following bitwise expression:
65 * (A * 2)_4 = A_3 + A_7
66 * (A * 2)_3 = A_2 + A_7
67 * (A * 2)_2 = A_1 + A_7
71 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
72 * As an aside, this multiplication is derived from the error correcting
73 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75 * Observe that any number in the field (except for 0) can be expressed as a
76 * power of 2 -- a generator for the field. We store a table of the powers of
77 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
78 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
79 * than field addition). The inverse of a field element A (A^-1) is therefore
80 * A ^ (255 - 1) = A^254.
82 * The up-to-three parity columns, P, Q, R over several data columns,
83 * D_0, ... D_n-1, can be expressed by field operations:
85 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
86 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
87 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
88 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
89 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
92 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
93 * independent coefficients. (There are no additional coefficients that have
94 * this property which is why the uncorrected Plank method breaks down.)
96 * See the reconstruction code below for how P, Q and R can used individually
97 * or in concert to recover missing data columns.
100 typedef struct raidz_col {
101 uint64_t rc_devidx; /* child device index for I/O */
102 uint64_t rc_offset; /* device offset */
103 uint64_t rc_size; /* I/O size */
104 void *rc_data; /* I/O data */
105 void *rc_gdata; /* used to store the "good" version */
106 int rc_error; /* I/O error for this device */
107 uint8_t rc_tried; /* Did we attempt this I/O column? */
108 uint8_t rc_skipped; /* Did we skip this I/O column? */
111 typedef struct raidz_map {
112 uint64_t rm_cols; /* Regular column count */
113 uint64_t rm_scols; /* Count including skipped columns */
114 uint64_t rm_bigcols; /* Number of oversized columns */
115 uint64_t rm_asize; /* Actual total I/O size */
116 uint64_t rm_missingdata; /* Count of missing data devices */
117 uint64_t rm_missingparity; /* Count of missing parity devices */
118 uint64_t rm_firstdatacol; /* First data column/parity count */
119 uint64_t rm_nskip; /* Skipped sectors for padding */
120 uint64_t rm_skipstart; /* Column index of padding start */
121 void *rm_datacopy; /* rm_asize-buffer of copied data */
122 uintptr_t rm_reports; /* # of referencing checksum reports */
123 uint8_t rm_freed; /* map no longer has referencing ZIO */
124 uint8_t rm_ecksuminjected; /* checksum error was injected */
125 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
128 #define VDEV_RAIDZ_P 0
129 #define VDEV_RAIDZ_Q 1
130 #define VDEV_RAIDZ_R 2
132 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
133 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
136 * We provide a mechanism to perform the field multiplication operation on a
137 * 64-bit value all at once rather than a byte at a time. This works by
138 * creating a mask from the top bit in each byte and using that to
139 * conditionally apply the XOR of 0x1d.
141 #define VDEV_RAIDZ_64MUL_2(x, mask) \
143 (mask) = (x) & 0x8080808080808080ULL; \
144 (mask) = ((mask) << 1) - ((mask) >> 7); \
145 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
146 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
149 #define VDEV_RAIDZ_64MUL_4(x, mask) \
151 VDEV_RAIDZ_64MUL_2((x), mask); \
152 VDEV_RAIDZ_64MUL_2((x), mask); \
156 * Force reconstruction to use the general purpose method.
158 int vdev_raidz_default_to_general;
161 * These two tables represent powers and logs of 2 in the Galois field defined
162 * above. These values were computed by repeatedly multiplying by 2 as above.
164 static const uint8_t vdev_raidz_pow2[256] = {
165 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
166 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
167 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
168 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
169 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
170 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
171 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
172 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
173 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
174 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
175 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
176 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
177 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
178 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
179 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
180 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
181 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
182 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
183 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
184 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
185 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
186 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
187 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
188 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
189 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
190 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
191 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
192 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
193 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
194 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
195 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
196 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
198 static const uint8_t vdev_raidz_log2[256] = {
199 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
200 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
201 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
202 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
203 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
204 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
205 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
206 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
207 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
208 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
209 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
210 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
211 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
212 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
213 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
214 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
215 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
216 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
217 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
218 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
219 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
220 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
221 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
222 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
223 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
224 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
225 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
226 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
227 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
228 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
229 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
230 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
233 static void vdev_raidz_generate_parity(raidz_map_t *rm);
236 * Multiply a given number by 2 raised to the given power.
239 vdev_raidz_exp2(uint_t a, int exp)
245 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
247 exp += vdev_raidz_log2[a];
251 return (vdev_raidz_pow2[exp]);
255 vdev_raidz_map_free(raidz_map_t *rm)
260 for (c = 0; c < rm->rm_firstdatacol; c++) {
261 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
263 if (rm->rm_col[c].rc_gdata != NULL)
264 zio_buf_free(rm->rm_col[c].rc_gdata,
265 rm->rm_col[c].rc_size);
269 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
270 size += rm->rm_col[c].rc_size;
272 if (rm->rm_datacopy != NULL)
273 zio_buf_free(rm->rm_datacopy, size);
275 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
279 vdev_raidz_map_free_vsd(zio_t *zio)
281 raidz_map_t *rm = zio->io_vsd;
283 ASSERT3U(rm->rm_freed, ==, 0);
286 if (rm->rm_reports == 0)
287 vdev_raidz_map_free(rm);
292 vdev_raidz_cksum_free(void *arg, size_t ignored)
294 raidz_map_t *rm = arg;
296 ASSERT3U(rm->rm_reports, >, 0);
298 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
299 vdev_raidz_map_free(rm);
303 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
305 raidz_map_t *rm = zcr->zcr_cbdata;
306 size_t c = zcr->zcr_cbinfo;
309 const char *good = NULL;
310 const char *bad = rm->rm_col[c].rc_data;
312 if (good_data == NULL) {
313 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
317 if (c < rm->rm_firstdatacol) {
319 * The first time through, calculate the parity blocks for
320 * the good data (this relies on the fact that the good
321 * data never changes for a given logical ZIO)
323 if (rm->rm_col[0].rc_gdata == NULL) {
324 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
328 * Set up the rm_col[]s to generate the parity for
329 * good_data, first saving the parity bufs and
330 * replacing them with buffers to hold the result.
332 for (x = 0; x < rm->rm_firstdatacol; x++) {
333 bad_parity[x] = rm->rm_col[x].rc_data;
334 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
335 zio_buf_alloc(rm->rm_col[x].rc_size);
338 /* fill in the data columns from good_data */
339 buf = (char *)good_data;
340 for (; x < rm->rm_cols; x++) {
341 rm->rm_col[x].rc_data = buf;
342 buf += rm->rm_col[x].rc_size;
346 * Construct the parity from the good data.
348 vdev_raidz_generate_parity(rm);
350 /* restore everything back to its original state */
351 for (x = 0; x < rm->rm_firstdatacol; x++)
352 rm->rm_col[x].rc_data = bad_parity[x];
354 buf = rm->rm_datacopy;
355 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
356 rm->rm_col[x].rc_data = buf;
357 buf += rm->rm_col[x].rc_size;
361 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
362 good = rm->rm_col[c].rc_gdata;
364 /* adjust good_data to point at the start of our column */
367 for (x = rm->rm_firstdatacol; x < c; x++)
368 good += rm->rm_col[x].rc_size;
371 /* we drop the ereport if it ends up that the data was good */
372 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
376 * Invoked indirectly by zfs_ereport_start_checksum(), called
377 * below when our read operation fails completely. The main point
378 * is to keep a copy of everything we read from disk, so that at
379 * vdev_raidz_cksum_finish() time we can compare it with the good data.
382 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
384 size_t c = (size_t)(uintptr_t)arg;
387 raidz_map_t *rm = zio->io_vsd;
390 /* set up the report and bump the refcount */
391 zcr->zcr_cbdata = rm;
393 zcr->zcr_finish = vdev_raidz_cksum_finish;
394 zcr->zcr_free = vdev_raidz_cksum_free;
397 ASSERT3U(rm->rm_reports, >, 0);
399 if (rm->rm_datacopy != NULL)
403 * It's the first time we're called for this raidz_map_t, so we need
404 * to copy the data aside; there's no guarantee that our zio's buffer
405 * won't be re-used for something else.
407 * Our parity data is already in separate buffers, so there's no need
412 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
413 size += rm->rm_col[c].rc_size;
415 buf = rm->rm_datacopy = zio_buf_alloc(size);
417 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
418 raidz_col_t *col = &rm->rm_col[c];
420 bcopy(col->rc_data, buf, col->rc_size);
425 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
428 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
429 vdev_raidz_map_free_vsd,
430 vdev_raidz_cksum_report
434 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
438 uint64_t b = zio->io_offset >> unit_shift;
439 uint64_t s = zio->io_size >> unit_shift;
440 uint64_t f = b % dcols;
441 uint64_t o = (b / dcols) << unit_shift;
442 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
444 q = s / (dcols - nparity);
445 r = s - q * (dcols - nparity);
446 bc = (r == 0 ? 0 : r + nparity);
447 tot = s + nparity * (q + (r == 0 ? 0 : 1));
451 scols = MIN(dcols, roundup(bc, nparity + 1));
457 ASSERT3U(acols, <=, scols);
459 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
462 rm->rm_scols = scols;
464 rm->rm_skipstart = bc;
465 rm->rm_missingdata = 0;
466 rm->rm_missingparity = 0;
467 rm->rm_firstdatacol = nparity;
468 rm->rm_datacopy = NULL;
471 rm->rm_ecksuminjected = 0;
475 for (c = 0; c < scols; c++) {
480 coff += 1ULL << unit_shift;
482 rm->rm_col[c].rc_devidx = col;
483 rm->rm_col[c].rc_offset = coff;
484 rm->rm_col[c].rc_data = NULL;
485 rm->rm_col[c].rc_gdata = NULL;
486 rm->rm_col[c].rc_error = 0;
487 rm->rm_col[c].rc_tried = 0;
488 rm->rm_col[c].rc_skipped = 0;
491 rm->rm_col[c].rc_size = 0;
493 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
495 rm->rm_col[c].rc_size = q << unit_shift;
497 asize += rm->rm_col[c].rc_size;
500 ASSERT3U(asize, ==, tot << unit_shift);
501 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
502 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
503 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
504 ASSERT3U(rm->rm_nskip, <=, nparity);
506 for (c = 0; c < rm->rm_firstdatacol; c++)
507 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
509 rm->rm_col[c].rc_data = zio->io_data;
511 for (c = c + 1; c < acols; c++)
512 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
513 rm->rm_col[c - 1].rc_size;
516 * If all data stored spans all columns, there's a danger that parity
517 * will always be on the same device and, since parity isn't read
518 * during normal operation, that that device's I/O bandwidth won't be
519 * used effectively. We therefore switch the parity every 1MB.
521 * ... at least that was, ostensibly, the theory. As a practical
522 * matter unless we juggle the parity between all devices evenly, we
523 * won't see any benefit. Further, occasional writes that aren't a
524 * multiple of the LCM of the number of children and the minimum
525 * stripe width are sufficient to avoid pessimal behavior.
526 * Unfortunately, this decision created an implicit on-disk format
527 * requirement that we need to support for all eternity, but only
528 * for single-parity RAID-Z.
530 * If we intend to skip a sector in the zeroth column for padding
531 * we must make sure to note this swap. We will never intend to
532 * skip the first column since at least one data and one parity
533 * column must appear in each row.
535 ASSERT(rm->rm_cols >= 2);
536 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
538 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
539 devidx = rm->rm_col[0].rc_devidx;
540 o = rm->rm_col[0].rc_offset;
541 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
542 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
543 rm->rm_col[1].rc_devidx = devidx;
544 rm->rm_col[1].rc_offset = o;
546 if (rm->rm_skipstart == 0)
547 rm->rm_skipstart = 1;
551 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
556 vdev_raidz_generate_parity_p(raidz_map_t *rm)
558 uint64_t *p, *src, pcount, ccount, i;
561 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
563 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
564 src = rm->rm_col[c].rc_data;
565 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
566 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
568 if (c == rm->rm_firstdatacol) {
569 ASSERT(ccount == pcount);
570 for (i = 0; i < ccount; i++, src++, p++) {
574 ASSERT(ccount <= pcount);
575 for (i = 0; i < ccount; i++, src++, p++) {
583 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
585 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
588 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
589 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
590 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
592 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
593 src = rm->rm_col[c].rc_data;
594 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
595 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
597 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
599 if (c == rm->rm_firstdatacol) {
600 ASSERT(ccnt == pcnt || ccnt == 0);
601 for (i = 0; i < ccnt; i++, src++, p++, q++) {
605 for (; i < pcnt; i++, src++, p++, q++) {
610 ASSERT(ccnt <= pcnt);
613 * Apply the algorithm described above by multiplying
614 * the previous result and adding in the new value.
616 for (i = 0; i < ccnt; i++, src++, p++, q++) {
619 VDEV_RAIDZ_64MUL_2(*q, mask);
624 * Treat short columns as though they are full of 0s.
625 * Note that there's therefore nothing needed for P.
627 for (; i < pcnt; i++, q++) {
628 VDEV_RAIDZ_64MUL_2(*q, mask);
635 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
637 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
640 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
641 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
642 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
643 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
644 rm->rm_col[VDEV_RAIDZ_R].rc_size);
646 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
647 src = rm->rm_col[c].rc_data;
648 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
649 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
650 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
652 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
654 if (c == rm->rm_firstdatacol) {
655 ASSERT(ccnt == pcnt || ccnt == 0);
656 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
661 for (; i < pcnt; i++, src++, p++, q++, r++) {
667 ASSERT(ccnt <= pcnt);
670 * Apply the algorithm described above by multiplying
671 * the previous result and adding in the new value.
673 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
676 VDEV_RAIDZ_64MUL_2(*q, mask);
679 VDEV_RAIDZ_64MUL_4(*r, mask);
684 * Treat short columns as though they are full of 0s.
685 * Note that there's therefore nothing needed for P.
687 for (; i < pcnt; i++, q++, r++) {
688 VDEV_RAIDZ_64MUL_2(*q, mask);
689 VDEV_RAIDZ_64MUL_4(*r, mask);
696 * Generate RAID parity in the first virtual columns according to the number of
697 * parity columns available.
700 vdev_raidz_generate_parity(raidz_map_t *rm)
702 switch (rm->rm_firstdatacol) {
704 vdev_raidz_generate_parity_p(rm);
707 vdev_raidz_generate_parity_pq(rm);
710 vdev_raidz_generate_parity_pqr(rm);
713 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
718 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
720 uint64_t *dst, *src, xcount, ccount, count, i;
725 ASSERT(x >= rm->rm_firstdatacol);
726 ASSERT(x < rm->rm_cols);
728 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
729 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
732 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
733 dst = rm->rm_col[x].rc_data;
734 for (i = 0; i < xcount; i++, dst++, src++) {
738 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
739 src = rm->rm_col[c].rc_data;
740 dst = rm->rm_col[x].rc_data;
745 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
746 count = MIN(ccount, xcount);
748 for (i = 0; i < count; i++, dst++, src++) {
753 return (1 << VDEV_RAIDZ_P);
757 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
759 uint64_t *dst, *src, xcount, ccount, count, mask, i;
766 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
767 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
769 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
770 src = rm->rm_col[c].rc_data;
771 dst = rm->rm_col[x].rc_data;
776 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
778 count = MIN(ccount, xcount);
780 if (c == rm->rm_firstdatacol) {
781 for (i = 0; i < count; i++, dst++, src++) {
784 for (; i < xcount; i++, dst++) {
789 for (i = 0; i < count; i++, dst++, src++) {
790 VDEV_RAIDZ_64MUL_2(*dst, mask);
794 for (; i < xcount; i++, dst++) {
795 VDEV_RAIDZ_64MUL_2(*dst, mask);
800 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
801 dst = rm->rm_col[x].rc_data;
802 exp = 255 - (rm->rm_cols - 1 - x);
804 for (i = 0; i < xcount; i++, dst++, src++) {
806 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
807 *b = vdev_raidz_exp2(*b, exp);
811 return (1 << VDEV_RAIDZ_Q);
815 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
817 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
819 uint64_t xsize, ysize, i;
825 ASSERT(x >= rm->rm_firstdatacol);
826 ASSERT(y < rm->rm_cols);
828 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
831 * Move the parity data aside -- we're going to compute parity as
832 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
833 * reuse the parity generation mechanism without trashing the actual
834 * parity so we make those columns appear to be full of zeros by
835 * setting their lengths to zero.
837 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
838 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
839 xsize = rm->rm_col[x].rc_size;
840 ysize = rm->rm_col[y].rc_size;
842 rm->rm_col[VDEV_RAIDZ_P].rc_data =
843 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
844 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
845 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
846 rm->rm_col[x].rc_size = 0;
847 rm->rm_col[y].rc_size = 0;
849 vdev_raidz_generate_parity_pq(rm);
851 rm->rm_col[x].rc_size = xsize;
852 rm->rm_col[y].rc_size = ysize;
856 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
857 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
858 xd = rm->rm_col[x].rc_data;
859 yd = rm->rm_col[y].rc_data;
863 * Pxy = P + D_x + D_y
864 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
866 * We can then solve for D_x:
867 * D_x = A * (P + Pxy) + B * (Q + Qxy)
869 * A = 2^(x - y) * (2^(x - y) + 1)^-1
870 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
872 * With D_x in hand, we can easily solve for D_y:
873 * D_y = P + Pxy + D_x
876 a = vdev_raidz_pow2[255 + x - y];
877 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
878 tmp = 255 - vdev_raidz_log2[a ^ 1];
880 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
881 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
883 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
884 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
885 vdev_raidz_exp2(*q ^ *qxy, bexp);
888 *yd = *p ^ *pxy ^ *xd;
891 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
892 rm->rm_col[VDEV_RAIDZ_P].rc_size);
893 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
894 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
897 * Restore the saved parity data.
899 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
900 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
902 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
907 * In the general case of reconstruction, we must solve the system of linear
908 * equations defined by the coeffecients used to generate parity as well as
909 * the contents of the data and parity disks. This can be expressed with
910 * vectors for the original data (D) and the actual data (d) and parity (p)
911 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
915 * | V | | D_0 | | p_m-1 |
916 * | | x | : | = | d_0 |
917 * | I | | D_n-1 | | : |
918 * | | ~~ ~~ | d_n-1 |
921 * I is simply a square identity matrix of size n, and V is a vandermonde
922 * matrix defined by the coeffecients we chose for the various parity columns
923 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
924 * computation as well as linear separability.
927 * | 1 .. 1 1 1 | | p_0 |
928 * | 2^n-1 .. 4 2 1 | __ __ | : |
929 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
930 * | 1 .. 0 0 0 | | D_1 | | d_0 |
931 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
932 * | : : : : | | : | | d_2 |
933 * | 0 .. 1 0 0 | | D_n-1 | | : |
934 * | 0 .. 0 1 0 | ~~ ~~ | : |
935 * | 0 .. 0 0 1 | | d_n-1 |
938 * Note that I, V, d, and p are known. To compute D, we must invert the
939 * matrix and use the known data and parity values to reconstruct the unknown
940 * data values. We begin by removing the rows in V|I and d|p that correspond
941 * to failed or missing columns; we then make V|I square (n x n) and d|p
942 * sized n by removing rows corresponding to unused parity from the bottom up
943 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
944 * using Gauss-Jordan elimination. In the example below we use m=3 parity
945 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
947 * | 1 1 1 1 1 1 1 1 |
948 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
949 * | 19 205 116 29 64 16 4 1 | / /
950 * | 1 0 0 0 0 0 0 0 | / /
951 * | 0 1 0 0 0 0 0 0 | <--' /
952 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
953 * | 0 0 0 1 0 0 0 0 |
954 * | 0 0 0 0 1 0 0 0 |
955 * | 0 0 0 0 0 1 0 0 |
956 * | 0 0 0 0 0 0 1 0 |
957 * | 0 0 0 0 0 0 0 1 |
960 * | 1 1 1 1 1 1 1 1 |
961 * | 128 64 32 16 8 4 2 1 |
962 * | 19 205 116 29 64 16 4 1 |
963 * | 1 0 0 0 0 0 0 0 |
964 * | 0 1 0 0 0 0 0 0 |
965 * (V|I)' = | 0 0 1 0 0 0 0 0 |
966 * | 0 0 0 1 0 0 0 0 |
967 * | 0 0 0 0 1 0 0 0 |
968 * | 0 0 0 0 0 1 0 0 |
969 * | 0 0 0 0 0 0 1 0 |
970 * | 0 0 0 0 0 0 0 1 |
973 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
974 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
975 * matrix is not singular.
977 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
978 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
979 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
980 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
981 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
982 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
983 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
984 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
987 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
988 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
989 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
990 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
991 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
992 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
993 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
994 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
997 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
998 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
999 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1000 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1001 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1002 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1003 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1004 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1007 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1008 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1009 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1010 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1011 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1012 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1013 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1014 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1017 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1018 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1019 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1020 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1021 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1022 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1023 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1024 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1027 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1028 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1029 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1030 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1031 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1032 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1033 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1034 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1037 * | 0 0 1 0 0 0 0 0 |
1038 * | 167 100 5 41 159 169 217 208 |
1039 * | 166 100 4 40 158 168 216 209 |
1040 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1041 * | 0 0 0 0 1 0 0 0 |
1042 * | 0 0 0 0 0 1 0 0 |
1043 * | 0 0 0 0 0 0 1 0 |
1044 * | 0 0 0 0 0 0 0 1 |
1047 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1048 * of the missing data.
1050 * As is apparent from the example above, the only non-trivial rows in the
1051 * inverse matrix correspond to the data disks that we're trying to
1052 * reconstruct. Indeed, those are the only rows we need as the others would
1053 * only be useful for reconstructing data known or assumed to be valid. For
1054 * that reason, we only build the coefficients in the rows that correspond to
1060 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1066 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1069 * Fill in the missing rows of interest.
1071 for (i = 0; i < nmap; i++) {
1072 ASSERT3S(0, <=, map[i]);
1073 ASSERT3S(map[i], <=, 2);
1080 for (j = 0; j < n; j++) {
1084 rows[i][j] = vdev_raidz_pow2[pow];
1090 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1091 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1097 * Assert that the first nmissing entries from the array of used
1098 * columns correspond to parity columns and that subsequent entries
1099 * correspond to data columns.
1101 for (i = 0; i < nmissing; i++) {
1102 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1104 for (; i < n; i++) {
1105 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1109 * First initialize the storage where we'll compute the inverse rows.
1111 for (i = 0; i < nmissing; i++) {
1112 for (j = 0; j < n; j++) {
1113 invrows[i][j] = (i == j) ? 1 : 0;
1118 * Subtract all trivial rows from the rows of consequence.
1120 for (i = 0; i < nmissing; i++) {
1121 for (j = nmissing; j < n; j++) {
1122 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1123 jj = used[j] - rm->rm_firstdatacol;
1125 invrows[i][j] = rows[i][jj];
1131 * For each of the rows of interest, we must normalize it and subtract
1132 * a multiple of it from the other rows.
1134 for (i = 0; i < nmissing; i++) {
1135 for (j = 0; j < missing[i]; j++) {
1136 ASSERT3U(rows[i][j], ==, 0);
1138 ASSERT3U(rows[i][missing[i]], !=, 0);
1141 * Compute the inverse of the first element and multiply each
1142 * element in the row by that value.
1144 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1146 for (j = 0; j < n; j++) {
1147 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1148 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1151 for (ii = 0; ii < nmissing; ii++) {
1155 ASSERT3U(rows[ii][missing[i]], !=, 0);
1157 log = vdev_raidz_log2[rows[ii][missing[i]]];
1159 for (j = 0; j < n; j++) {
1161 vdev_raidz_exp2(rows[i][j], log);
1163 vdev_raidz_exp2(invrows[i][j], log);
1169 * Verify that the data that is left in the rows are properly part of
1170 * an identity matrix.
1172 for (i = 0; i < nmissing; i++) {
1173 for (j = 0; j < n; j++) {
1174 if (j == missing[i]) {
1175 ASSERT3U(rows[i][j], ==, 1);
1177 ASSERT3U(rows[i][j], ==, 0);
1184 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1185 int *missing, uint8_t **invrows, const uint8_t *used)
1190 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1191 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1192 uint8_t log = 0, val;
1194 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1198 psize = sizeof (invlog[0][0]) * n * nmissing;
1199 p = kmem_alloc(psize, KM_SLEEP);
1201 for (pp = p, i = 0; i < nmissing; i++) {
1206 for (i = 0; i < nmissing; i++) {
1207 for (j = 0; j < n; j++) {
1208 ASSERT3U(invrows[i][j], !=, 0);
1209 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1213 for (i = 0; i < n; i++) {
1215 ASSERT3U(c, <, rm->rm_cols);
1217 src = rm->rm_col[c].rc_data;
1218 ccount = rm->rm_col[c].rc_size;
1219 for (j = 0; j < nmissing; j++) {
1220 cc = missing[j] + rm->rm_firstdatacol;
1221 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1222 ASSERT3U(cc, <, rm->rm_cols);
1223 ASSERT3U(cc, !=, c);
1225 dst[j] = rm->rm_col[cc].rc_data;
1226 dcount[j] = rm->rm_col[cc].rc_size;
1229 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1231 for (x = 0; x < ccount; x++, src++) {
1233 log = vdev_raidz_log2[*src];
1235 for (cc = 0; cc < nmissing; cc++) {
1236 if (x >= dcount[cc])
1242 if ((ll = log + invlog[cc][i]) >= 255)
1244 val = vdev_raidz_pow2[ll];
1255 kmem_free(p, psize);
1259 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1263 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1264 int parity_map[VDEV_RAIDZ_MAXPARITY];
1269 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1270 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1276 n = rm->rm_cols - rm->rm_firstdatacol;
1279 * Figure out which data columns are missing.
1282 for (t = 0; t < ntgts; t++) {
1283 if (tgts[t] >= rm->rm_firstdatacol) {
1284 missing_rows[nmissing_rows++] =
1285 tgts[t] - rm->rm_firstdatacol;
1290 * Figure out which parity columns to use to help generate the missing
1293 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1295 ASSERT(c < rm->rm_firstdatacol);
1298 * Skip any targeted parity columns.
1300 if (c == tgts[tt]) {
1312 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1314 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1315 nmissing_rows * n + sizeof (used[0]) * n;
1316 p = kmem_alloc(psize, KM_SLEEP);
1318 for (pp = p, i = 0; i < nmissing_rows; i++) {
1326 for (i = 0; i < nmissing_rows; i++) {
1327 used[i] = parity_map[i];
1330 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1331 if (tt < nmissing_rows &&
1332 c == missing_rows[tt] + rm->rm_firstdatacol) {
1343 * Initialize the interesting rows of the matrix.
1345 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1348 * Invert the matrix.
1350 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1354 * Reconstruct the missing data using the generated matrix.
1356 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1359 kmem_free(p, psize);
1365 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1367 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1371 int nbadparity, nbaddata;
1372 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1375 * The tgts list must already be sorted.
1377 for (i = 1; i < nt; i++) {
1378 ASSERT(t[i] > t[i - 1]);
1381 nbadparity = rm->rm_firstdatacol;
1382 nbaddata = rm->rm_cols - nbadparity;
1384 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1385 if (c < rm->rm_firstdatacol)
1386 parity_valid[c] = B_FALSE;
1388 if (i < nt && c == t[i]) {
1391 } else if (rm->rm_col[c].rc_error != 0) {
1393 } else if (c >= rm->rm_firstdatacol) {
1396 parity_valid[c] = B_TRUE;
1401 ASSERT(ntgts >= nt);
1402 ASSERT(nbaddata >= 0);
1403 ASSERT(nbaddata + nbadparity == ntgts);
1405 dt = &tgts[nbadparity];
1408 * See if we can use any of our optimized reconstruction routines.
1410 if (!vdev_raidz_default_to_general) {
1413 if (parity_valid[VDEV_RAIDZ_P])
1414 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1416 ASSERT(rm->rm_firstdatacol > 1);
1418 if (parity_valid[VDEV_RAIDZ_Q])
1419 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1421 ASSERT(rm->rm_firstdatacol > 2);
1425 ASSERT(rm->rm_firstdatacol > 1);
1427 if (parity_valid[VDEV_RAIDZ_P] &&
1428 parity_valid[VDEV_RAIDZ_Q])
1429 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1431 ASSERT(rm->rm_firstdatacol > 2);
1437 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1438 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1444 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1447 uint64_t nparity = vd->vdev_nparity;
1452 ASSERT(nparity > 0);
1454 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1455 vd->vdev_children < nparity + 1) {
1456 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1460 vdev_open_children(vd);
1462 for (c = 0; c < vd->vdev_children; c++) {
1463 cvd = vd->vdev_child[c];
1465 if (cvd->vdev_open_error != 0) {
1466 lasterror = cvd->vdev_open_error;
1471 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1472 *ashift = MAX(*ashift, cvd->vdev_ashift);
1475 *asize *= vd->vdev_children;
1477 if (numerrors > nparity) {
1478 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1486 vdev_raidz_close(vdev_t *vd)
1490 for (c = 0; c < vd->vdev_children; c++)
1491 vdev_close(vd->vdev_child[c]);
1495 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1498 uint64_t ashift = vd->vdev_top->vdev_ashift;
1499 uint64_t cols = vd->vdev_children;
1500 uint64_t nparity = vd->vdev_nparity;
1502 asize = ((psize - 1) >> ashift) + 1;
1503 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1504 asize = roundup(asize, nparity + 1) << ashift;
1510 vdev_raidz_child_done(zio_t *zio)
1512 raidz_col_t *rc = zio->io_private;
1514 rc->rc_error = zio->io_error;
1520 vdev_raidz_io_start(zio_t *zio)
1522 vdev_t *vd = zio->io_vd;
1523 vdev_t *tvd = vd->vdev_top;
1529 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1532 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1534 if (zio->io_type == ZIO_TYPE_WRITE) {
1535 vdev_raidz_generate_parity(rm);
1537 for (c = 0; c < rm->rm_cols; c++) {
1538 rc = &rm->rm_col[c];
1539 cvd = vd->vdev_child[rc->rc_devidx];
1540 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1541 rc->rc_offset, rc->rc_data, rc->rc_size,
1542 zio->io_type, zio->io_priority, 0,
1543 vdev_raidz_child_done, rc));
1547 * Generate optional I/Os for any skipped sectors to improve
1548 * aggregation contiguity.
1550 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1551 ASSERT(c <= rm->rm_scols);
1552 if (c == rm->rm_scols)
1554 rc = &rm->rm_col[c];
1555 cvd = vd->vdev_child[rc->rc_devidx];
1556 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1557 rc->rc_offset + rc->rc_size, NULL,
1558 1 << tvd->vdev_ashift,
1559 zio->io_type, zio->io_priority,
1560 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1563 return (ZIO_PIPELINE_CONTINUE);
1566 ASSERT(zio->io_type == ZIO_TYPE_READ);
1569 * Iterate over the columns in reverse order so that we hit the parity
1570 * last -- any errors along the way will force us to read the parity.
1572 for (c = rm->rm_cols - 1; c >= 0; c--) {
1573 rc = &rm->rm_col[c];
1574 cvd = vd->vdev_child[rc->rc_devidx];
1575 if (!vdev_readable(cvd)) {
1576 if (c >= rm->rm_firstdatacol)
1577 rm->rm_missingdata++;
1579 rm->rm_missingparity++;
1580 rc->rc_error = ENXIO;
1581 rc->rc_tried = 1; /* don't even try */
1585 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1586 if (c >= rm->rm_firstdatacol)
1587 rm->rm_missingdata++;
1589 rm->rm_missingparity++;
1590 rc->rc_error = ESTALE;
1594 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1595 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1596 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1597 rc->rc_offset, rc->rc_data, rc->rc_size,
1598 zio->io_type, zio->io_priority, 0,
1599 vdev_raidz_child_done, rc));
1603 return (ZIO_PIPELINE_CONTINUE);
1608 * Report a checksum error for a child of a RAID-Z device.
1611 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1613 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1615 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1616 zio_bad_cksum_t zbc;
1617 raidz_map_t *rm = zio->io_vsd;
1619 mutex_enter(&vd->vdev_stat_lock);
1620 vd->vdev_stat.vs_checksum_errors++;
1621 mutex_exit(&vd->vdev_stat_lock);
1623 zbc.zbc_has_cksum = 0;
1624 zbc.zbc_injected = rm->rm_ecksuminjected;
1626 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1627 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1633 * We keep track of whether or not there were any injected errors, so that
1634 * any ereports we generate can note it.
1637 raidz_checksum_verify(zio_t *zio)
1639 zio_bad_cksum_t zbc;
1640 raidz_map_t *rm = zio->io_vsd;
1643 bzero(&zbc, sizeof (zio_bad_cksum_t));
1645 ret = zio_checksum_error(zio, &zbc);
1646 if (ret != 0 && zbc.zbc_injected != 0)
1647 rm->rm_ecksuminjected = 1;
1653 * Generate the parity from the data columns. If we tried and were able to
1654 * read the parity without error, verify that the generated parity matches the
1655 * data we read. If it doesn't, we fire off a checksum error. Return the
1656 * number such failures.
1659 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1661 void *orig[VDEV_RAIDZ_MAXPARITY];
1665 for (c = 0; c < rm->rm_firstdatacol; c++) {
1666 rc = &rm->rm_col[c];
1667 if (!rc->rc_tried || rc->rc_error != 0)
1669 orig[c] = zio_buf_alloc(rc->rc_size);
1670 bcopy(rc->rc_data, orig[c], rc->rc_size);
1673 vdev_raidz_generate_parity(rm);
1675 for (c = 0; c < rm->rm_firstdatacol; c++) {
1676 rc = &rm->rm_col[c];
1677 if (!rc->rc_tried || rc->rc_error != 0)
1679 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1680 raidz_checksum_error(zio, rc, orig[c]);
1681 rc->rc_error = ECKSUM;
1684 zio_buf_free(orig[c], rc->rc_size);
1691 * Keep statistics on all the ways that we used parity to correct data.
1693 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1696 vdev_raidz_worst_error(raidz_map_t *rm)
1700 for (c = 0; c < rm->rm_cols; c++)
1701 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1707 * Iterate over all combinations of bad data and attempt a reconstruction.
1708 * Note that the algorithm below is non-optimal because it doesn't take into
1709 * account how reconstruction is actually performed. For example, with
1710 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1711 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1712 * cases we'd only use parity information in column 0.
1715 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1717 raidz_map_t *rm = zio->io_vsd;
1719 void *orig[VDEV_RAIDZ_MAXPARITY];
1720 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1721 int *tgts = &tstore[1];
1722 int curr, next, i, c, n;
1725 ASSERT(total_errors < rm->rm_firstdatacol);
1728 * This simplifies one edge condition.
1732 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1734 * Initialize the targets array by finding the first n columns
1735 * that contain no error.
1737 * If there were no data errors, we need to ensure that we're
1738 * always explicitly attempting to reconstruct at least one
1739 * data column. To do this, we simply push the highest target
1740 * up into the data columns.
1742 for (c = 0, i = 0; i < n; i++) {
1743 if (i == n - 1 && data_errors == 0 &&
1744 c < rm->rm_firstdatacol) {
1745 c = rm->rm_firstdatacol;
1748 while (rm->rm_col[c].rc_error != 0) {
1750 ASSERT3S(c, <, rm->rm_cols);
1757 * Setting tgts[n] simplifies the other edge condition.
1759 tgts[n] = rm->rm_cols;
1762 * These buffers were allocated in previous iterations.
1764 for (i = 0; i < n - 1; i++) {
1765 ASSERT(orig[i] != NULL);
1768 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1778 * Save off the original data that we're going to
1779 * attempt to reconstruct.
1781 for (i = 0; i < n; i++) {
1782 ASSERT(orig[i] != NULL);
1785 ASSERT3S(c, <, rm->rm_cols);
1786 rc = &rm->rm_col[c];
1787 bcopy(rc->rc_data, orig[i], rc->rc_size);
1791 * Attempt a reconstruction and exit the outer loop on
1794 code = vdev_raidz_reconstruct(rm, tgts, n);
1795 if (raidz_checksum_verify(zio) == 0) {
1796 atomic_inc_64(&raidz_corrected[code]);
1798 for (i = 0; i < n; i++) {
1800 rc = &rm->rm_col[c];
1801 ASSERT(rc->rc_error == 0);
1803 raidz_checksum_error(zio, rc,
1805 rc->rc_error = ECKSUM;
1813 * Restore the original data.
1815 for (i = 0; i < n; i++) {
1817 rc = &rm->rm_col[c];
1818 bcopy(orig[i], rc->rc_data, rc->rc_size);
1823 * Find the next valid column after the curr
1826 for (next = tgts[curr] + 1;
1827 next < rm->rm_cols &&
1828 rm->rm_col[next].rc_error != 0; next++)
1831 ASSERT(next <= tgts[curr + 1]);
1834 * If that spot is available, we're done here.
1836 if (next != tgts[curr + 1])
1840 * Otherwise, find the next valid column after
1841 * the previous position.
1843 for (c = tgts[curr - 1] + 1;
1844 rm->rm_col[c].rc_error != 0; c++)
1850 } while (curr != n);
1855 for (i = 0; i < n; i++) {
1856 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1863 vdev_raidz_io_done(zio_t *zio)
1865 vdev_t *vd = zio->io_vd;
1867 raidz_map_t *rm = zio->io_vsd;
1868 raidz_col_t *rc = NULL;
1869 int unexpected_errors = 0;
1870 int parity_errors = 0;
1871 int parity_untried = 0;
1872 int data_errors = 0;
1873 int total_errors = 0;
1875 int tgts[VDEV_RAIDZ_MAXPARITY];
1878 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1880 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1881 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1883 for (c = 0; c < rm->rm_cols; c++) {
1884 rc = &rm->rm_col[c];
1887 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1889 if (c < rm->rm_firstdatacol)
1894 if (!rc->rc_skipped)
1895 unexpected_errors++;
1898 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1903 if (zio->io_type == ZIO_TYPE_WRITE) {
1905 * XXX -- for now, treat partial writes as a success.
1906 * (If we couldn't write enough columns to reconstruct
1907 * the data, the I/O failed. Otherwise, good enough.)
1909 * Now that we support write reallocation, it would be better
1910 * to treat partial failure as real failure unless there are
1911 * no non-degraded top-level vdevs left, and not update DTLs
1912 * if we intend to reallocate.
1915 if (total_errors > rm->rm_firstdatacol)
1916 zio->io_error = vdev_raidz_worst_error(rm);
1921 ASSERT(zio->io_type == ZIO_TYPE_READ);
1923 * There are three potential phases for a read:
1924 * 1. produce valid data from the columns read
1925 * 2. read all disks and try again
1926 * 3. perform combinatorial reconstruction
1928 * Each phase is progressively both more expensive and less likely to
1929 * occur. If we encounter more errors than we can repair or all phases
1930 * fail, we have no choice but to return an error.
1934 * If the number of errors we saw was correctable -- less than or equal
1935 * to the number of parity disks read -- attempt to produce data that
1936 * has a valid checksum. Naturally, this case applies in the absence of
1939 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1940 if (data_errors == 0) {
1941 if (raidz_checksum_verify(zio) == 0) {
1943 * If we read parity information (unnecessarily
1944 * as it happens since no reconstruction was
1945 * needed) regenerate and verify the parity.
1946 * We also regenerate parity when resilvering
1947 * so we can write it out to the failed device
1950 if (parity_errors + parity_untried <
1951 rm->rm_firstdatacol ||
1952 (zio->io_flags & ZIO_FLAG_RESILVER)) {
1953 n = raidz_parity_verify(zio, rm);
1954 unexpected_errors += n;
1955 ASSERT(parity_errors + n <=
1956 rm->rm_firstdatacol);
1962 * We either attempt to read all the parity columns or
1963 * none of them. If we didn't try to read parity, we
1964 * wouldn't be here in the correctable case. There must
1965 * also have been fewer parity errors than parity
1966 * columns or, again, we wouldn't be in this code path.
1968 ASSERT(parity_untried == 0);
1969 ASSERT(parity_errors < rm->rm_firstdatacol);
1972 * Identify the data columns that reported an error.
1975 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1976 rc = &rm->rm_col[c];
1977 if (rc->rc_error != 0) {
1978 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1983 ASSERT(rm->rm_firstdatacol >= n);
1985 code = vdev_raidz_reconstruct(rm, tgts, n);
1987 if (raidz_checksum_verify(zio) == 0) {
1988 atomic_inc_64(&raidz_corrected[code]);
1991 * If we read more parity disks than were used
1992 * for reconstruction, confirm that the other
1993 * parity disks produced correct data. This
1994 * routine is suboptimal in that it regenerates
1995 * the parity that we already used in addition
1996 * to the parity that we're attempting to
1997 * verify, but this should be a relatively
1998 * uncommon case, and can be optimized if it
1999 * becomes a problem. Note that we regenerate
2000 * parity when resilvering so we can write it
2001 * out to failed devices later.
2003 if (parity_errors < rm->rm_firstdatacol - n ||
2004 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2005 n = raidz_parity_verify(zio, rm);
2006 unexpected_errors += n;
2007 ASSERT(parity_errors + n <=
2008 rm->rm_firstdatacol);
2017 * This isn't a typical situation -- either we got a read error or
2018 * a child silently returned bad data. Read every block so we can
2019 * try again with as much data and parity as we can track down. If
2020 * we've already been through once before, all children will be marked
2021 * as tried so we'll proceed to combinatorial reconstruction.
2023 unexpected_errors = 1;
2024 rm->rm_missingdata = 0;
2025 rm->rm_missingparity = 0;
2027 for (c = 0; c < rm->rm_cols; c++) {
2028 if (rm->rm_col[c].rc_tried)
2031 zio_vdev_io_redone(zio);
2033 rc = &rm->rm_col[c];
2036 zio_nowait(zio_vdev_child_io(zio, NULL,
2037 vd->vdev_child[rc->rc_devidx],
2038 rc->rc_offset, rc->rc_data, rc->rc_size,
2039 zio->io_type, zio->io_priority, 0,
2040 vdev_raidz_child_done, rc));
2041 } while (++c < rm->rm_cols);
2047 * At this point we've attempted to reconstruct the data given the
2048 * errors we detected, and we've attempted to read all columns. There
2049 * must, therefore, be one or more additional problems -- silent errors
2050 * resulting in invalid data rather than explicit I/O errors resulting
2051 * in absent data. We check if there is enough additional data to
2052 * possibly reconstruct the data and then perform combinatorial
2053 * reconstruction over all possible combinations. If that fails,
2056 if (total_errors > rm->rm_firstdatacol) {
2057 zio->io_error = vdev_raidz_worst_error(rm);
2059 } else if (total_errors < rm->rm_firstdatacol &&
2060 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2062 * If we didn't use all the available parity for the
2063 * combinatorial reconstruction, verify that the remaining
2064 * parity is correct.
2066 if (code != (1 << rm->rm_firstdatacol) - 1)
2067 (void) raidz_parity_verify(zio, rm);
2070 * We're here because either:
2072 * total_errors == rm_first_datacol, or
2073 * vdev_raidz_combrec() failed
2075 * In either case, there is enough bad data to prevent
2078 * Start checksum ereports for all children which haven't
2079 * failed, and the IO wasn't speculative.
2081 zio->io_error = ECKSUM;
2083 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2084 for (c = 0; c < rm->rm_cols; c++) {
2085 rc = &rm->rm_col[c];
2086 if (rc->rc_error == 0) {
2087 zio_bad_cksum_t zbc;
2088 zbc.zbc_has_cksum = 0;
2090 rm->rm_ecksuminjected;
2092 zfs_ereport_start_checksum(
2094 vd->vdev_child[rc->rc_devidx],
2095 zio, rc->rc_offset, rc->rc_size,
2096 (void *)(uintptr_t)c, &zbc);
2103 zio_checksum_verified(zio);
2105 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2106 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2108 * Use the good data we have in hand to repair damaged children.
2110 for (c = 0; c < rm->rm_cols; c++) {
2111 rc = &rm->rm_col[c];
2112 cvd = vd->vdev_child[rc->rc_devidx];
2114 if (rc->rc_error == 0)
2117 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2118 rc->rc_offset, rc->rc_data, rc->rc_size,
2119 ZIO_TYPE_WRITE, zio->io_priority,
2120 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2121 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2127 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2129 if (faulted > vd->vdev_nparity)
2130 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2131 VDEV_AUX_NO_REPLICAS);
2132 else if (degraded + faulted != 0)
2133 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2135 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2138 vdev_ops_t vdev_raidz_ops = {
2142 vdev_raidz_io_start,
2144 vdev_raidz_state_change,
2147 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2148 B_FALSE /* not a leaf vdev */