4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #include <sys/zfs_context.h>
29 #include <sys/vdev_impl.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
36 * Virtual device vector for RAID-Z.
38 * This vdev supports single, double, and triple parity. For single parity,
39 * we use a simple XOR of all the data columns. For double or triple parity,
40 * we use a special case of Reed-Solomon coding. This extends the
41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 * former is also based. The latter is designed to provide higher performance
47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 * amended six years later identifying a critical flaw that invalidates its
49 * claims. Nevertheless, the technique can be adapted to work for up to
50 * triple parity. For additional parity, the amendment "Note: Correction to
51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 * is viable, but the additional complexity means that write performance will
55 * All of the methods above operate on a Galois field, defined over the
56 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57 * can be expressed with a single byte. Briefly, the operations on the
58 * field are defined as follows:
60 * o addition (+) is represented by a bitwise XOR
61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 * o multiplication of A by 2 is defined by the following bitwise expression:
66 * (A * 2)_4 = A_3 + A_7
67 * (A * 2)_3 = A_2 + A_7
68 * (A * 2)_2 = A_1 + A_7
72 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73 * As an aside, this multiplication is derived from the error correcting
74 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
76 * Observe that any number in the field (except for 0) can be expressed as a
77 * power of 2 -- a generator for the field. We store a table of the powers of
78 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80 * than field addition). The inverse of a field element A (A^-1) is therefore
81 * A ^ (255 - 1) = A^254.
83 * The up-to-three parity columns, P, Q, R over several data columns,
84 * D_0, ... D_n-1, can be expressed by field operations:
86 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
87 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
92 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
93 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94 * independent coefficients. (There are no additional coefficients that have
95 * this property which is why the uncorrected Plank method breaks down.)
97 * See the reconstruction code below for how P, Q and R can used individually
98 * or in concert to recover missing data columns.
/*
 * Per-column state for one RAID-Z I/O: the child device the column maps
 * to, its offset and size on that device, its data buffer, and error /
 * attempt bookkeeping used by the read-retry and reconstruction paths.
 * NOTE(review): the closing "} raidz_col_t;" line is elided from this
 * extracted listing.
 */
101 typedef struct raidz_col {
102 uint64_t rc_devidx; /* child device index for I/O */
103 uint64_t rc_offset; /* device offset */
104 uint64_t rc_size; /* I/O size */
105 void *rc_data; /* I/O data */
106 int rc_error; /* I/O error for this device */
107 uint8_t rc_tried; /* Did we attempt this I/O column? */
108 uint8_t rc_skipped; /* Did we skip this I/O column? */
/*
 * Geometry of a single logical I/O mapped onto the RAID-Z vdev's
 * children.  Parity columns occupy rm_col[0 .. rm_firstdatacol-1];
 * data columns follow.  rm_col[] is a trailing variable-length array
 * sized to rm_scols at allocation time (see vdev_raidz_map_alloc and
 * the offsetof() sizing in vdev_raidz_map_free).
 */
111 typedef struct raidz_map {
112 uint64_t rm_cols; /* Regular column count */
113 uint64_t rm_scols; /* Count including skipped columns */
114 uint64_t rm_bigcols; /* Number of oversized columns */
115 uint64_t rm_asize; /* Actual total I/O size */
116 uint64_t rm_missingdata; /* Count of missing data devices */
117 uint64_t rm_missingparity; /* Count of missing parity devices */
118 uint64_t rm_firstdatacol; /* First data column/parity count */
119 uint64_t rm_skipped; /* Skipped sectors for padding */
120 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/*
 * Indices of the P, Q, and R parity columns within rm_col[], and the
 * maximum supported parity count (triple parity).
 */
123 #define VDEV_RAIDZ_P 0
124 #define VDEV_RAIDZ_Q 1
125 #define VDEV_RAIDZ_R 2
126 #define VDEV_RAIDZ_MAXPARITY 3
/*
 * Single-byte GF(2^8) multiply by 2: shift left, and XOR in 0x1d (the
 * low bits of the reducing polynomial described in the header comment)
 * when the high bit would overflow.  Multiply by 4 is just two doublings.
 */
128 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
129 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
132 * We provide a mechanism to perform the field multiplication operation on a
133 * 64-bit value all at once rather than a byte at a time. This works by
134 * creating a mask from the top bit in each byte and using that to
135 * conditionally apply the XOR of 0x1d.
/*
 * Eight-bytes-at-a-time GF(2^8) multiply by 2 (see the block comment
 * above): build a mask of 0xff for each byte whose top bit is set
 * ((m << 1) - (m >> 7) turns 0x80 into 0xff), shift every byte left
 * while masking off cross-byte carries, then XOR 0x1d into the bytes
 * that overflowed.  NOTE(review): the brace/continuation lines that
 * delimit these macro bodies are elided from this extracted listing.
 */
137 #define VDEV_RAIDZ_64MUL_2(x, mask) \
139 (mask) = (x) & 0x8080808080808080ULL; \
140 (mask) = ((mask) << 1) - ((mask) >> 7); \
141 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
142 ((mask) & 0x1d1d1d1d1d1d1d1d); \
145 #define VDEV_RAIDZ_64MUL_4(x, mask) \
147 VDEV_RAIDZ_64MUL_2((x), mask); \
148 VDEV_RAIDZ_64MUL_2((x), mask); \
152 * Force reconstruction to use the general purpose method.
/*
 * Tunable: when nonzero, vdev_raidz_reconstruct() bypasses the
 * specialized P/Q/PQ routines and always uses the general
 * matrix-inversion reconstruction path.
 */
154 int vdev_raidz_default_to_general;
157 * These two tables represent powers and logs of 2 in the Galois field defined
158 * above. These values were computed by repeatedly multiplying by 2 as above.
/* vdev_raidz_pow2[i] == 2^i in GF(2^8); note entry 255 wraps to 0x01. */
160 static const uint8_t vdev_raidz_pow2[256] = {
161 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
162 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
163 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
164 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
165 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
166 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
167 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
168 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
169 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
170 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
171 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
172 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
173 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
174 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
175 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
176 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
177 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
178 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
179 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
180 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
181 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
182 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
183 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
184 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
185 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
186 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
187 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
188 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
189 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
190 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
191 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
192 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
/*
 * vdev_raidz_log2[a] == log base 2 of a in GF(2^8); the inverse of the
 * pow2 table.  Entry 0 is 0 as a placeholder — log of zero is undefined
 * (callers assert a != 0, see vdev_raidz_exp2).
 */
194 static const uint8_t vdev_raidz_log2[256] = {
195 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
196 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
197 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
198 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
199 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
200 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
201 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
202 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
203 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
204 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
205 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
206 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
207 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
208 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
209 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
210 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
211 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
212 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
213 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
214 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
215 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
216 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
217 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
218 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
219 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
220 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
221 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
222 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
223 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
224 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
225 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
226 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
230 * Multiply a given number by 2 raised to the given power.
/*
 * Return a * 2^exp in GF(2^8), computed via the log/pow tables:
 * log2(a) + exp indexes pow2[].  The assert rejects a == 0 (whose log
 * is undefined) except a == 1, whose log entry is legitimately 0.
 * NOTE(review): the a == 0 early-return and the modular reduction of
 * exp into [0, 255] are on lines elided from this extracted listing.
 */
233 vdev_raidz_exp2(uint_t a, int exp)
239 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
241 exp += vdev_raidz_log2[a];
245 return (vdev_raidz_pow2[exp]);
/*
 * zio->io_vsd_free callback: tear down the raidz_map_t attached to a
 * zio.  Only the parity columns (indices below rm_firstdatacol) own
 * their buffers — data columns alias zio->io_data — so only those are
 * freed before releasing the map itself, sized by its trailing
 * rm_col[] array.
 */
249 vdev_raidz_map_free(zio_t *zio)
251 raidz_map_t *rm = zio->io_vsd;
254 for (c = 0; c < rm->rm_firstdatacol; c++)
255 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
257 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
/*
 * Carve a logical zio into unit_shift-sized sectors and lay it out
 * across dcols children with nparity parity columns, returning the
 * resulting raidz_map_t.  b/s are the starting sector and sector count;
 * f is the first child touched, o the per-child starting offset.
 * q and r split the data sectors into full rows (q) plus a partial
 * row of r sectors; bc counts the columns in that partial row
 * (including its parity), and tot is the total sector count with
 * parity included.  NOTE(review): several lines (column/offset
 * stepping, size-class branches, returns) are elided from this
 * extracted listing.
 */
261 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
265 uint64_t b = zio->io_offset >> unit_shift;
266 uint64_t s = zio->io_size >> unit_shift;
267 uint64_t f = b % dcols;
268 uint64_t o = (b / dcols) << unit_shift;
269 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
271 q = s / (dcols - nparity);
272 r = s - q * (dcols - nparity);
273 bc = (r == 0 ? 0 : r + nparity);
274 tot = s + nparity * (q + (r == 0 ? 0 : 1));
278 scols = MIN(dcols, roundup(bc, nparity + 1));
284 ASSERT3U(acols, <=, scols);
286 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
289 rm->rm_scols = scols;
291 rm->rm_missingdata = 0;
292 rm->rm_missingparity = 0;
293 rm->rm_firstdatacol = nparity;
/* Initialize every column's placement and bookkeeping. */
297 for (c = 0; c < scols; c++) {
302 coff += 1ULL << unit_shift;
304 rm->rm_col[c].rc_devidx = col;
305 rm->rm_col[c].rc_offset = coff;
306 rm->rm_col[c].rc_data = NULL;
307 rm->rm_col[c].rc_error = 0;
308 rm->rm_col[c].rc_tried = 0;
309 rm->rm_col[c].rc_skipped = 0;
312 rm->rm_col[c].rc_size = 0;
314 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
316 rm->rm_col[c].rc_size = q << unit_shift;
318 asize += rm->rm_col[c].rc_size;
321 ASSERT3U(asize, ==, tot << unit_shift);
322 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
323 rm->rm_skipped = roundup(tot, nparity + 1) - tot;
324 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift);
325 ASSERT3U(rm->rm_skipped, <=, nparity);
/*
 * Parity columns get their own buffers; data columns carve up the
 * zio's own buffer back to back.
 */
327 for (c = 0; c < rm->rm_firstdatacol; c++)
328 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
330 rm->rm_col[c].rc_data = zio->io_data;
332 for (c = c + 1; c < acols; c++)
333 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
334 rm->rm_col[c - 1].rc_size;
337 * If all data stored spans all columns, there's a danger that parity
338 * will always be on the same device and, since parity isn't read
339 * during normal operation, that that device's I/O bandwidth won't be
340 * used effectively. We therefore switch the parity every 1MB.
342 * ... at least that was, ostensibly, the theory. As a practical
343 * matter unless we juggle the parity between all devices evenly, we
344 * won't see any benefit. Further, occasional writes that aren't a
345 * multiple of the LCM of the number of children and the minimum
346 * stripe width are sufficient to avoid pessimal behavior.
347 * Unfortunately, this decision created an implicit on-disk format
348 * requirement that we need to support for all eternity, but only
349 * for single-parity RAID-Z.
351 ASSERT(rm->rm_cols >= 2);
352 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
/* Single-parity only: swap P with the first data column on odd 1MB zones. */
354 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
355 devidx = rm->rm_col[0].rc_devidx;
356 o = rm->rm_col[0].rc_offset;
357 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
358 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
359 rm->rm_col[1].rc_devidx = devidx;
360 rm->rm_col[1].rc_offset = o;
364 zio->io_vsd_free = vdev_raidz_map_free;
/*
 * Compute single parity: P is the XOR of all data columns, processed a
 * uint64_t at a time.  The first data column initializes P; each later
 * (possibly shorter) column is XORed in.  NOTE(review): the loop bodies
 * performing the copy/XOR are elided from this extracted listing.
 */
369 vdev_raidz_generate_parity_p(raidz_map_t *rm)
371 uint64_t *p, *src, pcount, ccount, i;
374 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
376 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
377 src = rm->rm_col[c].rc_data;
378 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
379 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
381 if (c == rm->rm_firstdatacol) {
382 ASSERT(ccount == pcount);
383 for (i = 0; i < ccount; i++, src++, p++) {
387 ASSERT(ccount <= pcount);
388 for (i = 0; i < ccount; i++, src++, p++) {
/*
 * Compute double parity: P as in the single-parity case, and Q by
 * Horner's rule — multiply the running Q by 2 in GF(2^8) (eight bytes
 * at a time via VDEV_RAIDZ_64MUL_2) before folding in each new data
 * column.  Short columns are treated as zero-extended to full length,
 * which affects only Q.  NOTE(review): the XOR statements inside the
 * loops are elided from this extracted listing.
 */
396 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
398 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
401 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
402 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
403 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
405 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
406 src = rm->rm_col[c].rc_data;
407 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
408 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
410 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
412 if (c == rm->rm_firstdatacol) {
413 ASSERT(ccnt == pcnt || ccnt == 0);
414 for (i = 0; i < ccnt; i++, src++, p++, q++) {
418 for (; i < pcnt; i++, src++, p++, q++) {
423 ASSERT(ccnt <= pcnt);
426 * Apply the algorithm described above by multiplying
427 * the previous result and adding in the new value.
429 for (i = 0; i < ccnt; i++, src++, p++, q++) {
432 VDEV_RAIDZ_64MUL_2(*q, mask);
437 * Treat short columns as though they are full of 0s.
438 * Note that there's therefore nothing needed for P.
440 for (; i < pcnt; i++, q++) {
441 VDEV_RAIDZ_64MUL_2(*q, mask);
/*
 * Compute triple parity: P (plain XOR), Q (Horner with generator 2),
 * and R (Horner with generator 4, via VDEV_RAIDZ_64MUL_4).  Structure
 * mirrors vdev_raidz_generate_parity_pq with the extra R column; short
 * data columns are zero-extended, affecting only Q and R.
 * NOTE(review): the XOR statements inside the loops are elided from
 * this extracted listing.
 */
448 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
450 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
453 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
454 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
455 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
456 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
457 rm->rm_col[VDEV_RAIDZ_R].rc_size);
459 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
460 src = rm->rm_col[c].rc_data;
461 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
462 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
463 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
465 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
467 if (c == rm->rm_firstdatacol) {
468 ASSERT(ccnt == pcnt || ccnt == 0);
469 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
474 for (; i < pcnt; i++, src++, p++, q++, r++) {
480 ASSERT(ccnt <= pcnt);
483 * Apply the algorithm described above by multiplying
484 * the previous result and adding in the new value.
486 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
489 VDEV_RAIDZ_64MUL_2(*q, mask);
492 VDEV_RAIDZ_64MUL_4(*r, mask);
497 * Treat short columns as though they are full of 0s.
498 * Note that there's therefore nothing needed for P.
500 for (; i < pcnt; i++, q++, r++) {
501 VDEV_RAIDZ_64MUL_2(*q, mask);
502 VDEV_RAIDZ_64MUL_4(*r, mask);
509 * Generate RAID parity in the first virtual columns according to the number of
510 * parity columns available.
/*
 * Dispatch to the P / PQ / PQR generator based on the parity count
 * (rm_firstdatacol doubles as the number of parity columns); any other
 * value indicates a corrupted configuration and panics.
 * NOTE(review): the case labels and break statements are elided from
 * this extracted listing.
 */
513 vdev_raidz_generate_parity(raidz_map_t *rm)
515 switch (rm->rm_firstdatacol) {
517 vdev_raidz_generate_parity_p(rm);
520 vdev_raidz_generate_parity_pq(rm);
523 vdev_raidz_generate_parity_pqr(rm);
526 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
/*
 * Rebuild a single missing data column x from P parity: copy P into x,
 * then XOR every other data column back out.  Returns a bitmask naming
 * the parity used (1 << VDEV_RAIDZ_P).  NOTE(review): the extraction of
 * x from tgts, the skip of column x in the second loop, and the loop
 * bodies are elided from this listing.
 */
531 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
533 uint64_t *dst, *src, xcount, ccount, count, i;
538 ASSERT(x >= rm->rm_firstdatacol);
539 ASSERT(x < rm->rm_cols);
541 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
542 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
545 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
546 dst = rm->rm_col[x].rc_data;
547 for (i = 0; i < xcount; i++, dst++, src++) {
551 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
552 src = rm->rm_col[c].rc_data;
553 dst = rm->rm_col[x].rc_data;
558 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
559 count = MIN(ccount, xcount);
561 for (i = 0; i < count; i++, dst++, src++) {
566 return (1 << VDEV_RAIDZ_P);
/*
 * Rebuild a single missing data column x from Q parity: re-run the
 * Horner accumulation over the surviving data columns (treating x as
 * zero), XOR against Q, then multiply each byte by 2^exp — the inverse
 * of x's coefficient 2^(rm_cols - 1 - x) — to recover the original
 * data.  Returns (1 << VDEV_RAIDZ_Q).  NOTE(review): the extraction of
 * x from tgts and some loop bodies are elided from this listing.
 */
570 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
572 uint64_t *dst, *src, xcount, ccount, count, mask, i;
579 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
580 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
582 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
583 src = rm->rm_col[c].rc_data;
584 dst = rm->rm_col[x].rc_data;
589 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
591 count = MIN(ccount, xcount);
593 if (c == rm->rm_firstdatacol) {
594 for (i = 0; i < count; i++, dst++, src++) {
597 for (; i < xcount; i++, dst++) {
602 for (i = 0; i < count; i++, dst++, src++) {
603 VDEV_RAIDZ_64MUL_2(*dst, mask);
607 for (; i < xcount; i++, dst++) {
608 VDEV_RAIDZ_64MUL_2(*dst, mask);
/* Fold in Q and undo x's coefficient byte by byte. */
613 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
614 dst = rm->rm_col[x].rc_data;
615 exp = 255 - (rm->rm_cols - 1 - x);
617 for (i = 0; i < xcount; i++, dst++, src++) {
619 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
620 *b = vdev_raidz_exp2(*b, exp);
624 return (1 << VDEV_RAIDZ_Q);
/*
 * Rebuild two missing data columns x and y (x < y) from P and Q
 * together: compute Pxy/Qxy — parity as if x and y were zero — then
 * solve the 2x2 GF(2^8) system described in the comment below for D_x
 * and D_y.  Returns the bitmask of parities used (P | Q).
 * NOTE(review): the extraction of x and y from tgts and the temporary
 * zeroing of x/y during loop iterations are partially elided from this
 * listing.
 */
628 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
630 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
632 uint64_t xsize, ysize, i;
638 ASSERT(x >= rm->rm_firstdatacol);
639 ASSERT(y < rm->rm_cols);
641 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
644 * Move the parity data aside -- we're going to compute parity as
645 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
646 * reuse the parity generation mechanism without trashing the actual
647 * parity so we make those columns appear to be full of zeros by
648 * setting their lengths to zero.
650 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
651 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
652 xsize = rm->rm_col[x].rc_size;
653 ysize = rm->rm_col[y].rc_size;
655 rm->rm_col[VDEV_RAIDZ_P].rc_data =
656 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
657 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
658 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
659 rm->rm_col[x].rc_size = 0;
660 rm->rm_col[y].rc_size = 0;
662 vdev_raidz_generate_parity_pq(rm);
664 rm->rm_col[x].rc_size = xsize;
665 rm->rm_col[y].rc_size = ysize;
669 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
670 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
671 xd = rm->rm_col[x].rc_data;
672 yd = rm->rm_col[y].rc_data;
676 * Pxy = P + D_x + D_y
677 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
679 * We can then solve for D_x:
680 * D_x = A * (P + Pxy) + B * (Q + Qxy)
682 * A = 2^(x - y) * (2^(x - y) + 1)^-1
683 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
685 * With D_x in hand, we can easily solve for D_y:
686 * D_y = P + Pxy + D_x
689 a = vdev_raidz_pow2[255 + x - y];
690 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
691 tmp = 255 - vdev_raidz_log2[a ^ 1];
693 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
694 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
696 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
697 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
698 vdev_raidz_exp2(*q ^ *qxy, bexp);
701 *yd = *p ^ *pxy ^ *xd;
/* Free the scratch Pxy/Qxy buffers and put the real parity back. */
704 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
705 rm->rm_col[VDEV_RAIDZ_P].rc_size);
706 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
707 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
710 * Restore the saved parity data.
712 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
713 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
715 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
720 * In the general case of reconstruction, we must solve the system of linear
721 * equations defined by the coefficients used to generate parity as well as
722 * the contents of the data and parity disks. This can be expressed with
723 * vectors for the original data (D) and the actual data (d) and parity (p)
724 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
728 * | V | | D_0 | | p_m-1 |
729 * | | x | : | = | d_0 |
730 * | I | | D_n-1 | | : |
731 * | | ~~ ~~ | d_n-1 |
734 * I is simply a square identity matrix of size n, and V is a vandermonde
735 * matrix defined by the coefficients we chose for the various parity columns
736 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
737 * computation as well as linear separability.
740 * | 1 .. 1 1 1 | | p_0 |
741 * | 2^n-1 .. 4 2 1 | __ __ | : |
742 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
743 * | 1 .. 0 0 0 | | D_1 | | d_0 |
744 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
745 * | : : : : | | : | | d_2 |
746 * | 0 .. 1 0 0 | | D_n-1 | | : |
747 * | 0 .. 0 1 0 | ~~ ~~ | : |
748 * | 0 .. 0 0 1 | | d_n-1 |
751 * Note that I, V, d, and p are known. To compute D, we must invert the
752 * matrix and use the known data and parity values to reconstruct the unknown
753 * data values. We begin by removing the rows in V|I and d|p that correspond
754 * to failed or missing columns; we then make V|I square (n x n) and d|p
755 * sized n by removing rows corresponding to unused parity from the bottom up
756 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
757 * using Gauss-Jordan elimination. In the example below we use m=3 parity
758 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
760 * | 1 1 1 1 1 1 1 1 |
761 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
762 * | 19 205 116 29 64 16 4 1 | / /
763 * | 1 0 0 0 0 0 0 0 | / /
764 * | 0 1 0 0 0 0 0 0 | <--' /
765 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
766 * | 0 0 0 1 0 0 0 0 |
767 * | 0 0 0 0 1 0 0 0 |
768 * | 0 0 0 0 0 1 0 0 |
769 * | 0 0 0 0 0 0 1 0 |
770 * | 0 0 0 0 0 0 0 1 |
773 * | 1 1 1 1 1 1 1 1 |
774 * | 128 64 32 16 8 4 2 1 |
775 * | 19 205 116 29 64 16 4 1 |
776 * | 1 0 0 0 0 0 0 0 |
777 * | 0 1 0 0 0 0 0 0 |
778 * (V|I)' = | 0 0 1 0 0 0 0 0 |
779 * | 0 0 0 1 0 0 0 0 |
780 * | 0 0 0 0 1 0 0 0 |
781 * | 0 0 0 0 0 1 0 0 |
782 * | 0 0 0 0 0 0 1 0 |
783 * | 0 0 0 0 0 0 0 1 |
786 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
787 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
788 * matrix is not singular.
790 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
791 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
792 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
793 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
794 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
795 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
796 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
797 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
800 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
801 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
802 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
803 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
804 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
805 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
806 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
807 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
810 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
811 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
812 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
813 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
814 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
815 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
816 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
817 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
820 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
821 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
822 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
823 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
824 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
825 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
826 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
827 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
830 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
831 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
832 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
833 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
834 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
835 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
836 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
837 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
840 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
841 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
842 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
843 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
844 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
845 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
846 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
847 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
850 * | 0 0 1 0 0 0 0 0 |
851 * | 167 100 5 41 159 169 217 208 |
852 * | 166 100 4 40 158 168 216 209 |
853 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
854 * | 0 0 0 0 1 0 0 0 |
855 * | 0 0 0 0 0 1 0 0 |
856 * | 0 0 0 0 0 0 1 0 |
857 * | 0 0 0 0 0 0 0 1 |
860 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
861 * of the missing data.
863 * As is apparent from the example above, the only non-trivial rows in the
864 * inverse matrix correspond to the data disks that we're trying to
865 * reconstruct. Indeed, those are the only rows we need as the others would
866 * only be useful for reconstructing data known or assumed to be valid. For
867 * that reason, we only build the coefficients in the rows that correspond to
/*
 * Populate the rows of the (V|I)' matrix that correspond to the parity
 * columns listed in map[] (0 = P, 1 = Q, 2 = R): row entries are
 * GF(2^8) powers of that parity's generator, looked up in pow2[].
 * NOTE(review): the computation of 'pow' from map[i] and j is on lines
 * elided from this extracted listing.
 */
873 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
879 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
882 * Fill in the missing rows of interest.
884 for (i = 0; i < nmap; i++) {
885 ASSERT3S(0, <=, map[i]);
886 ASSERT3S(map[i], <=, 2);
893 for (j = 0; j < n; j++) {
897 rows[i][j] = vdev_raidz_pow2[pow];
/*
 * Gauss-Jordan elimination over GF(2^8): invert the nmissing rows of
 * interest in rows[], accumulating the inverse into invrows[].
 * Trivial identity rows (from surviving data columns) are subtracted
 * first, then each remaining row is normalized by its pivot and
 * eliminated from the other rows.  Asserts verify the final rows form
 * part of an identity matrix.  NOTE(review): several XOR/assignment
 * lines inside the elimination loops are elided from this listing.
 */
903 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
904 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
910 * Assert that the first nmissing entries from the array of used
911 * columns correspond to parity columns and that subsequent entries
912 * correspond to data columns.
914 for (i = 0; i < nmissing; i++) {
915 ASSERT3S(used[i], <, rm->rm_firstdatacol);
918 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
922 * First initialize the storage where we'll compute the inverse rows.
924 for (i = 0; i < nmissing; i++) {
925 for (j = 0; j < n; j++) {
926 invrows[i][j] = (i == j) ? 1 : 0;
931 * Subtract all trivial rows from the rows of consequence.
933 for (i = 0; i < nmissing; i++) {
934 for (j = nmissing; j < n; j++) {
935 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
936 jj = used[j] - rm->rm_firstdatacol;
938 invrows[i][j] = rows[i][jj];
944 * For each of the rows of interest, we must normalize it and subtract
945 * a multiple of it from the other rows.
947 for (i = 0; i < nmissing; i++) {
948 for (j = 0; j < missing[i]; j++) {
949 ASSERT3U(rows[i][j], ==, 0);
951 ASSERT3U(rows[i][missing[i]], !=, 0);
954 * Compute the inverse of the first element and multiply each
955 * element in the row by that value.
957 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
959 for (j = 0; j < n; j++) {
960 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
961 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
964 for (ii = 0; ii < nmissing; ii++) {
968 ASSERT3U(rows[ii][missing[i]], !=, 0);
970 log = vdev_raidz_log2[rows[ii][missing[i]]];
972 for (j = 0; j < n; j++) {
974 vdev_raidz_exp2(rows[i][j], log);
976 vdev_raidz_exp2(invrows[i][j], log);
982 * Verify that the data that is left in the rows are properly part of
983 * an identity matrix.
985 for (i = 0; i < nmissing; i++) {
986 for (j = 0; j < n; j++) {
987 if (j == missing[i]) {
988 ASSERT3U(rows[i][j], ==, 1);
990 ASSERT3U(rows[i][j], ==, 0);
997 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
998 int *missing, uint8_t **invrows, const uint8_t *used)
1003 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1004 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1007 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1011 psize = sizeof (invlog[0][0]) * n * nmissing;
1012 p = kmem_alloc(psize, KM_SLEEP);
1014 for (pp = p, i = 0; i < nmissing; i++) {
1019 for (i = 0; i < nmissing; i++) {
1020 for (j = 0; j < n; j++) {
1021 ASSERT3U(invrows[i][j], !=, 0);
1022 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1026 for (i = 0; i < n; i++) {
1028 ASSERT3U(c, <, rm->rm_cols);
1030 src = rm->rm_col[c].rc_data;
1031 ccount = rm->rm_col[c].rc_size;
1032 for (j = 0; j < nmissing; j++) {
1033 cc = missing[j] + rm->rm_firstdatacol;
1034 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1035 ASSERT3U(cc, <, rm->rm_cols);
1036 ASSERT3U(cc, !=, c);
1038 dst[j] = rm->rm_col[cc].rc_data;
1039 dcount[j] = rm->rm_col[cc].rc_size;
1042 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1044 for (x = 0; x < ccount; x++, src++) {
1046 log = vdev_raidz_log2[*src];
1048 for (cc = 0; cc < nmissing; cc++) {
1049 if (x >= dcount[cc])
1055 if ((ll = log + invlog[cc][i]) >= 255)
1057 val = vdev_raidz_pow2[ll];
1068 kmem_free(p, psize);
/*
 * General reconstruction: identify the missing data columns, pick
 * enough surviving parity columns to cover them (skipping any parity
 * that is itself a target), build and invert the corresponding matrix
 * rows, and regenerate the missing data.  Returns a bitmask of the
 * parity columns used.  Scratch row/used storage is carved out of a
 * single kmem allocation.  NOTE(review): the parity-selection loop
 * body, the row-pointer setup, and the return of 'code' are on lines
 * elided from this extracted listing.
 */
1072 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1076 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1077 int parity_map[VDEV_RAIDZ_MAXPARITY];
1082 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1083 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1089 n = rm->rm_cols - rm->rm_firstdatacol;
1092 * Figure out which data columns are missing.
1095 for (t = 0; t < ntgts; t++) {
1096 if (tgts[t] >= rm->rm_firstdatacol) {
1097 missing_rows[nmissing_rows++] =
1098 tgts[t] - rm->rm_firstdatacol;
1103 * Figure out which parity columns to use to help generate the missing
1106 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1108 ASSERT(c < rm->rm_firstdatacol);
1111 * Skip any targeted parity columns.
1113 if (c == tgts[tt]) {
1125 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
/* One allocation holds the matrix rows, inverse rows, and used[] list. */
1127 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1128 nmissing_rows * n + sizeof (used[0]) * n;
1129 p = kmem_alloc(psize, KM_SLEEP);
1131 for (pp = p, i = 0; i < nmissing_rows; i++) {
1139 for (i = 0; i < nmissing_rows; i++) {
1140 used[i] = parity_map[i];
1143 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1144 if (tt < nmissing_rows &&
1145 c == missing_rows[tt] + rm->rm_firstdatacol) {
1156 * Initialize the interesting rows of the matrix.
1158 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1161 * Invert the matrix.
1163 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1167 * Reconstruct the missing data using the generated matrix.
1169 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1172 kmem_free(p, psize);
/*
 * Top-level reconstruction entry point.  Classifies each column as
 * bad parity, bad data, or valid (recording which parity columns are
 * usable), builds the full sorted target list in tgts[] (parity
 * targets first, data targets at &tgts[nbadparity]), then dispatches:
 * the specialized P / Q / PQ routines when applicable and not
 * overridden by vdev_raidz_default_to_general, otherwise the general
 * matrix method.  Returns the bitmask of parity columns used.
 * NOTE(review): the tgts[] fill, the switch scaffolding around the
 * specialized dispatch, and the final return are on lines elided from
 * this extracted listing.
 */
1178 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1180 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1184 int nbadparity, nbaddata;
1185 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1188 * The tgts list must already be sorted.
1190 for (i = 1; i < nt; i++) {
1191 ASSERT(t[i] > t[i - 1]);
1194 nbadparity = rm->rm_firstdatacol;
1195 nbaddata = rm->rm_cols - nbadparity;
1197 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1198 if (c < rm->rm_firstdatacol)
1199 parity_valid[c] = B_FALSE;
1201 if (i < nt && c == t[i]) {
1204 } else if (rm->rm_col[c].rc_error != 0) {
1206 } else if (c >= rm->rm_firstdatacol) {
1209 parity_valid[c] = B_TRUE;
1214 ASSERT(ntgts >= nt);
1215 ASSERT(nbaddata >= 0);
1216 ASSERT(nbaddata + nbadparity == ntgts);
1218 dt = &tgts[nbadparity];
1221 * See if we can use any of our optimized reconstruction routines.
1223 if (!vdev_raidz_default_to_general) {
1226 if (parity_valid[VDEV_RAIDZ_P])
1227 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1229 ASSERT(rm->rm_firstdatacol > 1);
1231 if (parity_valid[VDEV_RAIDZ_Q])
1232 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1234 ASSERT(rm->rm_firstdatacol > 2);
1238 ASSERT(rm->rm_firstdatacol > 1);
1240 if (parity_valid[VDEV_RAIDZ_P] &&
1241 parity_valid[VDEV_RAIDZ_Q])
1242 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1244 ASSERT(rm->rm_firstdatacol > 2);
1250 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1251 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
/*
 * Open entry point for a RAID-Z vdev.  Validates the parity configuration,
 * opens all children, and derives the aggregate asize/ashift from them.
 * (Excerpt is sampled; error-return lines are elided.)
 */
1257 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1260 uint64_t nparity = vd->vdev_nparity;
1265 ASSERT(nparity > 0);
/* Reject impossible geometry: too much parity, or fewer children than
 * parity + 1 (need at least one data column). */
1267 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1268 vd->vdev_children < nparity + 1) {
1269 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1273 vdev_open_children(vd);
1275 for (c = 0; c < vd->vdev_children; c++) {
1276 cvd = vd->vdev_child[c];
1278 if (cvd->vdev_open_error != 0) {
1279 lasterror = cvd->vdev_open_error;
/* Aggregate size is limited by the smallest child; the -1/+1 dance keeps
 * MIN well-defined when *asize starts at its initial value. */
1284 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
/* Sector size of the raidz vdev is the largest of any child's. */
1285 *ashift = MAX(*ashift, cvd->vdev_ashift);
1288 *asize *= vd->vdev_children;
/* More failed children than parity columns: the vdev cannot operate. */
1290 if (numerrors > nparity) {
1291 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
/*
 * Close entry point: simply close every child vdev.
 */
1299 vdev_raidz_close(vdev_t *vd)
1303 for (c = 0; c < vd->vdev_children; c++)
1304 vdev_close(vd->vdev_child[c]);
/*
 * Convert a logical (psize) byte count into the allocated size on a RAID-Z
 * vdev: round psize up to whole sectors, add the parity sectors required
 * for that many data sectors, then round the total up to a multiple of
 * (nparity + 1) sectors and convert back to bytes.
 */
1308 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1311 uint64_t ashift = vd->vdev_top->vdev_ashift;
1312 uint64_t cols = vd->vdev_children;
1313 uint64_t nparity = vd->vdev_nparity;
/* Number of data sectors (ceiling division by sector size). */
1315 asize = ((psize - 1) >> ashift) + 1;
/* nparity parity sectors per full stripe of (cols - nparity) data sectors. */
1316 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1317 asize = roundup(asize, nparity + 1) << ashift;
/*
 * Completion callback for a child I/O: record the child's error status in
 * the raidz column (io_private) it serviced.
 */
1323 vdev_raidz_child_done(zio_t *zio)
1325 raidz_col_t *rc = zio->io_private;
1327 rc->rc_error = zio->io_error;
/*
 * I/O start entry point.  Maps the zio onto raidz columns, then:
 *  - WRITE: generate parity and issue one child write per column, plus
 *    optional no-data writes for skipped sectors to help aggregation;
 *  - READ: issue child reads, skipping parity that isn't needed and
 *    accounting for unreadable or stale (DTL-missing) children.
 *
 * NOTE(review): excerpt is sampled; `continue`/`break`/closing-brace lines
 * between the visible statements are elided.
 */
1333 vdev_raidz_io_start(zio_t *zio)
1335 vdev_t *vd = zio->io_vd;
1336 vdev_t *tvd = vd->vdev_top;
1338 blkptr_t *bp = zio->io_bp;
1343 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1346 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1348 if (zio->io_type == ZIO_TYPE_WRITE) {
1349 vdev_raidz_generate_parity(rm);
/* One child write per column (parity and data alike). */
1351 for (c = 0; c < rm->rm_cols; c++) {
1352 rc = &rm->rm_col[c];
1353 cvd = vd->vdev_child[rc->rc_devidx];
1354 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1355 rc->rc_offset, rc->rc_data, rc->rc_size,
1356 zio->io_type, zio->io_priority, 0,
1357 vdev_raidz_child_done, rc));
1361 * Generate optional I/Os for any skipped sectors to improve
1362 * aggregation contiguity.
1364 for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) {
1365 ASSERT(c <= rm->rm_scols);
1366 if (c == rm->rm_scols)
1368 rc = &rm->rm_col[c];
1369 cvd = vd->vdev_child[rc->rc_devidx];
/* NULL data + NODATA|OPTIONAL: a one-sector placeholder write the
 * aggregation code may merge with neighbors but is free to drop. */
1370 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1371 rc->rc_offset + rc->rc_size, NULL,
1372 1 << tvd->vdev_ashift,
1373 zio->io_type, zio->io_priority,
1374 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1377 return (ZIO_PIPELINE_CONTINUE);
1380 ASSERT(zio->io_type == ZIO_TYPE_READ);
1383 * Iterate over the columns in reverse order so that we hit the parity
1384 * last -- any errors along the way will force us to read the parity.
1386 for (c = rm->rm_cols - 1; c >= 0; c--) {
1387 rc = &rm->rm_col[c];
1388 cvd = vd->vdev_child[rc->rc_devidx];
/* Child is unreadable: mark the column failed without issuing I/O. */
1389 if (!vdev_readable(cvd)) {
1390 if (c >= rm->rm_firstdatacol)
1391 rm->rm_missingdata++;
1393 rm->rm_missingparity++;
1394 rc->rc_error = ENXIO;
1395 rc->rc_tried = 1; /* don't even try */
/* Child may be missing the block (dirty time log says the birth txg
 * is in a gap): treat its copy as stale. */
1399 if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
1400 if (c >= rm->rm_firstdatacol)
1401 rm->rm_missingdata++;
1403 rm->rm_missingparity++;
1404 rc->rc_error = ESTALE;
/* Read data columns always; read parity only if data is already known
 * to be missing or this is a scrub/resilver (which reads everything). */
1408 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1409 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1410 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1411 rc->rc_offset, rc->rc_data, rc->rc_size,
1412 zio->io_type, zio->io_priority, 0,
1413 vdev_raidz_child_done, rc));
1417 return (ZIO_PIPELINE_CONTINUE);
1421 * Report a checksum error for a child of a RAID-Z device.
1424 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
1426 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
/* Speculative reads don't affect error counters or generate ereports. */
1428 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1429 mutex_enter(&vd->vdev_stat_lock);
1430 vd->vdev_stat.vs_checksum_errors++;
1431 mutex_exit(&vd->vdev_stat_lock);
/* Post a fault-management checksum ereport against the child vdev. */
1434 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
1435 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1436 zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
1440 * Generate the parity from the data columns. If we tried and were able to
1441 * read the parity without error, verify that the generated parity matches the
1442 * data we read. If it doesn't, we fire off a checksum error. Return the
1443 * number such failures.
1446 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1448 void *orig[VDEV_RAIDZ_MAXPARITY];
/* Snapshot the parity we read from disk before regenerating over it. */
1452 for (c = 0; c < rm->rm_firstdatacol; c++) {
1453 rc = &rm->rm_col[c];
1454 if (!rc->rc_tried || rc->rc_error != 0)
1456 orig[c] = zio_buf_alloc(rc->rc_size);
1457 bcopy(rc->rc_data, orig[c], rc->rc_size);
/* Recompute parity in place from the (now presumed-good) data columns. */
1460 vdev_raidz_generate_parity(rm);
1462 for (c = 0; c < rm->rm_firstdatacol; c++) {
1463 rc = &rm->rm_col[c];
1464 if (!rc->rc_tried || rc->rc_error != 0)
/* On-disk parity disagrees with regenerated parity: flag the column. */
1466 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1467 raidz_checksum_error(zio, rc);
1468 rc->rc_error = ECKSUM;
1471 zio_buf_free(orig[c], rc->rc_size);
1478 * Keep statistics on all the ways that we used parity to correct data.
/* Indexed by the parity-usage bitmask returned by vdev_raidz_reconstruct()
 * (always < 1 << VDEV_RAIDZ_MAXPARITY); bumped atomically on success. */
1480 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
/*
 * Fold every column's rc_error through zio_worst_error() to produce the
 * single most severe error seen across the map.
 */
1483 vdev_raidz_worst_error(raidz_map_t *rm)
1487 for (int c = 0; c < rm->rm_cols; c++)
1488 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1494 * Iterate over all combinations of bad data and attempt a reconstruction.
1495 * Note that the algorithm below is non-optimal because it doesn't take into
1496 * account how reconstruction is actually performed. For example, with
1497 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1498 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1499 * cases we'd only use parity information in column 0.
/* NOTE(review): sampled excerpt -- loop-control and cleanup lines are
 * elided between the statements shown below. */
1502 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1504 raidz_map_t *rm = zio->io_vsd;
1506 void *orig[VDEV_RAIDZ_MAXPARITY];
/* tstore has a sentinel slot on each side of tgts (hence +2, and tgts
 * pointing one past the start). */
1507 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1508 int *tgts = &tstore[1];
1509 int current, next, i, c, n;
1512 ASSERT(total_errors < rm->rm_firstdatacol);
1515 * This simplifies one edge condition.
/* Try combinations of n = 1 .. (spare parity) simultaneous bad columns. */
1519 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1521 * Initialize the targets array by finding the first n columns
1522 * that contain no error.
1524 * If there were no data errors, we need to ensure that we're
1525 * always explicitly attempting to reconstruct at least one
1526 * data column. To do this, we simply push the highest target
1527 * up into the data columns.
1529 for (c = 0, i = 0; i < n; i++) {
1530 if (i == n - 1 && data_errors == 0 &&
1531 c < rm->rm_firstdatacol) {
1532 c = rm->rm_firstdatacol;
1535 while (rm->rm_col[c].rc_error != 0) {
1537 ASSERT3S(c, <, rm->rm_cols);
1544 * Setting tgts[n] simplifies the other edge condition.
1546 tgts[n] = rm->rm_cols;
1549 * These buffers were allocated in previous iterations.
1551 for (i = 0; i < n - 1; i++) {
1552 ASSERT(orig[i] != NULL);
/* All columns share rc_size (column 0's size) for these save buffers --
 * presumably true for the columns targeted here; TODO confirm. */
1555 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1558 next = tgts[current];
/* Enumerate every n-combination of valid columns as candidate targets. */
1560 while (current != n) {
1561 tgts[current] = next;
1565 * Save off the original data that we're going to
1566 * attempt to reconstruct.
1568 for (i = 0; i < n; i++) {
1569 ASSERT(orig[i] != NULL);
1572 ASSERT3S(c, <, rm->rm_cols);
1573 rc = &rm->rm_col[c];
1574 bcopy(rc->rc_data, orig[i], rc->rc_size);
1578 * Attempt a reconstruction and exit the outer loop on
1581 code = vdev_raidz_reconstruct(rm, tgts, n);
1582 if (zio_checksum_error(zio) == 0) {
1583 atomic_inc_64(&raidz_corrected[code]);
/* Success: any target column that previously had no recorded error
 * must have silently returned bad data -- report it. */
1585 for (i = 0; i < n; i++) {
1587 rc = &rm->rm_col[c];
1588 ASSERT(rc->rc_error == 0);
1590 raidz_checksum_error(zio, rc);
1591 rc->rc_error = ECKSUM;
1599 * Restore the original data.
1601 for (i = 0; i < n; i++) {
1603 rc = &rm->rm_col[c];
1604 bcopy(orig[i], rc->rc_data, rc->rc_size);
1609 * Find the next valid column after the current
1612 for (next = tgts[current] + 1;
1613 next < rm->rm_cols &&
1614 rm->rm_col[next].rc_error != 0; next++)
1617 ASSERT(next <= tgts[current + 1]);
1620 * If that spot is available, we're done here.
1622 if (next != tgts[current + 1])
1626 * Otherwise, find the next valid column after
1627 * the previous position.
1629 for (c = tgts[current - 1] + 1;
1630 rm->rm_col[c].rc_error != 0; c++)
1636 } while (current != n);
/* Free the per-combination save buffers allocated above. */
1641 for (i = 0; i < n; i++) {
1642 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
/*
 * I/O completion entry point.  For writes, tolerate up to nparity column
 * failures.  For reads, run the three escalating recovery phases described
 * in the comment below, then (if anything was repaired or we're resilvering)
 * rewrite good data over damaged children.
 *
 * NOTE(review): sampled excerpt -- `continue`/`break`/`goto done`/closing
 * braces between the visible statements are elided.
 */
1649 vdev_raidz_io_done(zio_t *zio)
1651 vdev_t *vd = zio->io_vd;
1653 raidz_map_t *rm = zio->io_vsd;
1655 int unexpected_errors = 0;
1656 int parity_errors = 0;
1657 int parity_untried = 0;
1658 int data_errors = 0;
1659 int total_errors = 0;
1661 int tgts[VDEV_RAIDZ_MAXPARITY];
1664 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1666 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1667 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
/* Tally per-column outcomes into the error counters above. */
1669 for (c = 0; c < rm->rm_cols; c++) {
1670 rc = &rm->rm_col[c];
1673 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1675 if (c < rm->rm_firstdatacol)
/* An error on a column we didn't deliberately skip is "unexpected". */
1680 if (!rc->rc_skipped)
1681 unexpected_errors++;
/* Parity column that was never read (read path skips unneeded parity). */
1684 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1689 if (zio->io_type == ZIO_TYPE_WRITE) {
1691 * XXX -- for now, treat partial writes as a success.
1692 * (If we couldn't write enough columns to reconstruct
1693 * the data, the I/O failed. Otherwise, good enough.)
1695 * Now that we support write reallocation, it would be better
1696 * to treat partial failure as real failure unless there are
1697 * no non-degraded top-level vdevs left, and not update DTLs
1698 * if we intend to reallocate.
1701 if (total_errors > rm->rm_firstdatacol)
1702 zio->io_error = vdev_raidz_worst_error(rm);
1707 ASSERT(zio->io_type == ZIO_TYPE_READ);
1709 * There are three potential phases for a read:
1710 * 1. produce valid data from the columns read
1711 * 2. read all disks and try again
1712 * 3. perform combinatorial reconstruction
1714 * Each phase is progressively both more expensive and less likely to
1715 * occur. If we encounter more errors than we can repair or all phases
1716 * fail, we have no choice but to return an error.
1720 * If the number of errors we saw was correctable -- less than or equal
1721 * to the number of parity disks read -- attempt to produce data that
1722 * has a valid checksum. Naturally, this case applies in the absence of
/* ---- Phase 1: try to validate/reconstruct from what we already read. */
1725 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1726 if (data_errors == 0) {
1727 if (zio_checksum_error(zio) == 0) {
1729 * If we read parity information (unnecessarily
1730 * as it happens since no reconstruction was
1731 * needed) regenerate and verify the parity.
1732 * We also regenerate parity when resilvering
1733 * so we can write it out to the failed device
1736 if (parity_errors + parity_untried <
1737 rm->rm_firstdatacol ||
1738 (zio->io_flags & ZIO_FLAG_RESILVER)) {
1739 n = raidz_parity_verify(zio, rm);
1740 unexpected_errors += n;
1741 ASSERT(parity_errors + n <=
1742 rm->rm_firstdatacol);
1748 * We either attempt to read all the parity columns or
1749 * none of them. If we didn't try to read parity, we
1750 * wouldn't be here in the correctable case. There must
1751 * also have been fewer parity errors than parity
1752 * columns or, again, we wouldn't be in this code path.
1754 ASSERT(parity_untried == 0);
1755 ASSERT(parity_errors < rm->rm_firstdatacol);
1758 * Identify the data columns that reported an error.
1761 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1762 rc = &rm->rm_col[c];
1763 if (rc->rc_error != 0) {
1764 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1769 ASSERT(rm->rm_firstdatacol >= n);
1771 code = vdev_raidz_reconstruct(rm, tgts, n);
1773 if (zio_checksum_error(zio) == 0) {
1774 atomic_inc_64(&raidz_corrected[code]);
1777 * If we read more parity disks than were used
1778 * for reconstruction, confirm that the other
1779 * parity disks produced correct data. This
1780 * routine is suboptimal in that it regenerates
1781 * the parity that we already used in addition
1782 * to the parity that we're attempting to
1783 * verify, but this should be a relatively
1784 * uncommon case, and can be optimized if it
1785 * becomes a problem. Note that we regenerate
1786 * parity when resilvering so we can write it
1787 * out to failed devices later.
1789 if (parity_errors < rm->rm_firstdatacol - n ||
1790 (zio->io_flags & ZIO_FLAG_RESILVER)) {
1791 n = raidz_parity_verify(zio, rm);
1792 unexpected_errors += n;
1793 ASSERT(parity_errors + n <=
1794 rm->rm_firstdatacol);
1803 * This isn't a typical situation -- either we got a read error or
1804 * a child silently returned bad data. Read every block so we can
1805 * try again with as much data and parity as we can track down. If
1806 * we've already been through once before, all children will be marked
1807 * as tried so we'll proceed to combinatorial reconstruction.
/* ---- Phase 2: re-read every untried column and retry the pipeline. */
1809 unexpected_errors = 1;
1810 rm->rm_missingdata = 0;
1811 rm->rm_missingparity = 0;
1813 for (c = 0; c < rm->rm_cols; c++) {
1814 if (rm->rm_col[c].rc_tried)
/* At least one column was untried: re-queue this zio stage and issue
 * reads for every remaining untried column. */
1817 zio_vdev_io_redone(zio);
1819 rc = &rm->rm_col[c];
1822 zio_nowait(zio_vdev_child_io(zio, NULL,
1823 vd->vdev_child[rc->rc_devidx],
1824 rc->rc_offset, rc->rc_data, rc->rc_size,
1825 zio->io_type, zio->io_priority, 0,
1826 vdev_raidz_child_done, rc));
1827 } while (++c < rm->rm_cols);
1833 * At this point we've attempted to reconstruct the data given the
1834 * errors we detected, and we've attempted to read all columns. There
1835 * must, therefore, be one or more additional problems -- silent errors
1836 * resulting in invalid data rather than explicit I/O errors resulting
1837 * in absent data. We check if there is enough additional data to
1838 * possibly reconstruct the data and then perform combinatorial
1839 * reconstruction over all possible combinations. If that fails,
/* ---- Phase 3: combinatorial reconstruction, or give up. */
1842 if (total_errors >= rm->rm_firstdatacol) {
1843 zio->io_error = vdev_raidz_worst_error(rm);
1845 * If there were exactly as many device errors as parity
1846 * columns, yet we couldn't reconstruct the data, then at
1847 * least one device must have returned bad data silently.
1849 if (total_errors == rm->rm_firstdatacol)
1850 zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
1852 } else if ((code = vdev_raidz_combrec(zio, total_errors,
1853 data_errors)) != 0) {
1855 * If we didn't use all the available parity for the
1856 * combinatorial reconstruction, verify that the remaining
1857 * parity is correct.
1859 if (code != (1 << rm->rm_firstdatacol) - 1)
1860 (void) raidz_parity_verify(zio, rm);
1863 * All combinations failed to checksum. Generate checksum
1864 * ereports for all children.
1866 zio->io_error = ECKSUM;
1868 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1869 for (c = 0; c < rm->rm_cols; c++) {
1870 rc = &rm->rm_col[c];
1871 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1872 zio->io_spa, vd->vdev_child[rc->rc_devidx],
1873 zio, rc->rc_offset, rc->rc_size);
1879 zio_checksum_verified(zio);
/* Self-healing: rewrite known-good column data over children that
 * reported errors (only if the pool is writeable). */
1881 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
1882 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1884 * Use the good data we have in hand to repair damaged children.
1886 for (c = 0; c < rm->rm_cols; c++) {
1887 rc = &rm->rm_col[c];
1888 cvd = vd->vdev_child[rc->rc_devidx];
1890 if (rc->rc_error == 0)
1893 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1894 rc->rc_offset, rc->rc_data, rc->rc_size,
1895 ZIO_TYPE_WRITE, zio->io_priority,
1896 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
1897 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
/*
 * State-change callback: the raidz vdev can't open if more children are
 * faulted than it has parity; it is degraded if any child is faulted or
 * degraded; otherwise it is healthy.
 */
1903 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1905 if (faulted > vd->vdev_nparity)
1906 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1907 VDEV_AUX_NO_REPLICAS);
1908 else if (degraded + faulted != 0)
1909 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1911 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1914 vdev_ops_t vdev_raidz_ops = {
1918 vdev_raidz_io_start,
1920 vdev_raidz_state_change,
1921 VDEV_TYPE_RAIDZ, /* name of this vdev type */
1922 B_FALSE /* not a leaf vdev */