md: remove 'go_faster' option from ->sync_request()
[sfrench/cifs-2.6.git] / drivers / md / raid5.c
1 /*
2  * raid5.c : Multiple Devices driver for Linux
3  *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4  *         Copyright (C) 1999, 2000 Ingo Molnar
5  *         Copyright (C) 2002, 2003 H. Peter Anvin
6  *
7  * RAID-4/5/6 management functions.
8  * Thanks to Penguin Computing for making the RAID-6 development possible
9  * by donating a test server!
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2, or (at your option)
14  * any later version.
15  *
16  * You should have received a copy of the GNU General Public License
17  * (for example /usr/src/linux/COPYING); if not, write to the Free
18  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20
21 /*
22  * BITMAP UNPLUGGING:
23  *
24  * The sequencing for updating the bitmap reliably is a little
25  * subtle (and I got it wrong the first time) so it deserves some
26  * explanation.
27  *
28  * We group bitmap updates into batches.  Each batch has a number.
29  * We may write out several batches at once, but that isn't very important.
30  * conf->seq_write is the number of the last batch successfully written.
31  * conf->seq_flush is the number of the last batch that was closed to
32  *    new additions.
33  * When we discover that we will need to write to any block in a stripe
34  * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35  * the number of the batch it will be in. This is seq_flush+1.
36  * When we are ready to do a write, if that batch hasn't been written yet,
37  *   we plug the array and queue the stripe for later.
38  * When an unplug happens, we increment conf->seq_flush, thus closing the
39  *   current batch.
40  * When we notice that conf->seq_flush > conf->seq_write, we write out all
41  * pending updates to the bitmap, and advance conf->seq_write to where
41  * conf->seq_flush was.
42  * This may occasionally write a bit out twice, but is sure never to
43  * miss any bits.
44  */
45
46 #include <linux/blkdev.h>
47 #include <linux/kthread.h>
48 #include <linux/raid/pq.h>
49 #include <linux/async_tx.h>
50 #include <linux/module.h>
51 #include <linux/async.h>
52 #include <linux/seq_file.h>
53 #include <linux/cpu.h>
54 #include <linux/slab.h>
55 #include <linux/ratelimit.h>
56 #include <linux/nodemask.h>
57 #include <trace/events/block.h>
58
59 #include "md.h"
60 #include "raid5.h"
61 #include "raid0.h"
62 #include "bitmap.h"
63
64 #define cpu_to_group(cpu) cpu_to_node(cpu)
65 #define ANY_GROUP NUMA_NO_NODE
66
67 static bool devices_handle_discard_safely = false;
68 module_param(devices_handle_discard_safely, bool, 0644);
69 MODULE_PARM_DESC(devices_handle_discard_safely,
70                  "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
71 static struct workqueue_struct *raid5_wq;
72 /*
73  * Stripe cache
74  */
75
76 #define NR_STRIPES              256
77 #define STRIPE_SIZE             PAGE_SIZE
78 #define STRIPE_SHIFT            (PAGE_SHIFT - 9)
79 #define STRIPE_SECTORS          (STRIPE_SIZE>>9)
80 #define IO_THRESHOLD            1
81 #define BYPASS_THRESHOLD        1
82 #define NR_HASH                 (PAGE_SIZE / sizeof(struct hlist_head))
83 #define HASH_MASK               (NR_HASH - 1)
84 #define MAX_STRIPE_BATCH        8
85
86 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
87 {
88         int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
89         return &conf->stripe_hashtbl[hash];
90 }
91
92 static inline int stripe_hash_locks_hash(sector_t sect)
93 {
94         return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
95 }
96
97 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
98 {
99         spin_lock_irq(conf->hash_locks + hash);
100         spin_lock(&conf->device_lock);
101 }
102
103 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
104 {
105         spin_unlock(&conf->device_lock);
106         spin_unlock_irq(conf->hash_locks + hash);
107 }
108
109 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
110 {
111         int i;
112         local_irq_disable();
113         spin_lock(conf->hash_locks);
114         for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
115                 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
116         spin_lock(&conf->device_lock);
117 }
118
119 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
120 {
121         int i;
122         spin_unlock(&conf->device_lock);
123         for (i = NR_STRIPE_HASH_LOCKS; i; i--)
124                 spin_unlock(conf->hash_locks + i - 1);
125         local_irq_enable();
126 }
127
128 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
129  * order without overlap.  There may be several bio's per stripe+device, and
130  * a bio could span several devices.
131  * When walking this list for a particular stripe+device, we must never proceed
132  * beyond a bio that extends past this device, as the next bio might no longer
133  * be valid.
134  * This function is used to determine the 'next' bio in the list, given the sector
135  * of the current stripe+device
136  */
137 static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
138 {
139         int sectors = bio_sectors(bio);
140         if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
141                 return bio->bi_next;
142         else
143                 return NULL;
144 }
145
146 /*
147  * We maintain a biased count of active stripes in the bottom 16 bits of
148  * bi_phys_segments, and a count of processed stripes in the upper 16 bits
149  */
150 static inline int raid5_bi_processed_stripes(struct bio *bio)
151 {
152         atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
153         return (atomic_read(segments) >> 16) & 0xffff;
154 }
155
156 static inline int raid5_dec_bi_active_stripes(struct bio *bio)
157 {
158         atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
159         return atomic_sub_return(1, segments) & 0xffff;
160 }
161
162 static inline void raid5_inc_bi_active_stripes(struct bio *bio)
163 {
164         atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
165         atomic_inc(segments);
166 }
167
168 static inline void raid5_set_bi_processed_stripes(struct bio *bio,
169         unsigned int cnt)
170 {
171         atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
172         int old, new;
173
174         do {
175                 old = atomic_read(segments);
176                 new = (old & 0xffff) | (cnt << 16);
177         } while (atomic_cmpxchg(segments, old, new) != old);
178 }
179
180 static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
181 {
182         atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
183         atomic_set(segments, cnt);
184 }
185
186 /* Find first data disk in a raid6 stripe */
187 static inline int raid6_d0(struct stripe_head *sh)
188 {
189         if (sh->ddf_layout)
190                 /* ddf always start from first device */
191                 return 0;
192         /* md starts just after Q block */
193         if (sh->qd_idx == sh->disks - 1)
194                 return 0;
195         else
196                 return sh->qd_idx + 1;
197 }
/* Return the disk index following @disk, wrapping to 0 at @raid_disks. */
static inline int raid6_next_disk(int disk, int raid_disks)
{
	int next = disk + 1;

	if (next >= raid_disks)
		return 0;
	return next;
}
203
204 /* When walking through the disks in a raid5, starting at raid6_d0,
205  * We need to map each disk to a 'slot', where the data disks are slot
206  * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
207  * is raid_disks-1.  This help does that mapping.
208  */
209 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
210                              int *count, int syndrome_disks)
211 {
212         int slot = *count;
213
214         if (sh->ddf_layout)
215                 (*count)++;
216         if (idx == sh->pd_idx)
217                 return syndrome_disks;
218         if (idx == sh->qd_idx)
219                 return syndrome_disks + 1;
220         if (!sh->ddf_layout)
221                 (*count)++;
222         return slot;
223 }
224
225 static void return_io(struct bio *return_bi)
226 {
227         struct bio *bi = return_bi;
228         while (bi) {
229
230                 return_bi = bi->bi_next;
231                 bi->bi_next = NULL;
232                 bi->bi_iter.bi_size = 0;
233                 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
234                                          bi, 0);
235                 bio_endio(bi, 0);
236                 bi = return_bi;
237         }
238 }
239
240 static void print_raid5_conf (struct r5conf *conf);
241
242 static int stripe_operations_active(struct stripe_head *sh)
243 {
244         return sh->check_state || sh->reconstruct_state ||
245                test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
246                test_bit(STRIPE_COMPUTE_RUN, &sh->state);
247 }
248
249 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
250 {
251         struct r5conf *conf = sh->raid_conf;
252         struct r5worker_group *group;
253         int thread_cnt;
254         int i, cpu = sh->cpu;
255
256         if (!cpu_online(cpu)) {
257                 cpu = cpumask_any(cpu_online_mask);
258                 sh->cpu = cpu;
259         }
260
261         if (list_empty(&sh->lru)) {
262                 struct r5worker_group *group;
263                 group = conf->worker_groups + cpu_to_group(cpu);
264                 list_add_tail(&sh->lru, &group->handle_list);
265                 group->stripes_cnt++;
266                 sh->group = group;
267         }
268
269         if (conf->worker_cnt_per_group == 0) {
270                 md_wakeup_thread(conf->mddev->thread);
271                 return;
272         }
273
274         group = conf->worker_groups + cpu_to_group(sh->cpu);
275
276         group->workers[0].working = true;
277         /* at least one worker should run to avoid race */
278         queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
279
280         thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
281         /* wakeup more workers */
282         for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
283                 if (group->workers[i].working == false) {
284                         group->workers[i].working = true;
285                         queue_work_on(sh->cpu, raid5_wq,
286                                       &group->workers[i].work);
287                         thread_cnt--;
288                 }
289         }
290 }
291
292 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
293                               struct list_head *temp_inactive_list)
294 {
295         BUG_ON(!list_empty(&sh->lru));
296         BUG_ON(atomic_read(&conf->active_stripes)==0);
297         if (test_bit(STRIPE_HANDLE, &sh->state)) {
298                 if (test_bit(STRIPE_DELAYED, &sh->state) &&
299                     !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
300                         list_add_tail(&sh->lru, &conf->delayed_list);
301                 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
302                            sh->bm_seq - conf->seq_write > 0)
303                         list_add_tail(&sh->lru, &conf->bitmap_list);
304                 else {
305                         clear_bit(STRIPE_DELAYED, &sh->state);
306                         clear_bit(STRIPE_BIT_DELAY, &sh->state);
307                         if (conf->worker_cnt_per_group == 0) {
308                                 list_add_tail(&sh->lru, &conf->handle_list);
309                         } else {
310                                 raid5_wakeup_stripe_thread(sh);
311                                 return;
312                         }
313                 }
314                 md_wakeup_thread(conf->mddev->thread);
315         } else {
316                 BUG_ON(stripe_operations_active(sh));
317                 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
318                         if (atomic_dec_return(&conf->preread_active_stripes)
319                             < IO_THRESHOLD)
320                                 md_wakeup_thread(conf->mddev->thread);
321                 atomic_dec(&conf->active_stripes);
322                 if (!test_bit(STRIPE_EXPANDING, &sh->state))
323                         list_add_tail(&sh->lru, temp_inactive_list);
324         }
325 }
326
327 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
328                              struct list_head *temp_inactive_list)
329 {
330         if (atomic_dec_and_test(&sh->count))
331                 do_release_stripe(conf, sh, temp_inactive_list);
332 }
333
334 /*
335  * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
336  *
337  * Be careful: Only one task can add/delete stripes from temp_inactive_list at
338  * given time. Adding stripes only takes device lock, while deleting stripes
339  * only takes hash lock.
340  */
341 static void release_inactive_stripe_list(struct r5conf *conf,
342                                          struct list_head *temp_inactive_list,
343                                          int hash)
344 {
345         int size;
346         bool do_wakeup = false;
347         unsigned long flags;
348
349         if (hash == NR_STRIPE_HASH_LOCKS) {
350                 size = NR_STRIPE_HASH_LOCKS;
351                 hash = NR_STRIPE_HASH_LOCKS - 1;
352         } else
353                 size = 1;
354         while (size) {
355                 struct list_head *list = &temp_inactive_list[size - 1];
356
357                 /*
358                  * We don't hold any lock here yet, get_active_stripe() might
359                  * remove stripes from the list
360                  */
361                 if (!list_empty_careful(list)) {
362                         spin_lock_irqsave(conf->hash_locks + hash, flags);
363                         if (list_empty(conf->inactive_list + hash) &&
364                             !list_empty(list))
365                                 atomic_dec(&conf->empty_inactive_list_nr);
366                         list_splice_tail_init(list, conf->inactive_list + hash);
367                         do_wakeup = true;
368                         spin_unlock_irqrestore(conf->hash_locks + hash, flags);
369                 }
370                 size--;
371                 hash--;
372         }
373
374         if (do_wakeup) {
375                 wake_up(&conf->wait_for_stripe);
376                 if (conf->retry_read_aligned)
377                         md_wakeup_thread(conf->mddev->thread);
378         }
379 }
380
381 /* should hold conf->device_lock already */
382 static int release_stripe_list(struct r5conf *conf,
383                                struct list_head *temp_inactive_list)
384 {
385         struct stripe_head *sh;
386         int count = 0;
387         struct llist_node *head;
388
389         head = llist_del_all(&conf->released_stripes);
390         head = llist_reverse_order(head);
391         while (head) {
392                 int hash;
393
394                 sh = llist_entry(head, struct stripe_head, release_list);
395                 head = llist_next(head);
396                 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
397                 smp_mb();
398                 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
399                 /*
400                  * Don't worry the bit is set here, because if the bit is set
401                  * again, the count is always > 1. This is true for
402                  * STRIPE_ON_UNPLUG_LIST bit too.
403                  */
404                 hash = sh->hash_lock_index;
405                 __release_stripe(conf, sh, &temp_inactive_list[hash]);
406                 count++;
407         }
408
409         return count;
410 }
411
412 static void release_stripe(struct stripe_head *sh)
413 {
414         struct r5conf *conf = sh->raid_conf;
415         unsigned long flags;
416         struct list_head list;
417         int hash;
418         bool wakeup;
419
420         /* Avoid release_list until the last reference.
421          */
422         if (atomic_add_unless(&sh->count, -1, 1))
423                 return;
424
425         if (unlikely(!conf->mddev->thread) ||
426                 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
427                 goto slow_path;
428         wakeup = llist_add(&sh->release_list, &conf->released_stripes);
429         if (wakeup)
430                 md_wakeup_thread(conf->mddev->thread);
431         return;
432 slow_path:
433         local_irq_save(flags);
434         /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
435         if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
436                 INIT_LIST_HEAD(&list);
437                 hash = sh->hash_lock_index;
438                 do_release_stripe(conf, sh, &list);
439                 spin_unlock(&conf->device_lock);
440                 release_inactive_stripe_list(conf, &list, hash);
441         }
442         local_irq_restore(flags);
443 }
444
445 static inline void remove_hash(struct stripe_head *sh)
446 {
447         pr_debug("remove_hash(), stripe %llu\n",
448                 (unsigned long long)sh->sector);
449
450         hlist_del_init(&sh->hash);
451 }
452
453 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
454 {
455         struct hlist_head *hp = stripe_hash(conf, sh->sector);
456
457         pr_debug("insert_hash(), stripe %llu\n",
458                 (unsigned long long)sh->sector);
459
460         hlist_add_head(&sh->hash, hp);
461 }
462
463 /* find an idle stripe, make sure it is unhashed, and return it. */
464 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
465 {
466         struct stripe_head *sh = NULL;
467         struct list_head *first;
468
469         if (list_empty(conf->inactive_list + hash))
470                 goto out;
471         first = (conf->inactive_list + hash)->next;
472         sh = list_entry(first, struct stripe_head, lru);
473         list_del_init(first);
474         remove_hash(sh);
475         atomic_inc(&conf->active_stripes);
476         BUG_ON(hash != sh->hash_lock_index);
477         if (list_empty(conf->inactive_list + hash))
478                 atomic_inc(&conf->empty_inactive_list_nr);
479 out:
480         return sh;
481 }
482
483 static void shrink_buffers(struct stripe_head *sh)
484 {
485         struct page *p;
486         int i;
487         int num = sh->raid_conf->pool_size;
488
489         for (i = 0; i < num ; i++) {
490                 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
491                 p = sh->dev[i].page;
492                 if (!p)
493                         continue;
494                 sh->dev[i].page = NULL;
495                 put_page(p);
496         }
497 }
498
499 static int grow_buffers(struct stripe_head *sh)
500 {
501         int i;
502         int num = sh->raid_conf->pool_size;
503
504         for (i = 0; i < num; i++) {
505                 struct page *page;
506
507                 if (!(page = alloc_page(GFP_KERNEL))) {
508                         return 1;
509                 }
510                 sh->dev[i].page = page;
511                 sh->dev[i].orig_page = page;
512         }
513         return 0;
514 }
515
516 static void raid5_build_block(struct stripe_head *sh, int i, int previous);
517 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
518                             struct stripe_head *sh);
519
520 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
521 {
522         struct r5conf *conf = sh->raid_conf;
523         int i, seq;
524
525         BUG_ON(atomic_read(&sh->count) != 0);
526         BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
527         BUG_ON(stripe_operations_active(sh));
528
529         pr_debug("init_stripe called, stripe %llu\n",
530                 (unsigned long long)sector);
531 retry:
532         seq = read_seqcount_begin(&conf->gen_lock);
533         sh->generation = conf->generation - previous;
534         sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
535         sh->sector = sector;
536         stripe_set_idx(sector, conf, previous, sh);
537         sh->state = 0;
538
539         for (i = sh->disks; i--; ) {
540                 struct r5dev *dev = &sh->dev[i];
541
542                 if (dev->toread || dev->read || dev->towrite || dev->written ||
543                     test_bit(R5_LOCKED, &dev->flags)) {
544                         printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
545                                (unsigned long long)sh->sector, i, dev->toread,
546                                dev->read, dev->towrite, dev->written,
547                                test_bit(R5_LOCKED, &dev->flags));
548                         WARN_ON(1);
549                 }
550                 dev->flags = 0;
551                 raid5_build_block(sh, i, previous);
552         }
553         if (read_seqcount_retry(&conf->gen_lock, seq))
554                 goto retry;
555         insert_hash(conf, sh);
556         sh->cpu = smp_processor_id();
557 }
558
559 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
560                                          short generation)
561 {
562         struct stripe_head *sh;
563
564         pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
565         hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
566                 if (sh->sector == sector && sh->generation == generation)
567                         return sh;
568         pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
569         return NULL;
570 }
571
572 /*
573  * Need to check if array has failed when deciding whether to:
574  *  - start an array
575  *  - remove non-faulty devices
576  *  - add a spare
577  *  - allow a reshape
578  * This determination is simple when no reshape is happening.
579  * However if there is a reshape, we need to carefully check
580  * both the before and after sections.
581  * This is because some failed devices may only affect one
582  * of the two sections, and some non-in_sync devices may
583  * be insync in the section most affected by failed devices.
584  */
585 static int calc_degraded(struct r5conf *conf)
586 {
587         int degraded, degraded2;
588         int i;
589
590         rcu_read_lock();
591         degraded = 0;
592         for (i = 0; i < conf->previous_raid_disks; i++) {
593                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
594                 if (rdev && test_bit(Faulty, &rdev->flags))
595                         rdev = rcu_dereference(conf->disks[i].replacement);
596                 if (!rdev || test_bit(Faulty, &rdev->flags))
597                         degraded++;
598                 else if (test_bit(In_sync, &rdev->flags))
599                         ;
600                 else
601                         /* not in-sync or faulty.
602                          * If the reshape increases the number of devices,
603                          * this is being recovered by the reshape, so
604                          * this 'previous' section is not in_sync.
605                          * If the number of devices is being reduced however,
606                          * the device can only be part of the array if
607                          * we are reverting a reshape, so this section will
608                          * be in-sync.
609                          */
610                         if (conf->raid_disks >= conf->previous_raid_disks)
611                                 degraded++;
612         }
613         rcu_read_unlock();
614         if (conf->raid_disks == conf->previous_raid_disks)
615                 return degraded;
616         rcu_read_lock();
617         degraded2 = 0;
618         for (i = 0; i < conf->raid_disks; i++) {
619                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
620                 if (rdev && test_bit(Faulty, &rdev->flags))
621                         rdev = rcu_dereference(conf->disks[i].replacement);
622                 if (!rdev || test_bit(Faulty, &rdev->flags))
623                         degraded2++;
624                 else if (test_bit(In_sync, &rdev->flags))
625                         ;
626                 else
627                         /* not in-sync or faulty.
628                          * If reshape increases the number of devices, this
629                          * section has already been recovered, else it
630                          * almost certainly hasn't.
631                          */
632                         if (conf->raid_disks <= conf->previous_raid_disks)
633                                 degraded2++;
634         }
635         rcu_read_unlock();
636         if (degraded2 > degraded)
637                 return degraded2;
638         return degraded;
639 }
640
641 static int has_failed(struct r5conf *conf)
642 {
643         int degraded;
644
645         if (conf->mddev->reshape_position == MaxSector)
646                 return conf->mddev->degraded > conf->max_degraded;
647
648         degraded = calc_degraded(conf);
649         if (degraded > conf->max_degraded)
650                 return 1;
651         return 0;
652 }
653
654 static struct stripe_head *
655 get_active_stripe(struct r5conf *conf, sector_t sector,
656                   int previous, int noblock, int noquiesce)
657 {
658         struct stripe_head *sh;
659         int hash = stripe_hash_locks_hash(sector);
660
661         pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
662
663         spin_lock_irq(conf->hash_locks + hash);
664
665         do {
666                 wait_event_lock_irq(conf->wait_for_stripe,
667                                     conf->quiesce == 0 || noquiesce,
668                                     *(conf->hash_locks + hash));
669                 sh = __find_stripe(conf, sector, conf->generation - previous);
670                 if (!sh) {
671                         if (!conf->inactive_blocked)
672                                 sh = get_free_stripe(conf, hash);
673                         if (noblock && sh == NULL)
674                                 break;
675                         if (!sh) {
676                                 conf->inactive_blocked = 1;
677                                 wait_event_lock_irq(
678                                         conf->wait_for_stripe,
679                                         !list_empty(conf->inactive_list + hash) &&
680                                         (atomic_read(&conf->active_stripes)
681                                          < (conf->max_nr_stripes * 3 / 4)
682                                          || !conf->inactive_blocked),
683                                         *(conf->hash_locks + hash));
684                                 conf->inactive_blocked = 0;
685                         } else {
686                                 init_stripe(sh, sector, previous);
687                                 atomic_inc(&sh->count);
688                         }
689                 } else if (!atomic_inc_not_zero(&sh->count)) {
690                         spin_lock(&conf->device_lock);
691                         if (!atomic_read(&sh->count)) {
692                                 if (!test_bit(STRIPE_HANDLE, &sh->state))
693                                         atomic_inc(&conf->active_stripes);
694                                 BUG_ON(list_empty(&sh->lru) &&
695                                        !test_bit(STRIPE_EXPANDING, &sh->state));
696                                 list_del_init(&sh->lru);
697                                 if (sh->group) {
698                                         sh->group->stripes_cnt--;
699                                         sh->group = NULL;
700                                 }
701                         }
702                         atomic_inc(&sh->count);
703                         spin_unlock(&conf->device_lock);
704                 }
705         } while (sh == NULL);
706
707         spin_unlock_irq(conf->hash_locks + hash);
708         return sh;
709 }
710
711 /* Determine if 'data_offset' or 'new_data_offset' should be used
712  * in this stripe_head.
713  */
714 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
715 {
716         sector_t progress = conf->reshape_progress;
717         /* Need a memory barrier to make sure we see the value
718          * of conf->generation, or ->data_offset that was set before
719          * reshape_progress was updated.
720          */
721         smp_rmb();
722         if (progress == MaxSector)
723                 return 0;
724         if (sh->generation == conf->generation - 1)
725                 return 0;
726         /* We are in a reshape, and this is a new-generation stripe,
727          * so use new_data_offset.
728          */
729         return 1;
730 }
731
732 static void
733 raid5_end_read_request(struct bio *bi, int error);
734 static void
735 raid5_end_write_request(struct bio *bi, int error);
736
737 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
738 {
739         struct r5conf *conf = sh->raid_conf;
740         int i, disks = sh->disks;
741
742         might_sleep();
743
744         for (i = disks; i--; ) {
745                 int rw;
746                 int replace_only = 0;
747                 struct bio *bi, *rbi;
748                 struct md_rdev *rdev, *rrdev = NULL;
749                 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
750                         if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
751                                 rw = WRITE_FUA;
752                         else
753                                 rw = WRITE;
754                         if (test_bit(R5_Discard, &sh->dev[i].flags))
755                                 rw |= REQ_DISCARD;
756                 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
757                         rw = READ;
758                 else if (test_and_clear_bit(R5_WantReplace,
759                                             &sh->dev[i].flags)) {
760                         rw = WRITE;
761                         replace_only = 1;
762                 } else
763                         continue;
764                 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
765                         rw |= REQ_SYNC;
766
767                 bi = &sh->dev[i].req;
768                 rbi = &sh->dev[i].rreq; /* For writing to replacement */
769
770                 rcu_read_lock();
771                 rrdev = rcu_dereference(conf->disks[i].replacement);
772                 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
773                 rdev = rcu_dereference(conf->disks[i].rdev);
774                 if (!rdev) {
775                         rdev = rrdev;
776                         rrdev = NULL;
777                 }
778                 if (rw & WRITE) {
779                         if (replace_only)
780                                 rdev = NULL;
781                         if (rdev == rrdev)
782                                 /* We raced and saw duplicates */
783                                 rrdev = NULL;
784                 } else {
785                         if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
786                                 rdev = rrdev;
787                         rrdev = NULL;
788                 }
789
790                 if (rdev && test_bit(Faulty, &rdev->flags))
791                         rdev = NULL;
792                 if (rdev)
793                         atomic_inc(&rdev->nr_pending);
794                 if (rrdev && test_bit(Faulty, &rrdev->flags))
795                         rrdev = NULL;
796                 if (rrdev)
797                         atomic_inc(&rrdev->nr_pending);
798                 rcu_read_unlock();
799
800                 /* We have already checked bad blocks for reads.  Now
801                  * need to check for writes.  We never accept write errors
802                  * on the replacement, so we don't need to check rrdev.
803                  */
804                 while ((rw & WRITE) && rdev &&
805                        test_bit(WriteErrorSeen, &rdev->flags)) {
806                         sector_t first_bad;
807                         int bad_sectors;
808                         int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
809                                               &first_bad, &bad_sectors);
810                         if (!bad)
811                                 break;
812
813                         if (bad < 0) {
814                                 set_bit(BlockedBadBlocks, &rdev->flags);
815                                 if (!conf->mddev->external &&
816                                     conf->mddev->flags) {
817                                         /* It is very unlikely, but we might
818                                          * still need to write out the
819                                          * bad block log - better give it
820                                          * a chance*/
821                                         md_check_recovery(conf->mddev);
822                                 }
823                                 /*
824                                  * Because md_wait_for_blocked_rdev
825                                  * will dec nr_pending, we must
826                                  * increment it first.
827                                  */
828                                 atomic_inc(&rdev->nr_pending);
829                                 md_wait_for_blocked_rdev(rdev, conf->mddev);
830                         } else {
831                                 /* Acknowledged bad block - skip the write */
832                                 rdev_dec_pending(rdev, conf->mddev);
833                                 rdev = NULL;
834                         }
835                 }
836
837                 if (rdev) {
838                         if (s->syncing || s->expanding || s->expanded
839                             || s->replacing)
840                                 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
841
842                         set_bit(STRIPE_IO_STARTED, &sh->state);
843
844                         bio_reset(bi);
845                         bi->bi_bdev = rdev->bdev;
846                         bi->bi_rw = rw;
847                         bi->bi_end_io = (rw & WRITE)
848                                 ? raid5_end_write_request
849                                 : raid5_end_read_request;
850                         bi->bi_private = sh;
851
852                         pr_debug("%s: for %llu schedule op %ld on disc %d\n",
853                                 __func__, (unsigned long long)sh->sector,
854                                 bi->bi_rw, i);
855                         atomic_inc(&sh->count);
856                         if (use_new_offset(conf, sh))
857                                 bi->bi_iter.bi_sector = (sh->sector
858                                                  + rdev->new_data_offset);
859                         else
860                                 bi->bi_iter.bi_sector = (sh->sector
861                                                  + rdev->data_offset);
862                         if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
863                                 bi->bi_rw |= REQ_NOMERGE;
864
865                         if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
866                                 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
867                         sh->dev[i].vec.bv_page = sh->dev[i].page;
868                         bi->bi_vcnt = 1;
869                         bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
870                         bi->bi_io_vec[0].bv_offset = 0;
871                         bi->bi_iter.bi_size = STRIPE_SIZE;
872                         /*
873                          * If this is discard request, set bi_vcnt 0. We don't
874                          * want to confuse SCSI because SCSI will replace payload
875                          */
876                         if (rw & REQ_DISCARD)
877                                 bi->bi_vcnt = 0;
878                         if (rrdev)
879                                 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
880
881                         if (conf->mddev->gendisk)
882                                 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
883                                                       bi, disk_devt(conf->mddev->gendisk),
884                                                       sh->dev[i].sector);
885                         generic_make_request(bi);
886                 }
887                 if (rrdev) {
888                         if (s->syncing || s->expanding || s->expanded
889                             || s->replacing)
890                                 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
891
892                         set_bit(STRIPE_IO_STARTED, &sh->state);
893
894                         bio_reset(rbi);
895                         rbi->bi_bdev = rrdev->bdev;
896                         rbi->bi_rw = rw;
897                         BUG_ON(!(rw & WRITE));
898                         rbi->bi_end_io = raid5_end_write_request;
899                         rbi->bi_private = sh;
900
901                         pr_debug("%s: for %llu schedule op %ld on "
902                                  "replacement disc %d\n",
903                                 __func__, (unsigned long long)sh->sector,
904                                 rbi->bi_rw, i);
905                         atomic_inc(&sh->count);
906                         if (use_new_offset(conf, sh))
907                                 rbi->bi_iter.bi_sector = (sh->sector
908                                                   + rrdev->new_data_offset);
909                         else
910                                 rbi->bi_iter.bi_sector = (sh->sector
911                                                   + rrdev->data_offset);
912                         if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
913                                 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
914                         sh->dev[i].rvec.bv_page = sh->dev[i].page;
915                         rbi->bi_vcnt = 1;
916                         rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
917                         rbi->bi_io_vec[0].bv_offset = 0;
918                         rbi->bi_iter.bi_size = STRIPE_SIZE;
919                         /*
920                          * If this is discard request, set bi_vcnt 0. We don't
921                          * want to confuse SCSI because SCSI will replace payload
922                          */
923                         if (rw & REQ_DISCARD)
924                                 rbi->bi_vcnt = 0;
925                         if (conf->mddev->gendisk)
926                                 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
927                                                       rbi, disk_devt(conf->mddev->gendisk),
928                                                       sh->dev[i].sector);
929                         generic_make_request(rbi);
930                 }
931                 if (!rdev && !rrdev) {
932                         if (rw & WRITE)
933                                 set_bit(STRIPE_DEGRADED, &sh->state);
934                         pr_debug("skip op %ld on disc %d for sector %llu\n",
935                                 bi->bi_rw, i, (unsigned long long)sh->sector);
936                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
937                         set_bit(STRIPE_HANDLE, &sh->state);
938                 }
939         }
940 }
941
942 static struct dma_async_tx_descriptor *
943 async_copy_data(int frombio, struct bio *bio, struct page **page,
944         sector_t sector, struct dma_async_tx_descriptor *tx,
945         struct stripe_head *sh)
946 {
947         struct bio_vec bvl;
948         struct bvec_iter iter;
949         struct page *bio_page;
950         int page_offset;
951         struct async_submit_ctl submit;
952         enum async_tx_flags flags = 0;
953
954         if (bio->bi_iter.bi_sector >= sector)
955                 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
956         else
957                 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
958
959         if (frombio)
960                 flags |= ASYNC_TX_FENCE;
961         init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
962
963         bio_for_each_segment(bvl, bio, iter) {
964                 int len = bvl.bv_len;
965                 int clen;
966                 int b_offset = 0;
967
968                 if (page_offset < 0) {
969                         b_offset = -page_offset;
970                         page_offset += b_offset;
971                         len -= b_offset;
972                 }
973
974                 if (len > 0 && page_offset + len > STRIPE_SIZE)
975                         clen = STRIPE_SIZE - page_offset;
976                 else
977                         clen = len;
978
979                 if (clen > 0) {
980                         b_offset += bvl.bv_offset;
981                         bio_page = bvl.bv_page;
982                         if (frombio) {
983                                 if (sh->raid_conf->skip_copy &&
984                                     b_offset == 0 && page_offset == 0 &&
985                                     clen == STRIPE_SIZE)
986                                         *page = bio_page;
987                                 else
988                                         tx = async_memcpy(*page, bio_page, page_offset,
989                                                   b_offset, clen, &submit);
990                         } else
991                                 tx = async_memcpy(bio_page, *page, b_offset,
992                                                   page_offset, clen, &submit);
993                 }
994                 /* chain the operations */
995                 submit.depend_tx = tx;
996
997                 if (clen < len) /* hit end of page */
998                         break;
999                 page_offset +=  len;
1000         }
1001
1002         return tx;
1003 }
1004
1005 static void ops_complete_biofill(void *stripe_head_ref)
1006 {
1007         struct stripe_head *sh = stripe_head_ref;
1008         struct bio *return_bi = NULL;
1009         int i;
1010
1011         pr_debug("%s: stripe %llu\n", __func__,
1012                 (unsigned long long)sh->sector);
1013
1014         /* clear completed biofills */
1015         for (i = sh->disks; i--; ) {
1016                 struct r5dev *dev = &sh->dev[i];
1017
1018                 /* acknowledge completion of a biofill operation */
1019                 /* and check if we need to reply to a read request,
1020                  * new R5_Wantfill requests are held off until
1021                  * !STRIPE_BIOFILL_RUN
1022                  */
1023                 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1024                         struct bio *rbi, *rbi2;
1025
1026                         BUG_ON(!dev->read);
1027                         rbi = dev->read;
1028                         dev->read = NULL;
1029                         while (rbi && rbi->bi_iter.bi_sector <
1030                                 dev->sector + STRIPE_SECTORS) {
1031                                 rbi2 = r5_next_bio(rbi, dev->sector);
1032                                 if (!raid5_dec_bi_active_stripes(rbi)) {
1033                                         rbi->bi_next = return_bi;
1034                                         return_bi = rbi;
1035                                 }
1036                                 rbi = rbi2;
1037                         }
1038                 }
1039         }
1040         clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1041
1042         return_io(return_bi);
1043
1044         set_bit(STRIPE_HANDLE, &sh->state);
1045         release_stripe(sh);
1046 }
1047
1048 static void ops_run_biofill(struct stripe_head *sh)
1049 {
1050         struct dma_async_tx_descriptor *tx = NULL;
1051         struct async_submit_ctl submit;
1052         int i;
1053
1054         pr_debug("%s: stripe %llu\n", __func__,
1055                 (unsigned long long)sh->sector);
1056
1057         for (i = sh->disks; i--; ) {
1058                 struct r5dev *dev = &sh->dev[i];
1059                 if (test_bit(R5_Wantfill, &dev->flags)) {
1060                         struct bio *rbi;
1061                         spin_lock_irq(&sh->stripe_lock);
1062                         dev->read = rbi = dev->toread;
1063                         dev->toread = NULL;
1064                         spin_unlock_irq(&sh->stripe_lock);
1065                         while (rbi && rbi->bi_iter.bi_sector <
1066                                 dev->sector + STRIPE_SECTORS) {
1067                                 tx = async_copy_data(0, rbi, &dev->page,
1068                                         dev->sector, tx, sh);
1069                                 rbi = r5_next_bio(rbi, dev->sector);
1070                         }
1071                 }
1072         }
1073
1074         atomic_inc(&sh->count);
1075         init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1076         async_trigger_callback(&submit);
1077 }
1078
1079 static void mark_target_uptodate(struct stripe_head *sh, int target)
1080 {
1081         struct r5dev *tgt;
1082
1083         if (target < 0)
1084                 return;
1085
1086         tgt = &sh->dev[target];
1087         set_bit(R5_UPTODATE, &tgt->flags);
1088         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1089         clear_bit(R5_Wantcompute, &tgt->flags);
1090 }
1091
1092 static void ops_complete_compute(void *stripe_head_ref)
1093 {
1094         struct stripe_head *sh = stripe_head_ref;
1095
1096         pr_debug("%s: stripe %llu\n", __func__,
1097                 (unsigned long long)sh->sector);
1098
1099         /* mark the computed target(s) as uptodate */
1100         mark_target_uptodate(sh, sh->ops.target);
1101         mark_target_uptodate(sh, sh->ops.target2);
1102
1103         clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1104         if (sh->check_state == check_state_compute_run)
1105                 sh->check_state = check_state_compute_result;
1106         set_bit(STRIPE_HANDLE, &sh->state);
1107         release_stripe(sh);
1108 }
1109
1110 /* return a pointer to the address conversion region of the scribble buffer */
1111 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1112                                  struct raid5_percpu *percpu)
1113 {
1114         return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
1115 }
1116
1117 static struct dma_async_tx_descriptor *
1118 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1119 {
1120         int disks = sh->disks;
1121         struct page **xor_srcs = percpu->scribble;
1122         int target = sh->ops.target;
1123         struct r5dev *tgt = &sh->dev[target];
1124         struct page *xor_dest = tgt->page;
1125         int count = 0;
1126         struct dma_async_tx_descriptor *tx;
1127         struct async_submit_ctl submit;
1128         int i;
1129
1130         pr_debug("%s: stripe %llu block: %d\n",
1131                 __func__, (unsigned long long)sh->sector, target);
1132         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1133
1134         for (i = disks; i--; )
1135                 if (i != target)
1136                         xor_srcs[count++] = sh->dev[i].page;
1137
1138         atomic_inc(&sh->count);
1139
1140         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1141                           ops_complete_compute, sh, to_addr_conv(sh, percpu));
1142         if (unlikely(count == 1))
1143                 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1144         else
1145                 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1146
1147         return tx;
1148 }
1149
1150 /* set_syndrome_sources - populate source buffers for gen_syndrome
1151  * @srcs - (struct page *) array of size sh->disks
1152  * @sh - stripe_head to parse
1153  *
1154  * Populates srcs in proper layout order for the stripe and returns the
1155  * 'count' of sources to be used in a call to async_gen_syndrome.  The P
1156  * destination buffer is recorded in srcs[count] and the Q destination
1157  * is recorded in srcs[count+1]].
1158  */
1159 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
1160 {
1161         int disks = sh->disks;
1162         int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1163         int d0_idx = raid6_d0(sh);
1164         int count;
1165         int i;
1166
1167         for (i = 0; i < disks; i++)
1168                 srcs[i] = NULL;
1169
1170         count = 0;
1171         i = d0_idx;
1172         do {
1173                 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1174
1175                 srcs[slot] = sh->dev[i].page;
1176                 i = raid6_next_disk(i, disks);
1177         } while (i != d0_idx);
1178
1179         return syndrome_disks;
1180 }
1181
1182 static struct dma_async_tx_descriptor *
1183 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1184 {
1185         int disks = sh->disks;
1186         struct page **blocks = percpu->scribble;
1187         int target;
1188         int qd_idx = sh->qd_idx;
1189         struct dma_async_tx_descriptor *tx;
1190         struct async_submit_ctl submit;
1191         struct r5dev *tgt;
1192         struct page *dest;
1193         int i;
1194         int count;
1195
1196         if (sh->ops.target < 0)
1197                 target = sh->ops.target2;
1198         else if (sh->ops.target2 < 0)
1199                 target = sh->ops.target;
1200         else
1201                 /* we should only have one valid target */
1202                 BUG();
1203         BUG_ON(target < 0);
1204         pr_debug("%s: stripe %llu block: %d\n",
1205                 __func__, (unsigned long long)sh->sector, target);
1206
1207         tgt = &sh->dev[target];
1208         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1209         dest = tgt->page;
1210
1211         atomic_inc(&sh->count);
1212
1213         if (target == qd_idx) {
1214                 count = set_syndrome_sources(blocks, sh);
1215                 blocks[count] = NULL; /* regenerating p is not necessary */
1216                 BUG_ON(blocks[count+1] != dest); /* q should already be set */
1217                 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1218                                   ops_complete_compute, sh,
1219                                   to_addr_conv(sh, percpu));
1220                 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1221         } else {
1222                 /* Compute any data- or p-drive using XOR */
1223                 count = 0;
1224                 for (i = disks; i-- ; ) {
1225                         if (i == target || i == qd_idx)
1226                                 continue;
1227                         blocks[count++] = sh->dev[i].page;
1228                 }
1229
1230                 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1231                                   NULL, ops_complete_compute, sh,
1232                                   to_addr_conv(sh, percpu));
1233                 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1234         }
1235
1236         return tx;
1237 }
1238
1239 static struct dma_async_tx_descriptor *
1240 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1241 {
1242         int i, count, disks = sh->disks;
1243         int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1244         int d0_idx = raid6_d0(sh);
1245         int faila = -1, failb = -1;
1246         int target = sh->ops.target;
1247         int target2 = sh->ops.target2;
1248         struct r5dev *tgt = &sh->dev[target];
1249         struct r5dev *tgt2 = &sh->dev[target2];
1250         struct dma_async_tx_descriptor *tx;
1251         struct page **blocks = percpu->scribble;
1252         struct async_submit_ctl submit;
1253
1254         pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1255                  __func__, (unsigned long long)sh->sector, target, target2);
1256         BUG_ON(target < 0 || target2 < 0);
1257         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1258         BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1259
1260         /* we need to open-code set_syndrome_sources to handle the
1261          * slot number conversion for 'faila' and 'failb'
1262          */
1263         for (i = 0; i < disks ; i++)
1264                 blocks[i] = NULL;
1265         count = 0;
1266         i = d0_idx;
1267         do {
1268                 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1269
1270                 blocks[slot] = sh->dev[i].page;
1271
1272                 if (i == target)
1273                         faila = slot;
1274                 if (i == target2)
1275                         failb = slot;
1276                 i = raid6_next_disk(i, disks);
1277         } while (i != d0_idx);
1278
1279         BUG_ON(faila == failb);
1280         if (failb < faila)
1281                 swap(faila, failb);
1282         pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1283                  __func__, (unsigned long long)sh->sector, faila, failb);
1284
1285         atomic_inc(&sh->count);
1286
1287         if (failb == syndrome_disks+1) {
1288                 /* Q disk is one of the missing disks */
1289                 if (faila == syndrome_disks) {
1290                         /* Missing P+Q, just recompute */
1291                         init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1292                                           ops_complete_compute, sh,
1293                                           to_addr_conv(sh, percpu));
1294                         return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1295                                                   STRIPE_SIZE, &submit);
1296                 } else {
1297                         struct page *dest;
1298                         int data_target;
1299                         int qd_idx = sh->qd_idx;
1300
1301                         /* Missing D+Q: recompute D from P, then recompute Q */
1302                         if (target == qd_idx)
1303                                 data_target = target2;
1304                         else
1305                                 data_target = target;
1306
1307                         count = 0;
1308                         for (i = disks; i-- ; ) {
1309                                 if (i == data_target || i == qd_idx)
1310                                         continue;
1311                                 blocks[count++] = sh->dev[i].page;
1312                         }
1313                         dest = sh->dev[data_target].page;
1314                         init_async_submit(&submit,
1315                                           ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1316                                           NULL, NULL, NULL,
1317                                           to_addr_conv(sh, percpu));
1318                         tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1319                                        &submit);
1320
1321                         count = set_syndrome_sources(blocks, sh);
1322                         init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1323                                           ops_complete_compute, sh,
1324                                           to_addr_conv(sh, percpu));
1325                         return async_gen_syndrome(blocks, 0, count+2,
1326                                                   STRIPE_SIZE, &submit);
1327                 }
1328         } else {
1329                 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1330                                   ops_complete_compute, sh,
1331                                   to_addr_conv(sh, percpu));
1332                 if (failb == syndrome_disks) {
1333                         /* We're missing D+P. */
1334                         return async_raid6_datap_recov(syndrome_disks+2,
1335                                                        STRIPE_SIZE, faila,
1336                                                        blocks, &submit);
1337                 } else {
1338                         /* We're missing D+D. */
1339                         return async_raid6_2data_recov(syndrome_disks+2,
1340                                                        STRIPE_SIZE, faila, failb,
1341                                                        blocks, &submit);
1342                 }
1343         }
1344 }
1345
1346 static void ops_complete_prexor(void *stripe_head_ref)
1347 {
1348         struct stripe_head *sh = stripe_head_ref;
1349
1350         pr_debug("%s: stripe %llu\n", __func__,
1351                 (unsigned long long)sh->sector);
1352 }
1353
1354 static struct dma_async_tx_descriptor *
1355 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
1356                struct dma_async_tx_descriptor *tx)
1357 {
1358         int disks = sh->disks;
1359         struct page **xor_srcs = percpu->scribble;
1360         int count = 0, pd_idx = sh->pd_idx, i;
1361         struct async_submit_ctl submit;
1362
1363         /* existing parity data subtracted */
1364         struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1365
1366         pr_debug("%s: stripe %llu\n", __func__,
1367                 (unsigned long long)sh->sector);
1368
1369         for (i = disks; i--; ) {
1370                 struct r5dev *dev = &sh->dev[i];
1371                 /* Only process blocks that are known to be uptodate */
1372                 if (test_bit(R5_Wantdrain, &dev->flags))
1373                         xor_srcs[count++] = dev->page;
1374         }
1375
1376         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1377                           ops_complete_prexor, sh, to_addr_conv(sh, percpu));
1378         tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1379
1380         return tx;
1381 }
1382
1383 static struct dma_async_tx_descriptor *
1384 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1385 {
1386         int disks = sh->disks;
1387         int i;
1388
1389         pr_debug("%s: stripe %llu\n", __func__,
1390                 (unsigned long long)sh->sector);
1391
1392         for (i = disks; i--; ) {
1393                 struct r5dev *dev = &sh->dev[i];
1394                 struct bio *chosen;
1395
1396                 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1397                         struct bio *wbi;
1398
1399                         spin_lock_irq(&sh->stripe_lock);
1400                         chosen = dev->towrite;
1401                         dev->towrite = NULL;
1402                         BUG_ON(dev->written);
1403                         wbi = dev->written = chosen;
1404                         spin_unlock_irq(&sh->stripe_lock);
1405                         WARN_ON(dev->page != dev->orig_page);
1406
1407                         while (wbi && wbi->bi_iter.bi_sector <
1408                                 dev->sector + STRIPE_SECTORS) {
1409                                 if (wbi->bi_rw & REQ_FUA)
1410                                         set_bit(R5_WantFUA, &dev->flags);
1411                                 if (wbi->bi_rw & REQ_SYNC)
1412                                         set_bit(R5_SyncIO, &dev->flags);
1413                                 if (wbi->bi_rw & REQ_DISCARD)
1414                                         set_bit(R5_Discard, &dev->flags);
1415                                 else {
1416                                         tx = async_copy_data(1, wbi, &dev->page,
1417                                                 dev->sector, tx, sh);
1418                                         if (dev->page != dev->orig_page) {
1419                                                 set_bit(R5_SkipCopy, &dev->flags);
1420                                                 clear_bit(R5_UPTODATE, &dev->flags);
1421                                                 clear_bit(R5_OVERWRITE, &dev->flags);
1422                                         }
1423                                 }
1424                                 wbi = r5_next_bio(wbi, dev->sector);
1425                         }
1426                 }
1427         }
1428
1429         return tx;
1430 }
1431
1432 static void ops_complete_reconstruct(void *stripe_head_ref)
1433 {
1434         struct stripe_head *sh = stripe_head_ref;
1435         int disks = sh->disks;
1436         int pd_idx = sh->pd_idx;
1437         int qd_idx = sh->qd_idx;
1438         int i;
1439         bool fua = false, sync = false, discard = false;
1440
1441         pr_debug("%s: stripe %llu\n", __func__,
1442                 (unsigned long long)sh->sector);
1443
1444         for (i = disks; i--; ) {
1445                 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1446                 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1447                 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1448         }
1449
1450         for (i = disks; i--; ) {
1451                 struct r5dev *dev = &sh->dev[i];
1452
1453                 if (dev->written || i == pd_idx || i == qd_idx) {
1454                         if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
1455                                 set_bit(R5_UPTODATE, &dev->flags);
1456                         if (fua)
1457                                 set_bit(R5_WantFUA, &dev->flags);
1458                         if (sync)
1459                                 set_bit(R5_SyncIO, &dev->flags);
1460                 }
1461         }
1462
1463         if (sh->reconstruct_state == reconstruct_state_drain_run)
1464                 sh->reconstruct_state = reconstruct_state_drain_result;
1465         else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1466                 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1467         else {
1468                 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1469                 sh->reconstruct_state = reconstruct_state_result;
1470         }
1471
1472         set_bit(STRIPE_HANDLE, &sh->state);
1473         release_stripe(sh);
1474 }
1475
1476 static void
1477 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1478                      struct dma_async_tx_descriptor *tx)
1479 {
1480         int disks = sh->disks;
1481         struct page **xor_srcs = percpu->scribble;
1482         struct async_submit_ctl submit;
1483         int count = 0, pd_idx = sh->pd_idx, i;
1484         struct page *xor_dest;
1485         int prexor = 0;
1486         unsigned long flags;
1487
1488         pr_debug("%s: stripe %llu\n", __func__,
1489                 (unsigned long long)sh->sector);
1490
1491         for (i = 0; i < sh->disks; i++) {
1492                 if (pd_idx == i)
1493                         continue;
1494                 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1495                         break;
1496         }
1497         if (i >= sh->disks) {
1498                 atomic_inc(&sh->count);
1499                 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1500                 ops_complete_reconstruct(sh);
1501                 return;
1502         }
1503         /* check if prexor is active which means only process blocks
1504          * that are part of a read-modify-write (written)
1505          */
1506         if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1507                 prexor = 1;
1508                 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1509                 for (i = disks; i--; ) {
1510                         struct r5dev *dev = &sh->dev[i];
1511                         if (dev->written)
1512                                 xor_srcs[count++] = dev->page;
1513                 }
1514         } else {
1515                 xor_dest = sh->dev[pd_idx].page;
1516                 for (i = disks; i--; ) {
1517                         struct r5dev *dev = &sh->dev[i];
1518                         if (i != pd_idx)
1519                                 xor_srcs[count++] = dev->page;
1520                 }
1521         }
1522
1523         /* 1/ if we prexor'd then the dest is reused as a source
1524          * 2/ if we did not prexor then we are redoing the parity
1525          * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1526          * for the synchronous xor case
1527          */
1528         flags = ASYNC_TX_ACK |
1529                 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1530
1531         atomic_inc(&sh->count);
1532
1533         init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1534                           to_addr_conv(sh, percpu));
1535         if (unlikely(count == 1))
1536                 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1537         else
1538                 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1539 }
1540
1541 static void
1542 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1543                      struct dma_async_tx_descriptor *tx)
1544 {
1545         struct async_submit_ctl submit;
1546         struct page **blocks = percpu->scribble;
1547         int count, i;
1548
1549         pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1550
1551         for (i = 0; i < sh->disks; i++) {
1552                 if (sh->pd_idx == i || sh->qd_idx == i)
1553                         continue;
1554                 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1555                         break;
1556         }
1557         if (i >= sh->disks) {
1558                 atomic_inc(&sh->count);
1559                 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1560                 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1561                 ops_complete_reconstruct(sh);
1562                 return;
1563         }
1564
1565         count = set_syndrome_sources(blocks, sh);
1566
1567         atomic_inc(&sh->count);
1568
1569         init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1570                           sh, to_addr_conv(sh, percpu));
1571         async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1572 }
1573
1574 static void ops_complete_check(void *stripe_head_ref)
1575 {
1576         struct stripe_head *sh = stripe_head_ref;
1577
1578         pr_debug("%s: stripe %llu\n", __func__,
1579                 (unsigned long long)sh->sector);
1580
1581         sh->check_state = check_state_check_result;
1582         set_bit(STRIPE_HANDLE, &sh->state);
1583         release_stripe(sh);
1584 }
1585
1586 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1587 {
1588         int disks = sh->disks;
1589         int pd_idx = sh->pd_idx;
1590         int qd_idx = sh->qd_idx;
1591         struct page *xor_dest;
1592         struct page **xor_srcs = percpu->scribble;
1593         struct dma_async_tx_descriptor *tx;
1594         struct async_submit_ctl submit;
1595         int count;
1596         int i;
1597
1598         pr_debug("%s: stripe %llu\n", __func__,
1599                 (unsigned long long)sh->sector);
1600
1601         count = 0;
1602         xor_dest = sh->dev[pd_idx].page;
1603         xor_srcs[count++] = xor_dest;
1604         for (i = disks; i--; ) {
1605                 if (i == pd_idx || i == qd_idx)
1606                         continue;
1607                 xor_srcs[count++] = sh->dev[i].page;
1608         }
1609
1610         init_async_submit(&submit, 0, NULL, NULL, NULL,
1611                           to_addr_conv(sh, percpu));
1612         tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1613                            &sh->ops.zero_sum_result, &submit);
1614
1615         atomic_inc(&sh->count);
1616         init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1617         tx = async_trigger_callback(&submit);
1618 }
1619
1620 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1621 {
1622         struct page **srcs = percpu->scribble;
1623         struct async_submit_ctl submit;
1624         int count;
1625
1626         pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1627                 (unsigned long long)sh->sector, checkp);
1628
1629         count = set_syndrome_sources(srcs, sh);
1630         if (!checkp)
1631                 srcs[count] = NULL;
1632
1633         atomic_inc(&sh->count);
1634         init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1635                           sh, to_addr_conv(sh, percpu));
1636         async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1637                            &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1638 }
1639
1640 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1641 {
1642         int overlap_clear = 0, i, disks = sh->disks;
1643         struct dma_async_tx_descriptor *tx = NULL;
1644         struct r5conf *conf = sh->raid_conf;
1645         int level = conf->level;
1646         struct raid5_percpu *percpu;
1647         unsigned long cpu;
1648
1649         cpu = get_cpu();
1650         percpu = per_cpu_ptr(conf->percpu, cpu);
1651         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1652                 ops_run_biofill(sh);
1653                 overlap_clear++;
1654         }
1655
1656         if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1657                 if (level < 6)
1658                         tx = ops_run_compute5(sh, percpu);
1659                 else {
1660                         if (sh->ops.target2 < 0 || sh->ops.target < 0)
1661                                 tx = ops_run_compute6_1(sh, percpu);
1662                         else
1663                                 tx = ops_run_compute6_2(sh, percpu);
1664                 }
1665                 /* terminate the chain if reconstruct is not set to be run */
1666                 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1667                         async_tx_ack(tx);
1668         }
1669
1670         if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1671                 tx = ops_run_prexor(sh, percpu, tx);
1672
1673         if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1674                 tx = ops_run_biodrain(sh, tx);
1675                 overlap_clear++;
1676         }
1677
1678         if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1679                 if (level < 6)
1680                         ops_run_reconstruct5(sh, percpu, tx);
1681                 else
1682                         ops_run_reconstruct6(sh, percpu, tx);
1683         }
1684
1685         if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1686                 if (sh->check_state == check_state_run)
1687                         ops_run_check_p(sh, percpu);
1688                 else if (sh->check_state == check_state_run_q)
1689                         ops_run_check_pq(sh, percpu, 0);
1690                 else if (sh->check_state == check_state_run_pq)
1691                         ops_run_check_pq(sh, percpu, 1);
1692                 else
1693                         BUG();
1694         }
1695
1696         if (overlap_clear)
1697                 for (i = disks; i--; ) {
1698                         struct r5dev *dev = &sh->dev[i];
1699                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
1700                                 wake_up(&sh->raid_conf->wait_for_overlap);
1701                 }
1702         put_cpu();
1703 }
1704
1705 static int grow_one_stripe(struct r5conf *conf, int hash)
1706 {
1707         struct stripe_head *sh;
1708         sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
1709         if (!sh)
1710                 return 0;
1711
1712         sh->raid_conf = conf;
1713
1714         spin_lock_init(&sh->stripe_lock);
1715
1716         if (grow_buffers(sh)) {
1717                 shrink_buffers(sh);
1718                 kmem_cache_free(conf->slab_cache, sh);
1719                 return 0;
1720         }
1721         sh->hash_lock_index = hash;
1722         /* we just created an active stripe so... */
1723         atomic_set(&sh->count, 1);
1724         atomic_inc(&conf->active_stripes);
1725         INIT_LIST_HEAD(&sh->lru);
1726         release_stripe(sh);
1727         return 1;
1728 }
1729
1730 static int grow_stripes(struct r5conf *conf, int num)
1731 {
1732         struct kmem_cache *sc;
1733         int devs = max(conf->raid_disks, conf->previous_raid_disks);
1734         int hash;
1735
1736         if (conf->mddev->gendisk)
1737                 sprintf(conf->cache_name[0],
1738                         "raid%d-%s", conf->level, mdname(conf->mddev));
1739         else
1740                 sprintf(conf->cache_name[0],
1741                         "raid%d-%p", conf->level, conf->mddev);
1742         sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
1743
1744         conf->active_name = 0;
1745         sc = kmem_cache_create(conf->cache_name[conf->active_name],
1746                                sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
1747                                0, 0, NULL);
1748         if (!sc)
1749                 return 1;
1750         conf->slab_cache = sc;
1751         conf->pool_size = devs;
1752         hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
1753         while (num--) {
1754                 if (!grow_one_stripe(conf, hash))
1755                         return 1;
1756                 conf->max_nr_stripes++;
1757                 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
1758         }
1759         return 0;
1760 }
1761
1762 /**
1763  * scribble_len - return the required size of the scribble region
1764  * @num - total number of disks in the array
1765  *
1766  * The size must be enough to contain:
1767  * 1/ a struct page pointer for each device in the array +2
1768  * 2/ room to convert each entry in (1) to its corresponding dma
1769  *    (dma_map_page()) or page (page_address()) address.
1770  *
1771  * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1772  * calculate over all devices (not just the data blocks), using zeros in place
1773  * of the P and Q blocks.
1774  */
1775 static size_t scribble_len(int num)
1776 {
1777         size_t len;
1778
1779         len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1780
1781         return len;
1782 }
1783
1784 static int resize_stripes(struct r5conf *conf, int newsize)
1785 {
1786         /* Make all the stripes able to hold 'newsize' devices.
1787          * New slots in each stripe get 'page' set to a new page.
1788          *
1789          * This happens in stages:
1790          * 1/ create a new kmem_cache and allocate the required number of
1791          *    stripe_heads.
1792          * 2/ gather all the old stripe_heads and transfer the pages across
1793          *    to the new stripe_heads.  This will have the side effect of
1794          *    freezing the array as once all stripe_heads have been collected,
1795          *    no IO will be possible.  Old stripe heads are freed once their
1796          *    pages have been transferred over, and the old kmem_cache is
1797          *    freed when all stripes are done.
1798          * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
1799          *    we simple return a failre status - no need to clean anything up.
1800          * 4/ allocate new pages for the new slots in the new stripe_heads.
1801          *    If this fails, we don't bother trying the shrink the
1802          *    stripe_heads down again, we just leave them as they are.
1803          *    As each stripe_head is processed the new one is released into
1804          *    active service.
1805          *
1806          * Once step2 is started, we cannot afford to wait for a write,
1807          * so we use GFP_NOIO allocations.
1808          */
1809         struct stripe_head *osh, *nsh;
1810         LIST_HEAD(newstripes);
1811         struct disk_info *ndisks;
1812         unsigned long cpu;
1813         int err;
1814         struct kmem_cache *sc;
1815         int i;
1816         int hash, cnt;
1817
1818         if (newsize <= conf->pool_size)
1819                 return 0; /* never bother to shrink */
1820
1821         err = md_allow_write(conf->mddev);
1822         if (err)
1823                 return err;
1824
1825         /* Step 1 */
1826         sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1827                                sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1828                                0, 0, NULL);
1829         if (!sc)
1830                 return -ENOMEM;
1831
1832         for (i = conf->max_nr_stripes; i; i--) {
1833                 nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
1834                 if (!nsh)
1835                         break;
1836
1837                 nsh->raid_conf = conf;
1838                 spin_lock_init(&nsh->stripe_lock);
1839
1840                 list_add(&nsh->lru, &newstripes);
1841         }
1842         if (i) {
1843                 /* didn't get enough, give up */
1844                 while (!list_empty(&newstripes)) {
1845                         nsh = list_entry(newstripes.next, struct stripe_head, lru);
1846                         list_del(&nsh->lru);
1847                         kmem_cache_free(sc, nsh);
1848                 }
1849                 kmem_cache_destroy(sc);
1850                 return -ENOMEM;
1851         }
1852         /* Step 2 - Must use GFP_NOIO now.
1853          * OK, we have enough stripes, start collecting inactive
1854          * stripes and copying them over
1855          */
1856         hash = 0;
1857         cnt = 0;
1858         list_for_each_entry(nsh, &newstripes, lru) {
1859                 lock_device_hash_lock(conf, hash);
1860                 wait_event_cmd(conf->wait_for_stripe,
1861                                     !list_empty(conf->inactive_list + hash),
1862                                     unlock_device_hash_lock(conf, hash),
1863                                     lock_device_hash_lock(conf, hash));
1864                 osh = get_free_stripe(conf, hash);
1865                 unlock_device_hash_lock(conf, hash);
1866                 atomic_set(&nsh->count, 1);
1867                 for(i=0; i<conf->pool_size; i++) {
1868                         nsh->dev[i].page = osh->dev[i].page;
1869                         nsh->dev[i].orig_page = osh->dev[i].page;
1870                 }
1871                 for( ; i<newsize; i++)
1872                         nsh->dev[i].page = NULL;
1873                 nsh->hash_lock_index = hash;
1874                 kmem_cache_free(conf->slab_cache, osh);
1875                 cnt++;
1876                 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
1877                     !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
1878                         hash++;
1879                         cnt = 0;
1880                 }
1881         }
1882         kmem_cache_destroy(conf->slab_cache);
1883
1884         /* Step 3.
1885          * At this point, we are holding all the stripes so the array
1886          * is completely stalled, so now is a good time to resize
1887          * conf->disks and the scribble region
1888          */
1889         ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1890         if (ndisks) {
1891                 for (i=0; i<conf->raid_disks; i++)
1892                         ndisks[i] = conf->disks[i];
1893                 kfree(conf->disks);
1894                 conf->disks = ndisks;
1895         } else
1896                 err = -ENOMEM;
1897
1898         get_online_cpus();
1899         conf->scribble_len = scribble_len(newsize);
1900         for_each_present_cpu(cpu) {
1901                 struct raid5_percpu *percpu;
1902                 void *scribble;
1903
1904                 percpu = per_cpu_ptr(conf->percpu, cpu);
1905                 scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1906
1907                 if (scribble) {
1908                         kfree(percpu->scribble);
1909                         percpu->scribble = scribble;
1910                 } else {
1911                         err = -ENOMEM;
1912                         break;
1913                 }
1914         }
1915         put_online_cpus();
1916
1917         /* Step 4, return new stripes to service */
1918         while(!list_empty(&newstripes)) {
1919                 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1920                 list_del_init(&nsh->lru);
1921
1922                 for (i=conf->raid_disks; i < newsize; i++)
1923                         if (nsh->dev[i].page == NULL) {
1924                                 struct page *p = alloc_page(GFP_NOIO);
1925                                 nsh->dev[i].page = p;
1926                                 nsh->dev[i].orig_page = p;
1927                                 if (!p)
1928                                         err = -ENOMEM;
1929                         }
1930                 release_stripe(nsh);
1931         }
1932         /* critical section pass, GFP_NOIO no longer needed */
1933
1934         conf->slab_cache = sc;
1935         conf->active_name = 1-conf->active_name;
1936         conf->pool_size = newsize;
1937         return err;
1938 }
1939
1940 static int drop_one_stripe(struct r5conf *conf, int hash)
1941 {
1942         struct stripe_head *sh;
1943
1944         spin_lock_irq(conf->hash_locks + hash);
1945         sh = get_free_stripe(conf, hash);
1946         spin_unlock_irq(conf->hash_locks + hash);
1947         if (!sh)
1948                 return 0;
1949         BUG_ON(atomic_read(&sh->count));
1950         shrink_buffers(sh);
1951         kmem_cache_free(conf->slab_cache, sh);
1952         atomic_dec(&conf->active_stripes);
1953         return 1;
1954 }
1955
1956 static void shrink_stripes(struct r5conf *conf)
1957 {
1958         int hash;
1959         for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
1960                 while (drop_one_stripe(conf, hash))
1961                         ;
1962
1963         if (conf->slab_cache)
1964                 kmem_cache_destroy(conf->slab_cache);
1965         conf->slab_cache = NULL;
1966 }
1967
1968 static void raid5_end_read_request(struct bio * bi, int error)
1969 {
1970         struct stripe_head *sh = bi->bi_private;
1971         struct r5conf *conf = sh->raid_conf;
1972         int disks = sh->disks, i;
1973         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1974         char b[BDEVNAME_SIZE];
1975         struct md_rdev *rdev = NULL;
1976         sector_t s;
1977
1978         for (i=0 ; i<disks; i++)
1979                 if (bi == &sh->dev[i].req)
1980                         break;
1981
1982         pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1983                 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1984                 uptodate);
1985         if (i == disks) {
1986                 BUG();
1987                 return;
1988         }
1989         if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1990                 /* If replacement finished while this request was outstanding,
1991                  * 'replacement' might be NULL already.
1992                  * In that case it moved down to 'rdev'.
1993                  * rdev is not removed until all requests are finished.
1994                  */
1995                 rdev = conf->disks[i].replacement;
1996         if (!rdev)
1997                 rdev = conf->disks[i].rdev;
1998
1999         if (use_new_offset(conf, sh))
2000                 s = sh->sector + rdev->new_data_offset;
2001         else
2002                 s = sh->sector + rdev->data_offset;
2003         if (uptodate) {
2004                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2005                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2006                         /* Note that this cannot happen on a
2007                          * replacement device.  We just fail those on
2008                          * any error
2009                          */
2010                         printk_ratelimited(
2011                                 KERN_INFO
2012                                 "md/raid:%s: read error corrected"
2013                                 " (%lu sectors at %llu on %s)\n",
2014                                 mdname(conf->mddev), STRIPE_SECTORS,
2015                                 (unsigned long long)s,
2016                                 bdevname(rdev->bdev, b));
2017                         atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2018                         clear_bit(R5_ReadError, &sh->dev[i].flags);
2019                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
2020                 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2021                         clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2022
2023                 if (atomic_read(&rdev->read_errors))
2024                         atomic_set(&rdev->read_errors, 0);
2025         } else {
2026                 const char *bdn = bdevname(rdev->bdev, b);
2027                 int retry = 0;
2028                 int set_bad = 0;
2029
2030                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2031                 atomic_inc(&rdev->read_errors);
2032                 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2033                         printk_ratelimited(
2034                                 KERN_WARNING
2035                                 "md/raid:%s: read error on replacement device "
2036                                 "(sector %llu on %s).\n",
2037                                 mdname(conf->mddev),
2038                                 (unsigned long long)s,
2039                                 bdn);
2040                 else if (conf->mddev->degraded >= conf->max_degraded) {
2041                         set_bad = 1;
2042                         printk_ratelimited(
2043                                 KERN_WARNING
2044                                 "md/raid:%s: read error not correctable "
2045                                 "(sector %llu on %s).\n",
2046                                 mdname(conf->mddev),
2047                                 (unsigned long long)s,
2048                                 bdn);
2049                 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2050                         /* Oh, no!!! */
2051                         set_bad = 1;
2052                         printk_ratelimited(
2053                                 KERN_WARNING
2054                                 "md/raid:%s: read error NOT corrected!! "
2055                                 "(sector %llu on %s).\n",
2056                                 mdname(conf->mddev),
2057                                 (unsigned long long)s,
2058                                 bdn);
2059                 } else if (atomic_read(&rdev->read_errors)
2060                          > conf->max_nr_stripes)
2061                         printk(KERN_WARNING
2062                                "md/raid:%s: Too many read errors, failing device %s.\n",
2063                                mdname(conf->mddev), bdn);
2064                 else
2065                         retry = 1;
2066                 if (set_bad && test_bit(In_sync, &rdev->flags)
2067                     && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2068                         retry = 1;
2069                 if (retry)
2070                         if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2071                                 set_bit(R5_ReadError, &sh->dev[i].flags);
2072                                 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2073                         } else
2074                                 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2075                 else {
2076                         clear_bit(R5_ReadError, &sh->dev[i].flags);
2077                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
2078                         if (!(set_bad
2079                               && test_bit(In_sync, &rdev->flags)
2080                               && rdev_set_badblocks(
2081                                       rdev, sh->sector, STRIPE_SECTORS, 0)))
2082                                 md_error(conf->mddev, rdev);
2083                 }
2084         }
2085         rdev_dec_pending(rdev, conf->mddev);
2086         clear_bit(R5_LOCKED, &sh->dev[i].flags);
2087         set_bit(STRIPE_HANDLE, &sh->state);
2088         release_stripe(sh);
2089 }
2090
2091 static void raid5_end_write_request(struct bio *bi, int error)
2092 {
2093         struct stripe_head *sh = bi->bi_private;
2094         struct r5conf *conf = sh->raid_conf;
2095         int disks = sh->disks, i;
2096         struct md_rdev *uninitialized_var(rdev);
2097         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
2098         sector_t first_bad;
2099         int bad_sectors;
2100         int replacement = 0;
2101
2102         for (i = 0 ; i < disks; i++) {
2103                 if (bi == &sh->dev[i].req) {
2104                         rdev = conf->disks[i].rdev;
2105                         break;
2106                 }
2107                 if (bi == &sh->dev[i].rreq) {
2108                         rdev = conf->disks[i].replacement;
2109                         if (rdev)
2110                                 replacement = 1;
2111                         else
2112                                 /* rdev was removed and 'replacement'
2113                                  * replaced it.  rdev is not removed
2114                                  * until all requests are finished.
2115                                  */
2116                                 rdev = conf->disks[i].rdev;
2117                         break;
2118                 }
2119         }
2120         pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
2121                 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2122                 uptodate);
2123         if (i == disks) {
2124                 BUG();
2125                 return;
2126         }
2127
2128         if (replacement) {
2129                 if (!uptodate)
2130                         md_error(conf->mddev, rdev);
2131                 else if (is_badblock(rdev, sh->sector,
2132                                      STRIPE_SECTORS,
2133                                      &first_bad, &bad_sectors))
2134                         set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2135         } else {
2136                 if (!uptodate) {
2137                         set_bit(STRIPE_DEGRADED, &sh->state);
2138                         set_bit(WriteErrorSeen, &rdev->flags);
2139                         set_bit(R5_WriteError, &sh->dev[i].flags);
2140                         if (!test_and_set_bit(WantReplacement, &rdev->flags))
2141                                 set_bit(MD_RECOVERY_NEEDED,
2142                                         &rdev->mddev->recovery);
2143                 } else if (is_badblock(rdev, sh->sector,
2144                                        STRIPE_SECTORS,
2145                                        &first_bad, &bad_sectors)) {
2146                         set_bit(R5_MadeGood, &sh->dev[i].flags);
2147                         if (test_bit(R5_ReadError, &sh->dev[i].flags))
2148                                 /* That was a successful write so make
2149                                  * sure it looks like we already did
2150                                  * a re-write.
2151                                  */
2152                                 set_bit(R5_ReWrite, &sh->dev[i].flags);
2153                 }
2154         }
2155         rdev_dec_pending(rdev, conf->mddev);
2156
2157         if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2158                 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2159         set_bit(STRIPE_HANDLE, &sh->state);
2160         release_stripe(sh);
2161 }
2162
2163 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
2164
2165 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
2166 {
2167         struct r5dev *dev = &sh->dev[i];
2168
2169         bio_init(&dev->req);
2170         dev->req.bi_io_vec = &dev->vec;
2171         dev->req.bi_max_vecs = 1;
2172         dev->req.bi_private = sh;
2173
2174         bio_init(&dev->rreq);
2175         dev->rreq.bi_io_vec = &dev->rvec;
2176         dev->rreq.bi_max_vecs = 1;
2177         dev->rreq.bi_private = sh;
2178
2179         dev->flags = 0;
2180         dev->sector = compute_blocknr(sh, i, previous);
2181 }
2182
2183 static void error(struct mddev *mddev, struct md_rdev *rdev)
2184 {
2185         char b[BDEVNAME_SIZE];
2186         struct r5conf *conf = mddev->private;
2187         unsigned long flags;
2188         pr_debug("raid456: error called\n");
2189
2190         spin_lock_irqsave(&conf->device_lock, flags);
2191         clear_bit(In_sync, &rdev->flags);
2192         mddev->degraded = calc_degraded(conf);
2193         spin_unlock_irqrestore(&conf->device_lock, flags);
2194         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2195
2196         set_bit(Blocked, &rdev->flags);
2197         set_bit(Faulty, &rdev->flags);
2198         set_bit(MD_CHANGE_DEVS, &mddev->flags);
2199         printk(KERN_ALERT
2200                "md/raid:%s: Disk failure on %s, disabling device.\n"
2201                "md/raid:%s: Operation continuing on %d devices.\n",
2202                mdname(mddev),
2203                bdevname(rdev->bdev, b),
2204                mdname(mddev),
2205                conf->raid_disks - mddev->degraded);
2206 }
2207
2208 /*
2209  * Input: a 'big' sector number,
2210  * Output: index of the data and parity disk, and the sector # in them.
2211  */
2212 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2213                                      int previous, int *dd_idx,
2214                                      struct stripe_head *sh)
2215 {
2216         sector_t stripe, stripe2;
2217         sector_t chunk_number;
2218         unsigned int chunk_offset;
2219         int pd_idx, qd_idx;
2220         int ddf_layout = 0;
2221         sector_t new_sector;
2222         int algorithm = previous ? conf->prev_algo
2223                                  : conf->algorithm;
2224         int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2225                                          : conf->chunk_sectors;
2226         int raid_disks = previous ? conf->previous_raid_disks
2227                                   : conf->raid_disks;
2228         int data_disks = raid_disks - conf->max_degraded;
2229
2230         /* First compute the information on this sector */
2231
2232         /*
2233          * Compute the chunk number and the sector offset inside the chunk
2234          */
2235         chunk_offset = sector_div(r_sector, sectors_per_chunk);
2236         chunk_number = r_sector;
2237
2238         /*
2239          * Compute the stripe number
2240          */
2241         stripe = chunk_number;
2242         *dd_idx = sector_div(stripe, data_disks);
2243         stripe2 = stripe;
2244         /*
2245          * Select the parity disk based on the user selected algorithm.
2246          */
2247         pd_idx = qd_idx = -1;
2248         switch(conf->level) {
2249         case 4:
2250                 pd_idx = data_disks;
2251                 break;
2252         case 5:
2253                 switch (algorithm) {
2254                 case ALGORITHM_LEFT_ASYMMETRIC:
2255                         pd_idx = data_disks - sector_div(stripe2, raid_disks);
2256                         if (*dd_idx >= pd_idx)
2257                                 (*dd_idx)++;
2258                         break;
2259                 case ALGORITHM_RIGHT_ASYMMETRIC:
2260                         pd_idx = sector_div(stripe2, raid_disks);
2261                         if (*dd_idx >= pd_idx)
2262                                 (*dd_idx)++;
2263                         break;
2264                 case ALGORITHM_LEFT_SYMMETRIC:
2265                         pd_idx = data_disks - sector_div(stripe2, raid_disks);
2266                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2267                         break;
2268                 case ALGORITHM_RIGHT_SYMMETRIC:
2269                         pd_idx = sector_div(stripe2, raid_disks);
2270                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2271                         break;
2272                 case ALGORITHM_PARITY_0:
2273                         pd_idx = 0;
2274                         (*dd_idx)++;
2275                         break;
2276                 case ALGORITHM_PARITY_N:
2277                         pd_idx = data_disks;
2278                         break;
2279                 default:
2280                         BUG();
2281                 }
2282                 break;
2283         case 6:
2284
2285                 switch (algorithm) {
2286                 case ALGORITHM_LEFT_ASYMMETRIC:
2287                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2288                         qd_idx = pd_idx + 1;
2289                         if (pd_idx == raid_disks-1) {
2290                                 (*dd_idx)++;    /* Q D D D P */
2291                                 qd_idx = 0;
2292                         } else if (*dd_idx >= pd_idx)
2293                                 (*dd_idx) += 2; /* D D P Q D */
2294                         break;
2295                 case ALGORITHM_RIGHT_ASYMMETRIC:
2296                         pd_idx = sector_div(stripe2, raid_disks);
2297                         qd_idx = pd_idx + 1;
2298                         if (pd_idx == raid_disks-1) {
2299                                 (*dd_idx)++;    /* Q D D D P */
2300                                 qd_idx = 0;
2301                         } else if (*dd_idx >= pd_idx)
2302                                 (*dd_idx) += 2; /* D D P Q D */
2303                         break;
2304                 case ALGORITHM_LEFT_SYMMETRIC:
2305                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2306                         qd_idx = (pd_idx + 1) % raid_disks;
2307                         *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2308                         break;
2309                 case ALGORITHM_RIGHT_SYMMETRIC:
2310                         pd_idx = sector_div(stripe2, raid_disks);
2311                         qd_idx = (pd_idx + 1) % raid_disks;
2312                         *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2313                         break;
2314
2315                 case ALGORITHM_PARITY_0:
2316                         pd_idx = 0;
2317                         qd_idx = 1;
2318                         (*dd_idx) += 2;
2319                         break;
2320                 case ALGORITHM_PARITY_N:
2321                         pd_idx = data_disks;
2322                         qd_idx = data_disks + 1;
2323                         break;
2324
2325                 case ALGORITHM_ROTATING_ZERO_RESTART:
2326                         /* Exactly the same as RIGHT_ASYMMETRIC, but or
2327                          * of blocks for computing Q is different.
2328                          */
2329                         pd_idx = sector_div(stripe2, raid_disks);
2330                         qd_idx = pd_idx + 1;
2331                         if (pd_idx == raid_disks-1) {
2332                                 (*dd_idx)++;    /* Q D D D P */
2333                                 qd_idx = 0;
2334                         } else if (*dd_idx >= pd_idx)
2335                                 (*dd_idx) += 2; /* D D P Q D */
2336                         ddf_layout = 1;
2337                         break;
2338
2339                 case ALGORITHM_ROTATING_N_RESTART:
2340                         /* Same a left_asymmetric, by first stripe is
2341                          * D D D P Q  rather than
2342                          * Q D D D P
2343                          */
2344                         stripe2 += 1;
2345                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2346                         qd_idx = pd_idx + 1;
2347                         if (pd_idx == raid_disks-1) {
2348                                 (*dd_idx)++;    /* Q D D D P */
2349                                 qd_idx = 0;
2350                         } else if (*dd_idx >= pd_idx)
2351                                 (*dd_idx) += 2; /* D D P Q D */
2352                         ddf_layout = 1;
2353                         break;
2354
2355                 case ALGORITHM_ROTATING_N_CONTINUE:
2356                         /* Same as left_symmetric but Q is before P */
2357                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2358                         qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2359                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2360                         ddf_layout = 1;
2361                         break;
2362
2363                 case ALGORITHM_LEFT_ASYMMETRIC_6:
2364                         /* RAID5 left_asymmetric, with Q on last device */
2365                         pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2366                         if (*dd_idx >= pd_idx)
2367                                 (*dd_idx)++;
2368                         qd_idx = raid_disks - 1;
2369                         break;
2370
2371                 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2372                         pd_idx = sector_div(stripe2, raid_disks-1);
2373                         if (*dd_idx >= pd_idx)
2374                                 (*dd_idx)++;
2375                         qd_idx = raid_disks - 1;
2376                         break;
2377
2378                 case ALGORITHM_LEFT_SYMMETRIC_6:
2379                         pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2380                         *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2381                         qd_idx = raid_disks - 1;
2382                         break;
2383
2384                 case ALGORITHM_RIGHT_SYMMETRIC_6:
2385                         pd_idx = sector_div(stripe2, raid_disks-1);
2386                         *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2387                         qd_idx = raid_disks - 1;
2388                         break;
2389
2390                 case ALGORITHM_PARITY_0_6:
2391                         pd_idx = 0;
2392                         (*dd_idx)++;
2393                         qd_idx = raid_disks - 1;
2394                         break;
2395
2396                 default:
2397                         BUG();
2398                 }
2399                 break;
2400         }
2401
2402         if (sh) {
2403                 sh->pd_idx = pd_idx;
2404                 sh->qd_idx = qd_idx;
2405                 sh->ddf_layout = ddf_layout;
2406         }
2407         /*
2408          * Finally, compute the new sector number
2409          */
2410         new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2411         return new_sector;
2412 }
2413
2414 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
2415 {
2416         struct r5conf *conf = sh->raid_conf;
2417         int raid_disks = sh->disks;
2418         int data_disks = raid_disks - conf->max_degraded;
2419         sector_t new_sector = sh->sector, check;
2420         int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2421                                          : conf->chunk_sectors;
2422         int algorithm = previous ? conf->prev_algo
2423                                  : conf->algorithm;
2424         sector_t stripe;
2425         int chunk_offset;
2426         sector_t chunk_number;
2427         int dummy1, dd_idx = i;
2428         sector_t r_sector;
2429         struct stripe_head sh2;
2430
2431         chunk_offset = sector_div(new_sector, sectors_per_chunk);
2432         stripe = new_sector;
2433
2434         if (i == sh->pd_idx)
2435                 return 0;
2436         switch(conf->level) {
2437         case 4: break;
2438         case 5:
2439                 switch (algorithm) {
2440                 case ALGORITHM_LEFT_ASYMMETRIC:
2441                 case ALGORITHM_RIGHT_ASYMMETRIC:
2442                         if (i > sh->pd_idx)
2443                                 i--;
2444                         break;
2445                 case ALGORITHM_LEFT_SYMMETRIC:
2446                 case ALGORITHM_RIGHT_SYMMETRIC:
2447                         if (i < sh->pd_idx)
2448                                 i += raid_disks;
2449                         i -= (sh->pd_idx + 1);
2450                         break;
2451                 case ALGORITHM_PARITY_0:
2452                         i -= 1;
2453                         break;
2454                 case ALGORITHM_PARITY_N:
2455                         break;
2456                 default:
2457                         BUG();
2458                 }
2459                 break;
2460         case 6:
2461                 if (i == sh->qd_idx)
2462                         return 0; /* It is the Q disk */
2463                 switch (algorithm) {
2464                 case ALGORITHM_LEFT_ASYMMETRIC:
2465                 case ALGORITHM_RIGHT_ASYMMETRIC:
2466                 case ALGORITHM_ROTATING_ZERO_RESTART:
2467                 case ALGORITHM_ROTATING_N_RESTART:
2468                         if (sh->pd_idx == raid_disks-1)
2469                                 i--;    /* Q D D D P */
2470                         else if (i > sh->pd_idx)
2471                                 i -= 2; /* D D P Q D */
2472                         break;
2473                 case ALGORITHM_LEFT_SYMMETRIC:
2474                 case ALGORITHM_RIGHT_SYMMETRIC:
2475                         if (sh->pd_idx == raid_disks-1)
2476                                 i--; /* Q D D D P */
2477                         else {
2478                                 /* D D P Q D */
2479                                 if (i < sh->pd_idx)
2480                                         i += raid_disks;
2481                                 i -= (sh->pd_idx + 2);
2482                         }
2483                         break;
2484                 case ALGORITHM_PARITY_0:
2485                         i -= 2;
2486                         break;
2487                 case ALGORITHM_PARITY_N:
2488                         break;
2489                 case ALGORITHM_ROTATING_N_CONTINUE:
2490                         /* Like left_symmetric, but P is before Q */
2491                         if (sh->pd_idx == 0)
2492                                 i--;    /* P D D D Q */
2493                         else {
2494                                 /* D D Q P D */
2495                                 if (i < sh->pd_idx)
2496                                         i += raid_disks;
2497                                 i -= (sh->pd_idx + 1);
2498                         }
2499                         break;
2500                 case ALGORITHM_LEFT_ASYMMETRIC_6:
2501                 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2502                         if (i > sh->pd_idx)
2503                                 i--;
2504                         break;
2505                 case ALGORITHM_LEFT_SYMMETRIC_6:
2506                 case ALGORITHM_RIGHT_SYMMETRIC_6:
2507                         if (i < sh->pd_idx)
2508                                 i += data_disks + 1;
2509                         i -= (sh->pd_idx + 1);
2510                         break;
2511                 case ALGORITHM_PARITY_0_6:
2512                         i -= 1;
2513                         break;
2514                 default:
2515                         BUG();
2516                 }
2517                 break;
2518         }
2519
2520         chunk_number = stripe * data_disks + i;
2521         r_sector = chunk_number * sectors_per_chunk + chunk_offset;
2522
2523         check = raid5_compute_sector(conf, r_sector,
2524                                      previous, &dummy1, &sh2);
2525         if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
2526                 || sh2.qd_idx != sh->qd_idx) {
2527                 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
2528                        mdname(conf->mddev));
2529                 return 0;
2530         }
2531         return r_sector;
2532 }
2533
2534 static void
2535 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2536                          int rcw, int expand)
2537 {
2538         int i, pd_idx = sh->pd_idx, disks = sh->disks;
2539         struct r5conf *conf = sh->raid_conf;
2540         int level = conf->level;
2541
2542         if (rcw) {
2543
2544                 for (i = disks; i--; ) {
2545                         struct r5dev *dev = &sh->dev[i];
2546
2547                         if (dev->towrite) {
2548                                 set_bit(R5_LOCKED, &dev->flags);
2549                                 set_bit(R5_Wantdrain, &dev->flags);
2550                                 if (!expand)
2551                                         clear_bit(R5_UPTODATE, &dev->flags);
2552                                 s->locked++;
2553                         }
2554                 }
2555                 /* if we are not expanding this is a proper write request, and
2556                  * there will be bios with new data to be drained into the
2557                  * stripe cache
2558                  */
2559                 if (!expand) {
2560                         if (!s->locked)
2561                                 /* False alarm, nothing to do */
2562                                 return;
2563                         sh->reconstruct_state = reconstruct_state_drain_run;
2564                         set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2565                 } else
2566                         sh->reconstruct_state = reconstruct_state_run;
2567
2568                 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2569
2570                 if (s->locked + conf->max_degraded == disks)
2571                         if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2572                                 atomic_inc(&conf->pending_full_writes);
2573         } else {
2574                 BUG_ON(level == 6);
2575                 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2576                         test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2577
2578                 for (i = disks; i--; ) {
2579                         struct r5dev *dev = &sh->dev[i];
2580                         if (i == pd_idx)
2581                                 continue;
2582
2583                         if (dev->towrite &&
2584                             (test_bit(R5_UPTODATE, &dev->flags) ||
2585                              test_bit(R5_Wantcompute, &dev->flags))) {
2586                                 set_bit(R5_Wantdrain, &dev->flags);
2587                                 set_bit(R5_LOCKED, &dev->flags);
2588                                 clear_bit(R5_UPTODATE, &dev->flags);
2589                                 s->locked++;
2590                         }
2591                 }
2592                 if (!s->locked)
2593                         /* False alarm - nothing to do */
2594                         return;
2595                 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2596                 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2597                 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2598                 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2599         }
2600
2601         /* keep the parity disk(s) locked while asynchronous operations
2602          * are in flight
2603          */
2604         set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2605         clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2606         s->locked++;
2607
2608         if (level == 6) {
2609                 int qd_idx = sh->qd_idx;
2610                 struct r5dev *dev = &sh->dev[qd_idx];
2611
2612                 set_bit(R5_LOCKED, &dev->flags);
2613                 clear_bit(R5_UPTODATE, &dev->flags);
2614                 s->locked++;
2615         }
2616
2617         pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2618                 __func__, (unsigned long long)sh->sector,
2619                 s->locked, s->ops_request);
2620 }
2621
2622 /*
2623  * Each stripe/dev can have one or more bion attached.
2624  * toread/towrite point to the first in a chain.
2625  * The bi_next chain must be in order.
2626  */
2627 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
2628 {
2629         struct bio **bip;
2630         struct r5conf *conf = sh->raid_conf;
2631         int firstwrite=0;
2632
2633         pr_debug("adding bi b#%llu to stripe s#%llu\n",
2634                 (unsigned long long)bi->bi_iter.bi_sector,
2635                 (unsigned long long)sh->sector);
2636
2637         /*
2638          * If several bio share a stripe. The bio bi_phys_segments acts as a
2639          * reference count to avoid race. The reference count should already be
2640          * increased before this function is called (for example, in
2641          * make_request()), so other bio sharing this stripe will not free the
2642          * stripe. If a stripe is owned by one stripe, the stripe lock will
2643          * protect it.
2644          */
2645         spin_lock_irq(&sh->stripe_lock);
2646         if (forwrite) {
2647                 bip = &sh->dev[dd_idx].towrite;
2648                 if (*bip == NULL)
2649                         firstwrite = 1;
2650         } else
2651                 bip = &sh->dev[dd_idx].toread;
2652         while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
2653                 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
2654                         goto overlap;
2655                 bip = & (*bip)->bi_next;
2656         }
2657         if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
2658                 goto overlap;
2659
2660         BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2661         if (*bip)
2662                 bi->bi_next = *bip;
2663         *bip = bi;
2664         raid5_inc_bi_active_stripes(bi);
2665
2666         if (forwrite) {
2667                 /* check if page is covered */
2668                 sector_t sector = sh->dev[dd_idx].sector;
2669                 for (bi=sh->dev[dd_idx].towrite;
2670                      sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2671                              bi && bi->bi_iter.bi_sector <= sector;
2672                      bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2673                         if (bio_end_sector(bi) >= sector)
2674                                 sector = bio_end_sector(bi);
2675                 }
2676                 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2677                         set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2678         }
2679
2680         pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2681                 (unsigned long long)(*bip)->bi_iter.bi_sector,
2682                 (unsigned long long)sh->sector, dd_idx);
2683         spin_unlock_irq(&sh->stripe_lock);
2684
2685         if (conf->mddev->bitmap && firstwrite) {
2686                 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2687                                   STRIPE_SECTORS, 0);
2688                 sh->bm_seq = conf->seq_flush+1;
2689                 set_bit(STRIPE_BIT_DELAY, &sh->state);
2690         }
2691         return 1;
2692
2693  overlap:
2694         set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2695         spin_unlock_irq(&sh->stripe_lock);
2696         return 0;
2697 }
2698
2699 static void end_reshape(struct r5conf *conf);
2700
2701 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
2702                             struct stripe_head *sh)
2703 {
2704         int sectors_per_chunk =
2705                 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2706         int dd_idx;
2707         int chunk_offset = sector_div(stripe, sectors_per_chunk);
2708         int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
2709
2710         raid5_compute_sector(conf,
2711                              stripe * (disks - conf->max_degraded)
2712                              *sectors_per_chunk + chunk_offset,
2713                              previous,
2714                              &dd_idx, sh);
2715 }
2716
2717 static void
2718 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2719                                 struct stripe_head_state *s, int disks,
2720                                 struct bio **return_bi)
2721 {
2722         int i;
2723         for (i = disks; i--; ) {
2724                 struct bio *bi;
2725                 int bitmap_end = 0;
2726
2727                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2728                         struct md_rdev *rdev;
2729                         rcu_read_lock();
2730                         rdev = rcu_dereference(conf->disks[i].rdev);
2731                         if (rdev && test_bit(In_sync, &rdev->flags))
2732                                 atomic_inc(&rdev->nr_pending);
2733                         else
2734                                 rdev = NULL;
2735                         rcu_read_unlock();
2736                         if (rdev) {
2737                                 if (!rdev_set_badblocks(
2738                                             rdev,
2739                                             sh->sector,
2740                                             STRIPE_SECTORS, 0))
2741                                         md_error(conf->mddev, rdev);
2742                                 rdev_dec_pending(rdev, conf->mddev);
2743                         }
2744                 }
2745                 spin_lock_irq(&sh->stripe_lock);
2746                 /* fail all writes first */
2747                 bi = sh->dev[i].towrite;
2748                 sh->dev[i].towrite = NULL;
2749                 spin_unlock_irq(&sh->stripe_lock);
2750                 if (bi)
2751                         bitmap_end = 1;
2752
2753                 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2754                         wake_up(&conf->wait_for_overlap);
2755
2756                 while (bi && bi->bi_iter.bi_sector <
2757                         sh->dev[i].sector + STRIPE_SECTORS) {
2758                         struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2759                         clear_bit(BIO_UPTODATE, &bi->bi_flags);
2760                         if (!raid5_dec_bi_active_stripes(bi)) {
2761                                 md_write_end(conf->mddev);
2762                                 bi->bi_next = *return_bi;
2763                                 *return_bi = bi;
2764                         }
2765                         bi = nextbi;
2766                 }
2767                 if (bitmap_end)
2768                         bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2769                                 STRIPE_SECTORS, 0, 0);
2770                 bitmap_end = 0;
2771                 /* and fail all 'written' */
2772                 bi = sh->dev[i].written;
2773                 sh->dev[i].written = NULL;
2774                 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
2775                         WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
2776                         sh->dev[i].page = sh->dev[i].orig_page;
2777                 }
2778
2779                 if (bi) bitmap_end = 1;
2780                 while (bi && bi->bi_iter.bi_sector <
2781                        sh->dev[i].sector + STRIPE_SECTORS) {
2782                         struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2783                         clear_bit(BIO_UPTODATE, &bi->bi_flags);
2784                         if (!raid5_dec_bi_active_stripes(bi)) {
2785                                 md_write_end(conf->mddev);
2786                                 bi->bi_next = *return_bi;
2787                                 *return_bi = bi;
2788                         }
2789                         bi = bi2;
2790                 }
2791
2792                 /* fail any reads if this device is non-operational and
2793                  * the data has not reached the cache yet.
2794                  */
2795                 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2796                     (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2797                       test_bit(R5_ReadError, &sh->dev[i].flags))) {
2798                         spin_lock_irq(&sh->stripe_lock);
2799                         bi = sh->dev[i].toread;
2800                         sh->dev[i].toread = NULL;
2801                         spin_unlock_irq(&sh->stripe_lock);
2802                         if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2803                                 wake_up(&conf->wait_for_overlap);
2804                         while (bi && bi->bi_iter.bi_sector <
2805                                sh->dev[i].sector + STRIPE_SECTORS) {
2806                                 struct bio *nextbi =
2807                                         r5_next_bio(bi, sh->dev[i].sector);
2808                                 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2809                                 if (!raid5_dec_bi_active_stripes(bi)) {
2810                                         bi->bi_next = *return_bi;
2811                                         *return_bi = bi;
2812                                 }
2813                                 bi = nextbi;
2814                         }
2815                 }
2816                 if (bitmap_end)
2817                         bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2818                                         STRIPE_SECTORS, 0, 0);
2819                 /* If we were in the middle of a write the parity block might
2820                  * still be locked - so just clear all R5_LOCKED flags
2821                  */
2822                 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2823         }
2824
2825         if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2826                 if (atomic_dec_and_test(&conf->pending_full_writes))
2827                         md_wakeup_thread(conf->mddev->thread);
2828 }
2829
2830 static void
2831 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2832                    struct stripe_head_state *s)
2833 {
2834         int abort = 0;
2835         int i;
2836
2837         clear_bit(STRIPE_SYNCING, &sh->state);
2838         if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
2839                 wake_up(&conf->wait_for_overlap);
2840         s->syncing = 0;
2841         s->replacing = 0;
2842         /* There is nothing more to do for sync/check/repair.
2843          * Don't even need to abort as that is handled elsewhere
2844          * if needed, and not always wanted e.g. if there is a known
2845          * bad block here.
2846          * For recover/replace we need to record a bad block on all
2847          * non-sync devices, or abort the recovery
2848          */
2849         if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
2850                 /* During recovery devices cannot be removed, so
2851                  * locking and refcounting of rdevs is not needed
2852                  */
2853                 for (i = 0; i < conf->raid_disks; i++) {
2854                         struct md_rdev *rdev = conf->disks[i].rdev;
2855                         if (rdev
2856                             && !test_bit(Faulty, &rdev->flags)
2857                             && !test_bit(In_sync, &rdev->flags)
2858                             && !rdev_set_badblocks(rdev, sh->sector,
2859                                                    STRIPE_SECTORS, 0))
2860                                 abort = 1;
2861                         rdev = conf->disks[i].replacement;
2862                         if (rdev
2863                             && !test_bit(Faulty, &rdev->flags)
2864                             && !test_bit(In_sync, &rdev->flags)
2865                             && !rdev_set_badblocks(rdev, sh->sector,
2866                                                    STRIPE_SECTORS, 0))
2867                                 abort = 1;
2868                 }
2869                 if (abort)
2870                         conf->recovery_disabled =
2871                                 conf->mddev->recovery_disabled;
2872         }
2873         md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
2874 }
2875
2876 static int want_replace(struct stripe_head *sh, int disk_idx)
2877 {
2878         struct md_rdev *rdev;
2879         int rv = 0;
2880         /* Doing recovery so rcu locking not required */
2881         rdev = sh->raid_conf->disks[disk_idx].replacement;
2882         if (rdev
2883             && !test_bit(Faulty, &rdev->flags)
2884             && !test_bit(In_sync, &rdev->flags)
2885             && (rdev->recovery_offset <= sh->sector
2886                 || rdev->mddev->recovery_cp <= sh->sector))
2887                 rv = 1;
2888
2889         return rv;
2890 }
2891
2892 /* fetch_block - checks the given member device to see if its data needs
2893  * to be read or computed to satisfy a request.
2894  *
2895  * Returns 1 when no more member devices need to be checked, otherwise returns
2896  * 0 to tell the loop in handle_stripe_fill to continue
2897  */
2898
2899 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
2900                            int disk_idx, int disks)
2901 {
2902         struct r5dev *dev = &sh->dev[disk_idx];
2903         struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
2904                                   &sh->dev[s->failed_num[1]] };
2905         int i;
2906
2907
2908         if (test_bit(R5_LOCKED, &dev->flags) ||
2909             test_bit(R5_UPTODATE, &dev->flags))
2910                 /* No point reading this as we already have it or have
2911                  * decided to get it.
2912                  */
2913                 return 0;
2914
2915         if (dev->toread ||
2916             (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
2917                 /* We need this block to directly satisfy a request */
2918                 return 1;
2919
2920         if (s->syncing || s->expanding ||
2921             (s->replacing && want_replace(sh, disk_idx)))
2922                 /* When syncing, or expanding we read everything.
2923                  * When replacing, we need the replaced block.
2924                  */
2925                 return 1;
2926
2927         if ((s->failed >= 1 && fdev[0]->toread) ||
2928             (s->failed >= 2 && fdev[1]->toread))
2929                 /* If we want to read from a failed device, then
2930                  * we need to actually read every other device.
2931                  */
2932                 return 1;
2933
2934         /* Sometimes neither read-modify-write nor reconstruct-write
2935          * cycles can work.  In those cases we read every block we
2936          * can.  Then the parity-update is certain to have enough to
2937          * work with.
2938          * This can only be a problem when we need to write something,
2939          * and some device has failed.  If either of those tests
2940          * fail we need look no further.
2941          */
2942         if (!s->failed || !s->to_write)
2943                 return 0;
2944
2945         if (test_bit(R5_Insync, &dev->flags) &&
2946             !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
2947                 /* Pre-reads at not permitted until after short delay
2948                  * to gather multiple requests.  However if this
2949                  * device is no Insync, the block could only be be computed
2950                  * and there is no need to delay that.
2951                  */
2952                 return 0;
2953
2954         for (i = 0; i < s->failed; i++) {
2955                 if (fdev[i]->towrite &&
2956                     !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
2957                     !test_bit(R5_OVERWRITE, &fdev[i]->flags))
2958                         /* If we have a partial write to a failed
2959                          * device, then we will need to reconstruct
2960                          * the content of that device, so all other
2961                          * devices must be read.
2962                          */
2963                         return 1;
2964         }
2965
2966         /* If we are forced to do a reconstruct-write, either because
2967          * the current RAID6 implementation only supports that, or
2968          * or because parity cannot be trusted and we are currently
2969          * recovering it, there is extra need to be careful.
2970          * If one of the devices that we would need to read, because
2971          * it is not being overwritten (and maybe not written at all)
2972          * is missing/faulty, then we need to read everything we can.
2973          */
2974         if (sh->raid_conf->level != 6 &&
2975             sh->sector < sh->raid_conf->mddev->recovery_cp)
2976                 /* reconstruct-write isn't being forced */
2977                 return 0;
2978         for (i = 0; i < s->failed; i++) {
2979                 if (!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
2980                     !test_bit(R5_OVERWRITE, &fdev[i]->flags))
2981                         return 1;
2982         }
2983
2984         return 0;
2985 }
2986
2987 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2988                        int disk_idx, int disks)
2989 {
2990         struct r5dev *dev = &sh->dev[disk_idx];
2991
2992         /* is the data in this block needed, and can we get it? */
2993         if (need_this_block(sh, s, disk_idx, disks)) {
2994                 /* we would like to get this block, possibly by computing it,
2995                  * otherwise read it if the backing disk is insync
2996                  */
2997                 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2998                 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2999                 if ((s->uptodate == disks - 1) &&
3000                     (s->failed && (disk_idx == s->failed_num[0] ||
3001                                    disk_idx == s->failed_num[1]))) {
3002                         /* have disk failed, and we're requested to fetch it;
3003                          * do compute it
3004                          */
3005                         pr_debug("Computing stripe %llu block %d\n",
3006                                (unsigned long long)sh->sector, disk_idx);
3007                         set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3008                         set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3009                         set_bit(R5_Wantcompute, &dev->flags);
3010                         sh->ops.target = disk_idx;
3011                         sh->ops.target2 = -1; /* no 2nd target */
3012                         s->req_compute = 1;
3013                         /* Careful: from this point on 'uptodate' is in the eye
3014                          * of raid_run_ops which services 'compute' operations
3015                          * before writes. R5_Wantcompute flags a block that will
3016                          * be R5_UPTODATE by the time it is needed for a
3017                          * subsequent operation.
3018                          */
3019                         s->uptodate++;
3020                         return 1;
3021                 } else if (s->uptodate == disks-2 && s->failed >= 2) {
3022                         /* Computing 2-failure is *very* expensive; only
3023                          * do it if failed >= 2
3024                          */
3025                         int other;
3026                         for (other = disks; other--; ) {
3027                                 if (other == disk_idx)
3028                                         continue;
3029                                 if (!test_bit(R5_UPTODATE,
3030                                       &sh->dev[other].flags))
3031                                         break;
3032                         }
3033                         BUG_ON(other < 0);
3034                         pr_debug("Computing stripe %llu blocks %d,%d\n",
3035                                (unsigned long long)sh->sector,
3036                                disk_idx, other);
3037                         set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3038                         set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3039                         set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3040                         set_bit(R5_Wantcompute, &sh->dev[other].flags);
3041                         sh->ops.target = disk_idx;
3042                         sh->ops.target2 = other;
3043                         s->uptodate += 2;
3044                         s->req_compute = 1;
3045                         return 1;
3046                 } else if (test_bit(R5_Insync, &dev->flags)) {
3047                         set_bit(R5_LOCKED, &dev->flags);
3048                         set_bit(R5_Wantread, &dev->flags);
3049                         s->locked++;
3050                         pr_debug("Reading block %d (sync=%d)\n",
3051                                 disk_idx, s->syncing);
3052                 }
3053         }
3054
3055         return 0;
3056 }
3057
3058 /**
3059  * handle_stripe_fill - read or compute data to satisfy pending requests.
3060  */
3061 static void handle_stripe_fill(struct stripe_head *sh,
3062                                struct stripe_head_state *s,
3063                                int disks)
3064 {
3065         int i;
3066
3067         /* look for blocks to read/compute, skip this if a compute
3068          * is already in flight, or if the stripe contents are in the
3069          * midst of changing due to a write
3070          */
3071         if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3072             !sh->reconstruct_state)
3073                 for (i = disks; i--; )
3074                         if (fetch_block(sh, s, i, disks))
3075                                 break;
3076         set_bit(STRIPE_HANDLE, &sh->state);
3077 }
3078
3079 /* handle_stripe_clean_event
3080  * any written block on an uptodate or failed drive can be returned.
3081  * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
3082  * never LOCKED, so we don't need to test 'failed' directly.
3083  */
3084 static void handle_stripe_clean_event(struct r5conf *conf,
3085         struct stripe_head *sh, int disks, struct bio **return_bi)
3086 {
3087         int i;
3088         struct r5dev *dev;
3089         int discard_pending = 0;
3090
3091         for (i = disks; i--; )
3092                 if (sh->dev[i].written) {
3093                         dev = &sh->dev[i];
3094                         if (!test_bit(R5_LOCKED, &dev->flags) &&
3095                             (test_bit(R5_UPTODATE, &dev->flags) ||
3096                              test_bit(R5_Discard, &dev->flags) ||
3097                              test_bit(R5_SkipCopy, &dev->flags))) {
3098                                 /* We can return any write requests */
3099                                 struct bio *wbi, *wbi2;
3100                                 pr_debug("Return write for disc %d\n", i);
3101                                 if (test_and_clear_bit(R5_Discard, &dev->flags))
3102                                         clear_bit(R5_UPTODATE, &dev->flags);
3103                                 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3104                                         WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3105                                         dev->page = dev->orig_page;
3106                                 }
3107                                 wbi = dev->written;
3108                                 dev->written = NULL;
3109                                 while (wbi && wbi->bi_iter.bi_sector <
3110                                         dev->sector + STRIPE_SECTORS) {
3111                                         wbi2 = r5_next_bio(wbi, dev->sector);
3112                                         if (!raid5_dec_bi_active_stripes(wbi)) {
3113                                                 md_write_end(conf->mddev);
3114                                                 wbi->bi_next = *return_bi;
3115                                                 *return_bi = wbi;
3116                                         }
3117                                         wbi = wbi2;
3118                                 }
3119                                 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3120                                                 STRIPE_SECTORS,
3121                                          !test_bit(STRIPE_DEGRADED, &sh->state),
3122                                                 0);
3123                         } else if (test_bit(R5_Discard, &dev->flags))
3124                                 discard_pending = 1;
3125                         WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
3126                         WARN_ON(dev->page != dev->orig_page);
3127                 }
3128         if (!discard_pending &&
3129             test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
3130                 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
3131                 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
3132                 if (sh->qd_idx >= 0) {
3133                         clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
3134                         clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
3135                 }
3136                 /* now that discard is done we can proceed with any sync */
3137                 clear_bit(STRIPE_DISCARD, &sh->state);
3138                 /*
3139                  * SCSI discard will change some bio fields and the stripe has
3140                  * no updated data, so remove it from hash list and the stripe
3141                  * will be reinitialized
3142                  */
3143                 spin_lock_irq(&conf->device_lock);
3144                 remove_hash(sh);
3145                 spin_unlock_irq(&conf->device_lock);
3146                 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
3147                         set_bit(STRIPE_HANDLE, &sh->state);
3148
3149         }
3150
3151         if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3152                 if (atomic_dec_and_test(&conf->pending_full_writes))
3153                         md_wakeup_thread(conf->mddev->thread);
3154 }
3155
3156 static void handle_stripe_dirtying(struct r5conf *conf,
3157                                    struct stripe_head *sh,
3158                                    struct stripe_head_state *s,
3159                                    int disks)
3160 {
3161         int rmw = 0, rcw = 0, i;
3162         sector_t recovery_cp = conf->mddev->recovery_cp;
3163
3164         /* RAID6 requires 'rcw' in current implementation.
3165          * Otherwise, check whether resync is now happening or should start.
3166          * If yes, then the array is dirty (after unclean shutdown or
3167          * initial creation), so parity in some stripes might be inconsistent.
3168          * In this case, we need to always do reconstruct-write, to ensure
3169          * that in case of drive failure or read-error correction, we
3170          * generate correct data from the parity.
3171          */
3172         if (conf->max_degraded == 2 ||
3173             (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
3174              s->failed == 0)) {
3175                 /* Calculate the real rcw later - for now make it
3176                  * look like rcw is cheaper
3177                  */
3178                 rcw = 1; rmw = 2;
3179                 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
3180                          conf->max_degraded, (unsigned long long)recovery_cp,
3181                          (unsigned long long)sh->sector);
3182         } else for (i = disks; i--; ) {
3183                 /* would I have to read this buffer for read_modify_write */
3184                 struct r5dev *dev = &sh->dev[i];
3185                 if ((dev->towrite || i == sh->pd_idx) &&
3186                     !test_bit(R5_LOCKED, &dev->flags) &&
3187                     !(test_bit(R5_UPTODATE, &dev->flags) ||
3188                       test_bit(R5_Wantcompute, &dev->flags))) {
3189                         if (test_bit(R5_Insync, &dev->flags))
3190                                 rmw++;
3191                         else
3192                                 rmw += 2*disks;  /* cannot read it */
3193                 }
3194                 /* Would I have to read this buffer for reconstruct_write */
3195                 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
3196                     !test_bit(R5_LOCKED, &dev->flags) &&
3197                     !(test_bit(R5_UPTODATE, &dev->flags) ||
3198                     test_bit(R5_Wantcompute, &dev->flags))) {
3199                         if (test_bit(R5_Insync, &dev->flags))
3200                                 rcw++;
3201                         else
3202                                 rcw += 2*disks;
3203                 }
3204         }
3205         pr_debug("for sector %llu, rmw=%d rcw=%d\n",
3206                 (unsigned long long)sh->sector, rmw, rcw);
3207         set_bit(STRIPE_HANDLE, &sh->state);
3208         if (rmw < rcw && rmw > 0) {
3209                 /* prefer read-modify-write, but need to get some data */
3210                 if (conf->mddev->queue)
3211                         blk_add_trace_msg(conf->mddev->queue,
3212                                           "raid5 rmw %llu %d",
3213                                           (unsigned long long)sh->sector, rmw);
3214                 for (i = disks; i--; ) {
3215                         struct r5dev *dev = &sh->dev[i];
3216                         if ((dev->towrite || i == sh->pd_idx) &&
3217                             !test_bit(R5_LOCKED, &dev->flags) &&
3218                             !(test_bit(R5_UPTODATE, &dev->flags) ||
3219                             test_bit(R5_Wantcompute, &dev->flags)) &&
3220                             test_bit(R5_Insync, &dev->flags)) {
3221                                 if (test_bit(STRIPE_PREREAD_ACTIVE,
3222                                              &sh->state)) {
3223                                         pr_debug("Read_old block %d for r-m-w\n",
3224                                                  i);
3225                                         set_bit(R5_LOCKED, &dev->flags);
3226                                         set_bit(R5_Wantread, &dev->flags);
3227                                         s->locked++;
3228                                 } else {
3229                                         set_bit(STRIPE_DELAYED, &sh->state);
3230                                         set_bit(STRIPE_HANDLE, &sh->state);
3231                                 }
3232                         }
3233                 }
3234         }
3235         if (rcw <= rmw && rcw > 0) {
3236                 /* want reconstruct write, but need to get some data */
3237                 int qread =0;
3238                 rcw = 0;
3239                 for (i = disks; i--; ) {
3240                         struct r5dev *dev = &sh->dev[i];
3241                         if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3242                             i != sh->pd_idx && i != sh->qd_idx &&
3243                             !test_bit(R5_LOCKED, &dev->flags) &&
3244                             !(test_bit(R5_UPTODATE, &dev->flags) ||
3245                               test_bit(R5_Wantcompute, &dev->flags))) {
3246                                 rcw++;
3247                                 if (test_bit(R5_Insync, &dev->flags) &&
3248                                     test_bit(STRIPE_PREREAD_ACTIVE,
3249                                              &sh->state)) {
3250                                         pr_debug("Read_old block "
3251                                                 "%d for Reconstruct\n", i);
3252                                         set_bit(R5_LOCKED, &dev->flags);
3253                                         set_bit(R5_Wantread, &dev->flags);
3254                                         s->locked++;
3255  &nbs