drivers/md/md.c
1 /*
2    md.c : Multiple Devices driver for Linux
3      Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5      completely rewritten, based on the MD driver code from Marc Zyngier
6
7    Changes:
8
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19
20      Neil Brown <neilb@cse.unsw.edu.au>.
21
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33
34    Errors, Warnings, etc.
35    Please use:
36      pr_crit() for error conditions that risk data loss
37      pr_err() for error conditions that are unexpected, like an IO error
38          or internal inconsistency
39    pr_warn() for error conditions that could have been predicted, like
40          adding a device to an array when it has incompatible metadata
41    pr_info() for interesting, very rare events, like an array starting
42          or stopping, or resync starting or stopping
43      pr_debug() for everything else.
44
45 */
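/*
 * For illustration only (not part of the original header): a message that
 * follows the convention above could look roughly like this, assuming the
 * usual mdname() helper for the array name:
 *
 *	pr_warn("md: %s: device has incompatible metadata, not adding\n",
 *		mdname(mddev));
 */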
46
47 #include <linux/sched/signal.h>
48 #include <linux/kthread.h>
49 #include <linux/blkdev.h>
50 #include <linux/badblocks.h>
51 #include <linux/sysctl.h>
52 #include <linux/seq_file.h>
53 #include <linux/fs.h>
54 #include <linux/poll.h>
55 #include <linux/ctype.h>
56 #include <linux/string.h>
57 #include <linux/hdreg.h>
58 #include <linux/proc_fs.h>
59 #include <linux/random.h>
60 #include <linux/module.h>
61 #include <linux/reboot.h>
62 #include <linux/file.h>
63 #include <linux/compat.h>
64 #include <linux/delay.h>
65 #include <linux/raid/md_p.h>
66 #include <linux/raid/md_u.h>
67 #include <linux/slab.h>
68 #include <linux/percpu-refcount.h>
69
70 #include <trace/events/block.h>
71 #include "md.h"
72 #include "md-bitmap.h"
73 #include "md-cluster.h"
74
75 #ifndef MODULE
76 static void autostart_arrays(int part);
77 #endif
78
79 /* pers_list is a list of registered personalities protected
80  * by pers_lock.
81  * pers_lock also protects accesses to
82  * mddev->thread when the reconfig mutex cannot be held.
83  */
84 static LIST_HEAD(pers_list);
85 static DEFINE_SPINLOCK(pers_lock);
86
87 static struct kobj_type md_ktype;
88
89 struct md_cluster_operations *md_cluster_ops;
90 EXPORT_SYMBOL(md_cluster_ops);
91 struct module *md_cluster_mod;
92 EXPORT_SYMBOL(md_cluster_mod);
93
94 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
95 static struct workqueue_struct *md_wq;
96 static struct workqueue_struct *md_misc_wq;
97
98 static int remove_and_add_spares(struct mddev *mddev,
99                                  struct md_rdev *this);
100 static void mddev_detach(struct mddev *mddev);
101
102 /*
103  * Default number of read corrections we'll attempt on an rdev
104  * before ejecting it from the array. We divide the read error
105  * count by 2 for every hour elapsed between read errors.
106  */
107 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
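/*
 * Worked example of the decay rule described above: with the default limit
 * of 20, a counter that has reached 20 is halved to 10 if an hour passes
 * before the next read error, to 5 after two error-free hours, and so on;
 * only a device whose errors arrive faster than the counter can decay will
 * eventually be ejected.
 */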
108 /*
109  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
110  * is 1000 KB/sec, so the extra system load does not show up that much.
111  * Increase it if you want to have more _guaranteed_ speed. Note that
112  * the RAID driver will use the maximum available bandwidth if the IO
113  * subsystem is idle. There is also an 'absolute maximum' reconstruction
114  * speed limit - in case reconstruction slows down your system despite
115  * idle IO detection.
116  *
117  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
118  * or /sys/block/mdX/md/sync_speed_{min,max}
119  */
120
121 static int sysctl_speed_limit_min = 1000;
122 static int sysctl_speed_limit_max = 200000;
123 static inline int speed_min(struct mddev *mddev)
124 {
125         return mddev->sync_speed_min ?
126                 mddev->sync_speed_min : sysctl_speed_limit_min;
127 }
128
129 static inline int speed_max(struct mddev *mddev)
130 {
131         return mddev->sync_speed_max ?
132                 mddev->sync_speed_max : sysctl_speed_limit_max;
133 }
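/*
 * Example of tuning the limits described above (illustrative values only).
 * A non-zero per-array value takes precedence over the global sysctl, as
 * speed_min()/speed_max() show:
 *
 *	echo  50000 > /proc/sys/dev/raid/speed_limit_min
 *	echo 500000 > /sys/block/md0/md/sync_speed_max
 */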
134
135 static void * flush_info_alloc(gfp_t gfp_flags, void *data)
136 {
137         return kzalloc(sizeof(struct flush_info), gfp_flags);
138 }
139 static void flush_info_free(void *flush_info, void *data)
140 {
141         kfree(flush_info);
142 }
143
144 static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
145 {
146         return kzalloc(sizeof(struct flush_bio), gfp_flags);
147 }
148 static void flush_bio_free(void *flush_bio, void *data)
149 {
150         kfree(flush_bio);
151 }
152
153 static struct ctl_table_header *raid_table_header;
154
155 static struct ctl_table raid_table[] = {
156         {
157                 .procname       = "speed_limit_min",
158                 .data           = &sysctl_speed_limit_min,
159                 .maxlen         = sizeof(int),
160                 .mode           = S_IRUGO|S_IWUSR,
161                 .proc_handler   = proc_dointvec,
162         },
163         {
164                 .procname       = "speed_limit_max",
165                 .data           = &sysctl_speed_limit_max,
166                 .maxlen         = sizeof(int),
167                 .mode           = S_IRUGO|S_IWUSR,
168                 .proc_handler   = proc_dointvec,
169         },
170         { }
171 };
172
173 static struct ctl_table raid_dir_table[] = {
174         {
175                 .procname       = "raid",
176                 .maxlen         = 0,
177                 .mode           = S_IRUGO|S_IXUGO,
178                 .child          = raid_table,
179         },
180         { }
181 };
182
183 static struct ctl_table raid_root_table[] = {
184         {
185                 .procname       = "dev",
186                 .maxlen         = 0,
187                 .mode           = 0555,
188                 .child          = raid_dir_table,
189         },
190         {  }
191 };
192
193 static const struct block_device_operations md_fops;
194
195 static int start_readonly;
196
197 /*
198  * The original mechanism for creating an md device is to create
199  * a device node in /dev and to open it.  This causes races with device-close.
200  * The preferred method is to write to the "new_array" module parameter.
201  * This can avoid races.
202  * Setting create_on_open to false disables the original mechanism
203  * so all the races disappear.
204  */
205 static bool create_on_open = true;
206
207 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
208                             struct mddev *mddev)
209 {
210         struct bio *b;
211
212         if (!mddev || !bioset_initialized(&mddev->bio_set))
213                 return bio_alloc(gfp_mask, nr_iovecs);
214
215         b = bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
216         if (!b)
217                 return NULL;
218         return b;
219 }
220 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
221
222 static struct bio *md_bio_alloc_sync(struct mddev *mddev)
223 {
224         if (!mddev || !bioset_initialized(&mddev->sync_set))
225                 return bio_alloc(GFP_NOIO, 1);
226
227         return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
228 }
229
230 /*
231  * We have a system wide 'event count' that is incremented
232  * on any 'interesting' event, and readers of /proc/mdstat
233  * can use 'poll' or 'select' to find out when the event
234  * count increases.
235  *
236  * Events are:
237  *  start array, stop array, error, add device, remove device,
238  *  start build, activate spare
239  */
240 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
241 static atomic_t md_event_count;
242 void md_new_event(struct mddev *mddev)
243 {
244         atomic_inc(&md_event_count);
245         wake_up(&md_event_waiters);
246 }
247 EXPORT_SYMBOL_GPL(md_new_event);
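/*
 * Userspace picks these events up by polling /proc/mdstat, as described
 * above.  A minimal, hypothetical monitor (userspace C, error handling
 * omitted) might look roughly like this; the mdstat poll method reports
 * POLLPRI once the event count moves past what the reader has already seen:
 *
 *	int fd = open("/proc/mdstat", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *	char buf[4096];
 *
 *	for (;;) {
 *		lseek(fd, 0, SEEK_SET);
 *		read(fd, buf, sizeof(buf));
 *		poll(&pfd, 1, -1);
 *	}
 */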
248
249 /*
250  * Allows iteration over all existing md arrays.
251  * all_mddevs_lock protects this list.
252  */
253 static LIST_HEAD(all_mddevs);
254 static DEFINE_SPINLOCK(all_mddevs_lock);
255
256 /*
257  * iterates through all used mddevs in the system.
258  * We take care to grab the all_mddevs_lock whenever navigating
259  * the list, and to always hold a refcount when unlocked.
260  * Any code which breaks out of this loop while owning
261  * a reference to the current mddev must mddev_put it.
262  */
263 #define for_each_mddev(_mddev,_tmp)                                     \
264                                                                         \
265         for (({ spin_lock(&all_mddevs_lock);                            \
266                 _tmp = all_mddevs.next;                                 \
267                 _mddev = NULL;});                                       \
268              ({ if (_tmp != &all_mddevs)                                \
269                         mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
270                 spin_unlock(&all_mddevs_lock);                          \
271                 if (_mddev) mddev_put(_mddev);                          \
272                 _mddev = list_entry(_tmp, struct mddev, all_mddevs);    \
273                 _tmp != &all_mddevs;});                                 \
274              ({ spin_lock(&all_mddevs_lock);                            \
275                 _tmp = _tmp->next;})                                    \
276                 )
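/*
 * A typical use of for_each_mddev() looks like the sketch below; the macro
 * handles the locking and reference counting described above, the caller
 * only supplies the two cursor variables:
 *
 *	struct mddev *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp) {
 *		...
 *	}
 *
 * Breaking out of the loop early leaves a reference held on the current
 * mddev which must be dropped with mddev_put().
 */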
277
278 /* Rather than calling directly into the personality make_request function,
279  * IO requests come here first so that we can check if the device is
280  * being suspended pending a reconfiguration.
281  * We hold a refcount over the call to ->make_request.  By the time that
282  * call has finished, the bio has been linked into some internal structure
283  * and so is visible to ->quiesce(), so we don't need the refcount any more.
284  */
285 static bool is_suspended(struct mddev *mddev, struct bio *bio)
286 {
287         if (mddev->suspended)
288                 return true;
289         if (bio_data_dir(bio) != WRITE)
290                 return false;
291         if (mddev->suspend_lo >= mddev->suspend_hi)
292                 return false;
293         if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
294                 return false;
295         if (bio_end_sector(bio) < mddev->suspend_lo)
296                 return false;
297         return true;
298 }
299
300 void md_handle_request(struct mddev *mddev, struct bio *bio)
301 {
302 check_suspended:
303         rcu_read_lock();
304         if (is_suspended(mddev, bio)) {
305                 DEFINE_WAIT(__wait);
306                 for (;;) {
307                         prepare_to_wait(&mddev->sb_wait, &__wait,
308                                         TASK_UNINTERRUPTIBLE);
309                         if (!is_suspended(mddev, bio))
310                                 break;
311                         rcu_read_unlock();
312                         schedule();
313                         rcu_read_lock();
314                 }
315                 finish_wait(&mddev->sb_wait, &__wait);
316         }
317         atomic_inc(&mddev->active_io);
318         rcu_read_unlock();
319
320         if (!mddev->pers->make_request(mddev, bio)) {
321                 atomic_dec(&mddev->active_io);
322                 wake_up(&mddev->sb_wait);
323                 goto check_suspended;
324         }
325
326         if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
327                 wake_up(&mddev->sb_wait);
328 }
329 EXPORT_SYMBOL(md_handle_request);
330
331 static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
332 {
333         const int rw = bio_data_dir(bio);
334         const int sgrp = op_stat_group(bio_op(bio));
335         struct mddev *mddev = q->queuedata;
336         unsigned int sectors;
337
338         blk_queue_split(q, &bio);
339
340         if (mddev == NULL || mddev->pers == NULL) {
341                 bio_io_error(bio);
342                 return BLK_QC_T_NONE;
343         }
344         if (mddev->ro == 1 && unlikely(rw == WRITE)) {
345                 if (bio_sectors(bio) != 0)
346                         bio->bi_status = BLK_STS_IOERR;
347                 bio_endio(bio);
348                 return BLK_QC_T_NONE;
349         }
350
351         /*
352          * save the sectors now since our bio can
353          * go away inside make_request
354          */
355         sectors = bio_sectors(bio);
356         /* bio could be mergeable after passing to underlayer */
357         bio->bi_opf &= ~REQ_NOMERGE;
358
359         md_handle_request(mddev, bio);
360
361         part_stat_lock();
362         part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
363         part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
364         part_stat_unlock();
365
366         return BLK_QC_T_NONE;
367 }
368
369 /* mddev_suspend makes sure no new requests are submitted
370  * to the device, and that any requests that have been submitted
371  * are completely handled.
372  * Once mddev_detach() is called and completes, the module will be
373  * completely unused.
374  */
375 void mddev_suspend(struct mddev *mddev)
376 {
377         WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
378         lockdep_assert_held(&mddev->reconfig_mutex);
379         if (mddev->suspended++)
380                 return;
381         synchronize_rcu();
382         wake_up(&mddev->sb_wait);
383         set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
384         smp_mb__after_atomic();
385         wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
386         mddev->pers->quiesce(mddev, 1);
387         clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
388         wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
389
390         del_timer_sync(&mddev->safemode_timer);
391 }
392 EXPORT_SYMBOL_GPL(mddev_suspend);
393
394 void mddev_resume(struct mddev *mddev)
395 {
396         lockdep_assert_held(&mddev->reconfig_mutex);
397         if (--mddev->suspended)
398                 return;
399         wake_up(&mddev->sb_wait);
400         mddev->pers->quiesce(mddev, 0);
401
402         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
403         md_wakeup_thread(mddev->thread);
404         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
405 }
406 EXPORT_SYMBOL_GPL(mddev_resume);
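/*
 * Sketch of the expected calling pattern (both functions assert that
 * reconfig_mutex is held), e.g. when reconfiguring something the
 * personality must not race with:
 *
 *	if (mddev_lock(mddev))
 *		return;
 *	mddev_suspend(mddev);
 *	...
 *	mddev_resume(mddev);
 *	mddev_unlock(mddev);
 */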
407
408 int mddev_congested(struct mddev *mddev, int bits)
409 {
410         struct md_personality *pers = mddev->pers;
411         int ret = 0;
412
413         rcu_read_lock();
414         if (mddev->suspended)
415                 ret = 1;
416         else if (pers && pers->congested)
417                 ret = pers->congested(mddev, bits);
418         rcu_read_unlock();
419         return ret;
420 }
421 EXPORT_SYMBOL_GPL(mddev_congested);
422 static int md_congested(void *data, int bits)
423 {
424         struct mddev *mddev = data;
425         return mddev_congested(mddev, bits);
426 }
427
428 /*
429  * Generic flush handling for md
430  */
431 static void submit_flushes(struct work_struct *ws)
432 {
433         struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
434         struct mddev *mddev = fi->mddev;
435         struct bio *bio = fi->bio;
436
437         bio->bi_opf &= ~REQ_PREFLUSH;
438         md_handle_request(mddev, bio);
439
440         mempool_free(fi, mddev->flush_pool);
441 }
442
443 static void md_end_flush(struct bio *fbio)
444 {
445         struct flush_bio *fb = fbio->bi_private;
446         struct md_rdev *rdev = fb->rdev;
447         struct flush_info *fi = fb->fi;
448         struct bio *bio = fi->bio;
449         struct mddev *mddev = fi->mddev;
450
451         rdev_dec_pending(rdev, mddev);
452
453         if (atomic_dec_and_test(&fi->flush_pending)) {
454                 if (bio->bi_iter.bi_size == 0) {
455                         /* an empty barrier - all done */
456                         bio_endio(bio);
457                         mempool_free(fi, mddev->flush_pool);
458                 } else {
459                         INIT_WORK(&fi->flush_work, submit_flushes);
460                         queue_work(md_wq, &fi->flush_work);
461                 }
462         }
463
464         mempool_free(fb, mddev->flush_bio_pool);
465         bio_put(fbio);
466 }
467
468 void md_flush_request(struct mddev *mddev, struct bio *bio)
469 {
470         struct md_rdev *rdev;
471         struct flush_info *fi;
472
473         fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
474
475         fi->bio = bio;
476         fi->mddev = mddev;
477         atomic_set(&fi->flush_pending, 1);
478
479         rcu_read_lock();
480         rdev_for_each_rcu(rdev, mddev)
481                 if (rdev->raid_disk >= 0 &&
482                     !test_bit(Faulty, &rdev->flags)) {
483                         /* Take two references: one is dropped
484                          * when the request finishes, the other after
485                          * we re-take rcu_read_lock below.
486                          */
487                         struct bio *bi;
488                         struct flush_bio *fb;
489                         atomic_inc(&rdev->nr_pending);
490                         atomic_inc(&rdev->nr_pending);
491                         rcu_read_unlock();
492
493                         fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
494                         fb->fi = fi;
495                         fb->rdev = rdev;
496
497                         bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
498                         bio_set_dev(bi, rdev->bdev);
499                         bi->bi_end_io = md_end_flush;
500                         bi->bi_private = fb;
501                         bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
502
503                         atomic_inc(&fi->flush_pending);
504                         submit_bio(bi);
505
506                         rcu_read_lock();
507                         rdev_dec_pending(rdev, mddev);
508                 }
509         rcu_read_unlock();
510
511         if (atomic_dec_and_test(&fi->flush_pending)) {
512                 if (bio->bi_iter.bi_size == 0) {
513                         /* an empty barrier - all done */
514                         bio_endio(bio);
515                         mempool_free(fi, mddev->flush_pool);
516                 } else {
517                         INIT_WORK(&fi->flush_work, submit_flushes);
518                         queue_work(md_wq, &fi->flush_work);
519                 }
520         }
521 }
522 EXPORT_SYMBOL(md_flush_request);
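/*
 * Personalities that do not implement flush handling themselves pass
 * REQ_PREFLUSH bios here from their ->make_request method, roughly
 * (sketch; see the raid0/linear personalities for real callers):
 *
 *	static bool example_make_request(struct mddev *mddev, struct bio *bio)
 *	{
 *		if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 *			md_flush_request(mddev, bio);
 *			return true;
 *		}
 *		...
 *	}
 */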
523
524 static inline struct mddev *mddev_get(struct mddev *mddev)
525 {
526         atomic_inc(&mddev->active);
527         return mddev;
528 }
529
530 static void mddev_delayed_delete(struct work_struct *ws);
531
532 static void mddev_put(struct mddev *mddev)
533 {
534         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
535                 return;
536         if (!mddev->raid_disks && list_empty(&mddev->disks) &&
537             mddev->ctime == 0 && !mddev->hold_active) {
538                 /* Array is not configured at all, and not held active,
539                  * so destroy it */
540                 list_del_init(&mddev->all_mddevs);
541
542                 /*
543                  * Call queue_work inside the spinlock so that
544                  * flush_workqueue() after mddev_find will succeed in waiting
545                  * for the work to be done.
546                  */
547                 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
548                 queue_work(md_misc_wq, &mddev->del_work);
549         }
550         spin_unlock(&all_mddevs_lock);
551 }
552
553 static void md_safemode_timeout(struct timer_list *t);
554
555 void mddev_init(struct mddev *mddev)
556 {
557         kobject_init(&mddev->kobj, &md_ktype);
558         mutex_init(&mddev->open_mutex);
559         mutex_init(&mddev->reconfig_mutex);
560         mutex_init(&mddev->bitmap_info.mutex);
561         INIT_LIST_HEAD(&mddev->disks);
562         INIT_LIST_HEAD(&mddev->all_mddevs);
563         timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
564         atomic_set(&mddev->active, 1);
565         atomic_set(&mddev->openers, 0);
566         atomic_set(&mddev->active_io, 0);
567         spin_lock_init(&mddev->lock);
568         init_waitqueue_head(&mddev->sb_wait);
569         init_waitqueue_head(&mddev->recovery_wait);
570         mddev->reshape_position = MaxSector;
571         mddev->reshape_backwards = 0;
572         mddev->last_sync_action = "none";
573         mddev->resync_min = 0;
574         mddev->resync_max = MaxSector;
575         mddev->level = LEVEL_NONE;
576 }
577 EXPORT_SYMBOL_GPL(mddev_init);
578
579 static struct mddev *mddev_find(dev_t unit)
580 {
581         struct mddev *mddev, *new = NULL;
582
583         if (unit && MAJOR(unit) != MD_MAJOR)
584                 unit &= ~((1<<MdpMinorShift)-1);
585
586  retry:
587         spin_lock(&all_mddevs_lock);
588
589         if (unit) {
590                 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
591                         if (mddev->unit == unit) {
592                                 mddev_get(mddev);
593                                 spin_unlock(&all_mddevs_lock);
594                                 kfree(new);
595                                 return mddev;
596                         }
597
598                 if (new) {
599                         list_add(&new->all_mddevs, &all_mddevs);
600                         spin_unlock(&all_mddevs_lock);
601                         new->hold_active = UNTIL_IOCTL;
602                         return new;
603                 }
604         } else if (new) {
605                 /* find an unused unit number */
606                 static int next_minor = 512;
607                 int start = next_minor;
608                 int is_free = 0;
609                 int dev = 0;
610                 while (!is_free) {
611                         dev = MKDEV(MD_MAJOR, next_minor);
612                         next_minor++;
613                         if (next_minor > MINORMASK)
614                                 next_minor = 0;
615                         if (next_minor == start) {
616                                 /* Oh dear, all in use. */
617                                 spin_unlock(&all_mddevs_lock);
618                                 kfree(new);
619                                 return NULL;
620                         }
621
622                         is_free = 1;
623                         list_for_each_entry(mddev, &all_mddevs, all_mddevs)
624                                 if (mddev->unit == dev) {
625                                         is_free = 0;
626                                         break;
627                                 }
628                 }
629                 new->unit = dev;
630                 new->md_minor = MINOR(dev);
631                 new->hold_active = UNTIL_STOP;
632                 list_add(&new->all_mddevs, &all_mddevs);
633                 spin_unlock(&all_mddevs_lock);
634                 return new;
635         }
636         spin_unlock(&all_mddevs_lock);
637
638         new = kzalloc(sizeof(*new), GFP_KERNEL);
639         if (!new)
640                 return NULL;
641
642         new->unit = unit;
643         if (MAJOR(unit) == MD_MAJOR)
644                 new->md_minor = MINOR(unit);
645         else
646                 new->md_minor = MINOR(unit) >> MdpMinorShift;
647
648         mddev_init(new);
649
650         goto retry;
651 }
652
653 static struct attribute_group md_redundancy_group;
654
655 void mddev_unlock(struct mddev *mddev)
656 {
657         if (mddev->to_remove) {
658                 /* These cannot be removed under reconfig_mutex as
659                  * an access to the files will try to take reconfig_mutex
660                  * while holding the file unremovable, which leads to
661                  * a deadlock.
662                  * So we set sysfs_active while the removal is happening,
663                  * and anything else which might set ->to_remove or may
664                  * otherwise change the sysfs namespace will fail with
665                  * -EBUSY if sysfs_active is still set.
666                  * We set sysfs_active under reconfig_mutex and elsewhere
667                  * test it under the same mutex to ensure its correct value
668                  * is seen.
669                  */
670                 struct attribute_group *to_remove = mddev->to_remove;
671                 mddev->to_remove = NULL;
672                 mddev->sysfs_active = 1;
673                 mutex_unlock(&mddev->reconfig_mutex);
674
675                 if (mddev->kobj.sd) {
676                         if (to_remove != &md_redundancy_group)
677                                 sysfs_remove_group(&mddev->kobj, to_remove);
678                         if (mddev->pers == NULL ||
679                             mddev->pers->sync_request == NULL) {
680                                 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
681                                 if (mddev->sysfs_action)
682                                         sysfs_put(mddev->sysfs_action);
683                                 mddev->sysfs_action = NULL;
684                         }
685                 }
686                 mddev->sysfs_active = 0;
687         } else
688                 mutex_unlock(&mddev->reconfig_mutex);
689
690         /* As we've dropped the mutex we need a spinlock to
691          * make sure the thread doesn't disappear
692          */
693         spin_lock(&pers_lock);
694         md_wakeup_thread(mddev->thread);
695         wake_up(&mddev->sb_wait);
696         spin_unlock(&pers_lock);
697 }
698 EXPORT_SYMBOL_GPL(mddev_unlock);
699
700 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
701 {
702         struct md_rdev *rdev;
703
704         rdev_for_each_rcu(rdev, mddev)
705                 if (rdev->desc_nr == nr)
706                         return rdev;
707
708         return NULL;
709 }
710 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
711
712 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
713 {
714         struct md_rdev *rdev;
715
716         rdev_for_each(rdev, mddev)
717                 if (rdev->bdev->bd_dev == dev)
718                         return rdev;
719
720         return NULL;
721 }
722
723 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
724 {
725         struct md_rdev *rdev;
726
727         rdev_for_each_rcu(rdev, mddev)
728                 if (rdev->bdev->bd_dev == dev)
729                         return rdev;
730
731         return NULL;
732 }
733 EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
734
735 static struct md_personality *find_pers(int level, char *clevel)
736 {
737         struct md_personality *pers;
738         list_for_each_entry(pers, &pers_list, list) {
739                 if (level != LEVEL_NONE && pers->level == level)
740                         return pers;
741                 if (strcmp(pers->name, clevel)==0)
742                         return pers;
743         }
744         return NULL;
745 }
746
747 /* return the offset of the superblock in 512-byte sectors */
748 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
749 {
750         sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
751         return MD_NEW_SIZE_SECTORS(num_sectors);
752 }
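/*
 * MD_NEW_SIZE_SECTORS() (from md_p.h) rounds the device size down to a
 * MD_RESERVED_SECTORS boundary (128 sectors, i.e. 64K) and then steps back
 * one full reservation.  For example, a 1000000-sector device rounds down
 * to 999936, so the 0.90 superblock starts at sector 999808 and occupies
 * the last 64K-aligned chunk of the device.
 */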
753
754 static int alloc_disk_sb(struct md_rdev *rdev)
755 {
756         rdev->sb_page = alloc_page(GFP_KERNEL);
757         if (!rdev->sb_page)
758                 return -ENOMEM;
759         return 0;
760 }
761
762 void md_rdev_clear(struct md_rdev *rdev)
763 {
764         if (rdev->sb_page) {
765                 put_page(rdev->sb_page);
766                 rdev->sb_loaded = 0;
767                 rdev->sb_page = NULL;
768                 rdev->sb_start = 0;
769                 rdev->sectors = 0;
770         }
771         if (rdev->bb_page) {
772                 put_page(rdev->bb_page);
773                 rdev->bb_page = NULL;
774         }
775         badblocks_exit(&rdev->badblocks);
776 }
777 EXPORT_SYMBOL_GPL(md_rdev_clear);
778
779 static void super_written(struct bio *bio)
780 {
781         struct md_rdev *rdev = bio->bi_private;
782         struct mddev *mddev = rdev->mddev;
783
784         if (bio->bi_status) {
785                 pr_err("md: super_written gets error=%d\n", bio->bi_status);
786                 md_error(mddev, rdev);
787                 if (!test_bit(Faulty, &rdev->flags)
788                     && (bio->bi_opf & MD_FAILFAST)) {
789                         set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
790                         set_bit(LastDev, &rdev->flags);
791                 }
792         } else
793                 clear_bit(LastDev, &rdev->flags);
794
795         if (atomic_dec_and_test(&mddev->pending_writes))
796                 wake_up(&mddev->sb_wait);
797         rdev_dec_pending(rdev, mddev);
798         bio_put(bio);
799 }
800
801 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
802                    sector_t sector, int size, struct page *page)
803 {
804         /* write first size bytes of page to sector of rdev
805          * Increment mddev->pending_writes before returning
806          * and decrement it on completion, waking up sb_wait
807          * if zero is reached.
808          * If an error occurred, call md_error
809          */
810         struct bio *bio;
811         int ff = 0;
812
813         if (!page)
814                 return;
815
816         if (test_bit(Faulty, &rdev->flags))
817                 return;
818
819         bio = md_bio_alloc_sync(mddev);
820
821         atomic_inc(&rdev->nr_pending);
822
823         bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
824         bio->bi_iter.bi_sector = sector;
825         bio_add_page(bio, page, size, 0);
826         bio->bi_private = rdev;
827         bio->bi_end_io = super_written;
828
829         if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
830             test_bit(FailFast, &rdev->flags) &&
831             !test_bit(LastDev, &rdev->flags))
832                 ff = MD_FAILFAST;
833         bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
834
835         atomic_inc(&mddev->pending_writes);
836         submit_bio(bio);
837 }
838
839 int md_super_wait(struct mddev *mddev)
840 {
841         /* wait for all superblock writes that were scheduled to complete */
842         wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
843         if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
844                 return -EAGAIN;
845         return 0;
846 }
847
848 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
849                  struct page *page, int op, int op_flags, bool metadata_op)
850 {
851         struct bio *bio = md_bio_alloc_sync(rdev->mddev);
852         int ret;
853
854         if (metadata_op && rdev->meta_bdev)
855                 bio_set_dev(bio, rdev->meta_bdev);
856         else
857                 bio_set_dev(bio, rdev->bdev);
858         bio_set_op_attrs(bio, op, op_flags);
859         if (metadata_op)
860                 bio->bi_iter.bi_sector = sector + rdev->sb_start;
861         else if (rdev->mddev->reshape_position != MaxSector &&
862                  (rdev->mddev->reshape_backwards ==
863                   (sector >= rdev->mddev->reshape_position)))
864                 bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
865         else
866                 bio->bi_iter.bi_sector = sector + rdev->data_offset;
867         bio_add_page(bio, page, size, 0);
868
869         submit_bio_wait(bio);
870
871         ret = !bio->bi_status;
872         bio_put(bio);
873         return ret;
874 }
875 EXPORT_SYMBOL_GPL(sync_page_io);
876
877 static int read_disk_sb(struct md_rdev *rdev, int size)
878 {
879         char b[BDEVNAME_SIZE];
880
881         if (rdev->sb_loaded)
882                 return 0;
883
884         if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
885                 goto fail;
886         rdev->sb_loaded = 1;
887         return 0;
888
889 fail:
890         pr_err("md: disabled device %s, could not read superblock.\n",
891                bdevname(rdev->bdev,b));
892         return -EINVAL;
893 }
894
895 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
896 {
897         return  sb1->set_uuid0 == sb2->set_uuid0 &&
898                 sb1->set_uuid1 == sb2->set_uuid1 &&
899                 sb1->set_uuid2 == sb2->set_uuid2 &&
900                 sb1->set_uuid3 == sb2->set_uuid3;
901 }
902
903 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
904 {
905         int ret;
906         mdp_super_t *tmp1, *tmp2;
907
908         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
909         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
910
911         if (!tmp1 || !tmp2) {
912                 ret = 0;
913                 goto abort;
914         }
915
916         *tmp1 = *sb1;
917         *tmp2 = *sb2;
918
919         /*
920          * nr_disks is not constant
921          */
922         tmp1->nr_disks = 0;
923         tmp2->nr_disks = 0;
924
925         ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
926 abort:
927         kfree(tmp1);
928         kfree(tmp2);
929         return ret;
930 }
931
932 static u32 md_csum_fold(u32 csum)
933 {
934         csum = (csum & 0xffff) + (csum >> 16);
935         return (csum & 0xffff) + (csum >> 16);
936 }
937
938 static unsigned int calc_sb_csum(mdp_super_t *sb)
939 {
940         u64 newcsum = 0;
941         u32 *sb32 = (u32*)sb;
942         int i;
943         unsigned int disk_csum, csum;
944
945         disk_csum = sb->sb_csum;
946         sb->sb_csum = 0;
947
948         for (i = 0; i < MD_SB_BYTES/4 ; i++)
949                 newcsum += sb32[i];
950         csum = (newcsum & 0xffffffff) + (newcsum>>32);
951
952 #ifdef CONFIG_ALPHA
953         /* This used to use csum_partial, which was wrong for several
954          * reasons including that different results are returned on
955          * different architectures.  It isn't critical that we get exactly
956          * the same return value as before (we always csum_fold before
957          * testing, and that removes any differences).  However as we
958          * know that csum_partial always returned a 16bit value on
959          * alphas, do a fold to maximise conformity to previous behaviour.
960          */
961         sb->sb_csum = md_csum_fold(disk_csum);
962 #else
963         sb->sb_csum = disk_csum;
964 #endif
965         return csum;
966 }
967
968 /*
969  * Handle superblock details.
970  * We want to be able to handle multiple superblock formats
971  * so we have a common interface to them all, and an array of
972  * different handlers.
973  * We rely on user-space to write the initial superblock, and support
974  * reading and updating of superblocks.
975  * Interface methods are:
976  *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
977  *      loads and validates a superblock on dev.
978  *      if refdev != NULL, compare superblocks on both devices
979  *    Return:
980  *      0 - dev has a superblock that is compatible with refdev
981  *      1 - dev has a superblock that is compatible and newer than refdev
982  *          so dev should be used as the refdev in future
983  *     -EINVAL superblock incompatible or invalid
984  *     -othererror e.g. -EIO
985  *
986  *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
987  *      Verify that dev is acceptable into mddev.
988  *       The first time, mddev->raid_disks will be 0, and data from
989  *       dev should be merged in.  Subsequent calls check that dev
990  *       is new enough.  Return 0 or -EINVAL
991  *
992  *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
993  *     Update the superblock for rdev with data in mddev
994  *     This does not write to disc.
995  *
996  */
997
998 struct super_type  {
999         char                *name;
1000         struct module       *owner;
1001         int                 (*load_super)(struct md_rdev *rdev,
1002                                           struct md_rdev *refdev,
1003                                           int minor_version);
1004         int                 (*validate_super)(struct mddev *mddev,
1005                                               struct md_rdev *rdev);
1006         void                (*sync_super)(struct mddev *mddev,
1007                                           struct md_rdev *rdev);
1008         unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
1009                                                 sector_t num_sectors);
1010         int                 (*allow_new_offset)(struct md_rdev *rdev,
1011                                                 unsigned long long new_offset);
1012 };
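/*
 * Each supported metadata format supplies one of these.  For instance, the
 * 0.90 handler is wired up along these lines (abridged sketch):
 *
 *	static struct super_type super_types[] = {
 *		[0] = {
 *			.name		  = "0.90.0",
 *			.owner		  = THIS_MODULE,
 *			.load_super	  = super_90_load,
 *			.validate_super	  = super_90_validate,
 *			.sync_super	  = super_90_sync,
 *			.rdev_size_change = super_90_rdev_size_change,
 *			.allow_new_offset = super_90_allow_new_offset,
 *		},
 *		...
 *	};
 */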
1013
1014 /*
1015  * Check that the given mddev has no bitmap.
1016  *
1017  * This function is called from the run method of all personalities that do not
1018  * support bitmaps. It prints an error message and returns non-zero if mddev
1019  * has a bitmap. Otherwise, it returns 0.
1020  *
1021  */
1022 int md_check_no_bitmap(struct mddev *mddev)
1023 {
1024         if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1025                 return 0;
1026         pr_warn("%s: bitmaps are not supported for %s\n",
1027                 mdname(mddev), mddev->pers->name);
1028         return 1;
1029 }
1030 EXPORT_SYMBOL(md_check_no_bitmap);
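/*
 * Typically called near the top of a personality's ->run method, e.g.
 * (sketch):
 *
 *	static int example_run(struct mddev *mddev)
 *	{
 *		if (md_check_no_bitmap(mddev))
 *			return -EINVAL;
 *		...
 *	}
 */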
1031
1032 /*
1033  * load_super for 0.90.0
1034  */
1035 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1036 {
1037         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1038         mdp_super_t *sb;
1039         int ret;
1040
1041         /*
1042          * Calculate the position of the superblock (in 512-byte sectors);
1043          * it's at the end of the disk.
1044          *
1045          * It also happens to be a multiple of 4Kb.
1046          */
1047         rdev->sb_start = calc_dev_sboffset(rdev);
1048
1049         ret = read_disk_sb(rdev, MD_SB_BYTES);
1050         if (ret)
1051                 return ret;
1052
1053         ret = -EINVAL;
1054
1055         bdevname(rdev->bdev, b);
1056         sb = page_address(rdev->sb_page);
1057
1058         if (sb->md_magic != MD_SB_MAGIC) {
1059                 pr_warn("md: invalid raid superblock magic on %s\n", b);
1060                 goto abort;
1061         }
1062
1063         if (sb->major_version != 0 ||
1064             sb->minor_version < 90 ||
1065             sb->minor_version > 91) {
1066                 pr_warn("Bad version number %d.%d on %s\n",
1067                         sb->major_version, sb->minor_version, b);
1068                 goto abort;
1069         }
1070
1071         if (sb->raid_disks <= 0)
1072                 goto abort;
1073
1074         if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1075                 pr_warn("md: invalid superblock checksum on %s\n", b);
1076                 goto abort;
1077         }
1078
1079         rdev->preferred_minor = sb->md_minor;
1080         rdev->data_offset = 0;
1081         rdev->new_data_offset = 0;
1082         rdev->sb_size = MD_SB_BYTES;
1083         rdev->badblocks.shift = -1;
1084
1085         if (sb->level == LEVEL_MULTIPATH)
1086                 rdev->desc_nr = -1;
1087         else
1088                 rdev->desc_nr = sb->this_disk.number;
1089
1090         if (!refdev) {
1091                 ret = 1;
1092         } else {
1093                 __u64 ev1, ev2;
1094                 mdp_super_t *refsb = page_address(refdev->sb_page);
1095                 if (!md_uuid_equal(refsb, sb)) {
1096                         pr_warn("md: %s has different UUID to %s\n",
1097                                 b, bdevname(refdev->bdev,b2));
1098                         goto abort;
1099                 }
1100                 if (!md_sb_equal(refsb, sb)) {
1101                         pr_warn("md: %s has same UUID but different superblock to %s\n",
1102                                 b, bdevname(refdev->bdev, b2));
1103                         goto abort;
1104                 }
1105                 ev1 = md_event(sb);
1106                 ev2 = md_event(refsb);
1107                 if (ev1 > ev2)
1108                         ret = 1;
1109                 else
1110                         ret = 0;
1111         }
1112         rdev->sectors = rdev->sb_start;
1113         /* Limit to 4TB as metadata cannot record more than that.
1114          * (not needed for Linear and RAID0 as metadata doesn't
1115          * record this size)
1116          */
1117         if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
1118             sb->level >= 1)
1119                 rdev->sectors = (sector_t)(2ULL << 32) - 2;
1120
1121         if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1122                 /* "this cannot possibly happen" ... */
1123                 ret = -EINVAL;
1124
1125  abort:
1126         return ret;
1127 }
1128
1129 /*
1130  * validate_super for 0.90.0
1131  */
1132 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1133 {
1134         mdp_disk_t *desc;
1135         mdp_super_t *sb = page_address(rdev->sb_page);
1136         __u64 ev1 = md_event(sb);
1137
1138         rdev->raid_disk = -1;
1139         clear_bit(Faulty, &rdev->flags);
1140         clear_bit(In_sync, &rdev->flags);
1141         clear_bit(Bitmap_sync, &rdev->flags);
1142         clear_bit(WriteMostly, &rdev->flags);
1143
1144         if (mddev->raid_disks == 0) {
1145                 mddev->major_version = 0;
1146                 mddev->minor_version = sb->minor_version;
1147                 mddev->patch_version = sb->patch_version;
1148                 mddev->external = 0;
1149                 mddev->chunk_sectors = sb->chunk_size >> 9;
1150                 mddev->ctime = sb->ctime;
1151                 mddev->utime = sb->utime;
1152                 mddev->level = sb->level;
1153                 mddev->clevel[0] = 0;
1154                 mddev->layout = sb->layout;
1155                 mddev->raid_disks = sb->raid_disks;
1156                 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1157                 mddev->events = ev1;
1158                 mddev->bitmap_info.offset = 0;
1159                 mddev->bitmap_info.space = 0;
1160                 /* bitmap can use 60 K after the 4K superblocks */
1161                 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1162                 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1163                 mddev->reshape_backwards = 0;
1164
1165                 if (mddev->minor_version >= 91) {
1166                         mddev->reshape_position = sb->reshape_position;
1167                         mddev->delta_disks = sb->delta_disks;
1168                         mddev->new_level = sb->new_level;
1169                         mddev->new_layout = sb->new_layout;
1170                         mddev->new_chunk_sectors = sb->new_chunk >> 9;
1171                         if (mddev->delta_disks < 0)
1172                                 mddev->reshape_backwards = 1;
1173                 } else {
1174                         mddev->reshape_position = MaxSector;
1175                         mddev->delta_disks = 0;
1176                         mddev->new_level = mddev->level;
1177                         mddev->new_layout = mddev->layout;
1178                         mddev->new_chunk_sectors = mddev->chunk_sectors;
1179                 }
1180
1181                 if (sb->state & (1<<MD_SB_CLEAN))
1182                         mddev->recovery_cp = MaxSector;
1183                 else {
1184                         if (sb->events_hi == sb->cp_events_hi &&
1185                                 sb->events_lo == sb->cp_events_lo) {
1186                                 mddev->recovery_cp = sb->recovery_cp;
1187                         } else
1188                                 mddev->recovery_cp = 0;
1189                 }
1190
1191                 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1192                 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1193                 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1194                 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1195
1196                 mddev->max_disks = MD_SB_DISKS;
1197
1198                 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1199                     mddev->bitmap_info.file == NULL) {
1200                         mddev->bitmap_info.offset =
1201                                 mddev->bitmap_info.default_offset;
1202                         mddev->bitmap_info.space =
1203                                 mddev->bitmap_info.default_space;
1204                 }
1205
1206         } else if (mddev->pers == NULL) {
1207                 /* Insist on good event counter while assembling, except
1208                  * for spares (which don't need an event count) */
1209                 ++ev1;
1210                 if (sb->disks[rdev->desc_nr].state & (
1211                             (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1212                         if (ev1 < mddev->events)
1213                                 return -EINVAL;
1214         } else if (mddev->bitmap) {
1215                 /* if adding to array with a bitmap, then we can accept an
1216                  * older device ... but not too old.
1217                  */
1218                 if (ev1 < mddev->bitmap->events_cleared)
1219                         return 0;
1220                 if (ev1 < mddev->events)
1221                         set_bit(Bitmap_sync, &rdev->flags);
1222         } else {
1223                 if (ev1 < mddev->events)
1224                         /* just a hot-add of a new device, leave raid_disk at -1 */
1225                         return 0;
1226         }
1227
1228         if (mddev->level != LEVEL_MULTIPATH) {
1229                 desc = sb->disks + rdev->desc_nr;
1230
1231                 if (desc->state & (1<<MD_DISK_FAULTY))
1232                         set_bit(Faulty, &rdev->flags);
1233                 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1234                             desc->raid_disk < mddev->raid_disks */) {
1235                         set_bit(In_sync, &rdev->flags);
1236                         rdev->raid_disk = desc->raid_disk;
1237                         rdev->saved_raid_disk = desc->raid_disk;
1238                 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1239                         /* active but not in sync implies recovery up to
1240                          * reshape position.  We don't know exactly where
1241                          * that is, so set to zero for now */
1242                         if (mddev->minor_version >= 91) {
1243                                 rdev->recovery_offset = 0;
1244                                 rdev->raid_disk = desc->raid_disk;
1245                         }
1246                 }
1247                 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1248                         set_bit(WriteMostly, &rdev->flags);
1249                 if (desc->state & (1<<MD_DISK_FAILFAST))
1250                         set_bit(FailFast, &rdev->flags);
1251         } else /* MULTIPATH are always insync */
1252                 set_bit(In_sync, &rdev->flags);
1253         return 0;
1254 }
1255
1256 /*
1257  * sync_super for 0.90.0
1258  */
1259 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1260 {
1261         mdp_super_t *sb;
1262         struct md_rdev *rdev2;
1263         int next_spare = mddev->raid_disks;
1264
1265         /* make rdev->sb match mddev data..
1266          *
1267          * 1/ zero out disks
1268          * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1269          * 3/ any empty disks < next_spare become removed
1270          *
1271          * disks[0] gets initialised to REMOVED because
1272          * we cannot be sure from other fields if it has
1273          * been initialised or not.
1274          */
1275         int i;
1276         int active=0, working=0,failed=0,spare=0,nr_disks=0;
1277
1278         rdev->sb_size = MD_SB_BYTES;
1279
1280         sb = page_address(rdev->sb_page);
1281
1282         memset(sb, 0, sizeof(*sb));
1283
1284         sb->md_magic = MD_SB_MAGIC;
1285         sb->major_version = mddev->major_version;
1286         sb->patch_version = mddev->patch_version;
1287         sb->gvalid_words  = 0; /* ignored */
1288         memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1289         memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1290         memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1291         memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1292
1293         sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1294         sb->level = mddev->level;
1295         sb->size = mddev->dev_sectors / 2;
1296         sb->raid_disks = mddev->raid_disks;
1297         sb->md_minor = mddev->md_minor;
1298         sb->not_persistent = 0;
1299         sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1300         sb->state = 0;
1301         sb->events_hi = (mddev->events>>32);
1302         sb->events_lo = (u32)mddev->events;
1303
1304         if (mddev->reshape_position == MaxSector)
1305                 sb->minor_version = 90;
1306         else {
1307                 sb->minor_version = 91;
1308                 sb->reshape_position = mddev->reshape_position;
1309                 sb->new_level = mddev->new_level;
1310                 sb->delta_disks = mddev->delta_disks;
1311                 sb->new_layout = mddev->new_layout;
1312                 sb->new_chunk = mddev->new_chunk_sectors << 9;
1313         }
1314         mddev->minor_version = sb->minor_version;
1315         if (mddev->in_sync)
1316         {
1317                 sb->recovery_cp = mddev->recovery_cp;
1318                 sb->cp_events_hi = (mddev->events>>32);
1319                 sb->cp_events_lo = (u32)mddev->events;
1320                 if (mddev->recovery_cp == MaxSector)
1321                         sb->state = (1<< MD_SB_CLEAN);
1322         } else
1323                 sb->recovery_cp = 0;
1324
1325         sb->layout = mddev->layout;
1326         sb->chunk_size = mddev->chunk_sectors << 9;
1327
1328         if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1329                 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1330
1331         sb->disks[0].state = (1<<MD_DISK_REMOVED);
1332         rdev_for_each(rdev2, mddev) {
1333                 mdp_disk_t *d;
1334                 int desc_nr;
1335                 int is_active = test_bit(In_sync, &rdev2->flags);
1336
1337                 if (rdev2->raid_disk >= 0 &&
1338                     sb->minor_version >= 91)
1339                         /* we have nowhere to store the recovery_offset,
1340                          * but if it is not below the reshape_position,
1341                          * we can piggy-back on that.
1342                          */
1343                         is_active = 1;
1344                 if (rdev2->raid_disk < 0 ||
1345                     test_bit(Faulty, &rdev2->flags))
1346                         is_active = 0;
1347                 if (is_active)
1348                         desc_nr = rdev2->raid_disk;
1349                 else
1350                         desc_nr = next_spare++;
1351                 rdev2->desc_nr = desc_nr;
1352                 d = &sb->disks[rdev2->desc_nr];
1353                 nr_disks++;
1354                 d->number = rdev2->desc_nr;
1355                 d->major = MAJOR(rdev2->bdev->bd_dev);
1356                 d->minor = MINOR(rdev2->bdev->bd_dev);
1357                 if (is_active)
1358                         d->raid_disk = rdev2->raid_disk;
1359                 else
1360                         d->raid_disk = rdev2->desc_nr; /* compatibility */
1361                 if (test_bit(Faulty, &rdev2->flags))
1362                         d->state = (1<<MD_DISK_FAULTY);
1363                 else if (is_active) {
1364                         d->state = (1<<MD_DISK_ACTIVE);
1365                         if (test_bit(In_sync, &rdev2->flags))
1366                                 d->state |= (1<<MD_DISK_SYNC);
1367                         active++;
1368                         working++;
1369                 } else {
1370                         d->state = 0;
1371                         spare++;
1372                         working++;
1373                 }
1374                 if (test_bit(WriteMostly, &rdev2->flags))
1375                         d->state |= (1<<MD_DISK_WRITEMOSTLY);
1376                 if (test_bit(FailFast, &rdev2->flags))
1377                         d->state |= (1<<MD_DISK_FAILFAST);
1378         }
1379         /* now set the "removed" and "faulty" bits on any missing devices */
1380         for (i=0 ; i < mddev->raid_disks ; i++) {
1381                 mdp_disk_t *d = &sb->disks[i];
1382                 if (d->state == 0 && d->number == 0) {
1383                         d->number = i;
1384                         d->raid_disk = i;
1385                         d->state = (1<<MD_DISK_REMOVED);
1386                         d->state |= (1<<MD_DISK_FAULTY);
1387                         failed++;
1388                 }
1389         }
1390         sb->nr_disks = nr_disks;
1391         sb->active_disks = active;
1392         sb->working_disks = working;
1393         sb->failed_disks = failed;
1394         sb->spare_disks = spare;
1395
1396         sb->this_disk = sb->disks[rdev->desc_nr];
1397         sb->sb_csum = calc_sb_csum(sb);
1398 }
1399
1400 /*
1401  * rdev_size_change for 0.90.0
1402  */
1403 static unsigned long long
1404 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1405 {
1406         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1407                 return 0; /* component must fit device */
1408         if (rdev->mddev->bitmap_info.offset)
1409                 return 0; /* can't move bitmap */
1410         rdev->sb_start = calc_dev_sboffset(rdev);
1411         if (!num_sectors || num_sectors > rdev->sb_start)
1412                 num_sectors = rdev->sb_start;
1413         /* Limit to 4TB as metadata cannot record more than that.
1414          * 4TB == 2^32 KB, or 2*2^32 sectors.
1415          */
1416         if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
1417             rdev->mddev->level >= 1)
1418                 num_sectors = (sector_t)(2ULL << 32) - 2;
1419         do {
1420                 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1421                        rdev->sb_page);
1422         } while (md_super_wait(rdev->mddev) < 0);
1423         return num_sectors;
1424 }
1425
1426 static int
1427 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1428 {
1429         /* non-zero offset changes not possible with v0.90 */
1430         return new_offset == 0;
1431 }
1432
1433 /*
1434  * version 1 superblock
1435  */
1436
1437 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1438 {
1439         __le32 disk_csum;
1440         u32 csum;
1441         unsigned long long newcsum;
1442         int size = 256 + le32_to_cpu(sb->max_dev)*2;
1443         __le32 *isuper = (__le32*)sb;
1444
1445         disk_csum = sb->sb_csum;
1446         sb->sb_csum = 0;
1447         newcsum = 0;
1448         for (; size >= 4; size -= 4)
1449                 newcsum += le32_to_cpu(*isuper++);
1450
1451         if (size == 2)
1452                 newcsum += le16_to_cpu(*(__le16*) isuper);
1453
1454         csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1455         sb->sb_csum = disk_csum;
1456         return cpu_to_le32(csum);
1457 }
1458
1459 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1460 {
1461         struct mdp_superblock_1 *sb;
1462         int ret;
1463         sector_t sb_start;
1464         sector_t sectors;
1465         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1466         int bmask;
1467
1468         /*
1469          * Calculate the position of the superblock in 512-byte sectors.
1470          * It is always aligned to a 4K boundary and,
1471          * depending on minor_version, it can be:
1472          * 0: At least 8K, but less than 12K, from end of device
1473          * 1: At start of device
1474          * 2: 4K from start of device.
1475          */
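        /*
         * For example, with minor_version 0 on a device of 2147483648
         * sectors (1 TiB): 2147483648 - 16 = 2147483632, which is already
         * on an 8-sector boundary, so the superblock sits exactly 8K from
         * the end of the device.
         */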
1476         switch(minor_version) {
1477         case 0:
1478                 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1479                 sb_start -= 8*2;
1480                 sb_start &= ~(sector_t)(4*2-1);
1481                 break;
1482         case 1:
1483                 sb_start = 0;
1484                 break;
1485         case 2:
1486                 sb_start = 8;
1487                 break;
1488         default:
1489                 return -EINVAL;
1490         }
1491         rdev->sb_start = sb_start;
1492
1493         /* The superblock is rarely larger than 1K, but it can be,
1494          * and it is safe to read 4K, so we do that.
1495          */
1496         ret = read_disk_sb(rdev, 4096);
1497         if (ret) return ret;
1498
1499         sb = page_address(rdev->sb_page);
1500
1501         if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1502             sb->major_version != cpu_to_le32(1) ||
1503             le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1504             le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1505             (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1506                 return -EINVAL;
1507
1508         if (calc_sb_1_csum(sb) != sb->sb_csum) {
1509                 pr_warn("md: invalid superblock checksum on %s\n",
1510                         bdevname(rdev->bdev,b));
1511                 return -EINVAL;
1512         }
1513         if (le64_to_cpu(sb->data_size) < 10) {
1514                 pr_warn("md: data_size too small on %s\n",
1515                         bdevname(rdev->bdev,b));
1516                 return -EINVAL;
1517         }
1518         if (sb->pad0 ||
1519             sb->pad3[0] ||
1520             memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1521                 /* Some padding is non-zero, might be a new feature */
1522                 return -EINVAL;
1523
1524         rdev->preferred_minor = 0xffff;
1525         rdev->data_offset = le64_to_cpu(sb->data_offset);
1526         rdev->new_data_offset = rdev->data_offset;
1527         if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1528             (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1529                 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1530         atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1531
1532         rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
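        /* Round sb_size up to the device's logical block size so that
         * superblock reads and writes always cover whole blocks.
         */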
1533         bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1534         if (rdev->sb_size & bmask)
1535                 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1536
1537         if (minor_version
1538             && rdev->data_offset < sb_start + (rdev->sb_size/512))
1539                 return -EINVAL;
1540         if (minor_version
1541             && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1542                 return -EINVAL;
1543
1544         if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1545                 rdev->desc_nr = -1;
1546         else
1547                 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1548
1549         if (!rdev->bb_page) {
1550                 rdev->bb_page = alloc_page(GFP_KERNEL);
1551                 if (!rdev->bb_page)
1552                         return -ENOMEM;
1553         }
1554         if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1555             rdev->badblocks.count == 0) {
1556                 /* need to load the bad block list.
1557                  * Currently we limit it to one page.
1558                  */
1559                 s32 offset;
1560                 sector_t bb_sector;
1561                 u64 *bbp;
1562                 int i;
1563                 int sectors = le16_to_cpu(sb->bblog_size);
1564                 if (sectors > (PAGE_SIZE / 512))
1565                         return -EINVAL;
1566                 offset = le32_to_cpu(sb->bblog_offset);
1567                 if (offset == 0)
1568                         return -EINVAL;
1569                 bb_sector = (long long)offset;
1570                 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1571                                   rdev->bb_page, REQ_OP_READ, 0, true))
1572                         return -EIO;
1573                 bbp = (u64 *)page_address(rdev->bb_page);
1574                 rdev->badblocks.shift = sb->bblog_shift;
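                /* Each on-disk entry is a little-endian u64: the high 54
                 * bits hold the start sector and the low 10 bits the
                 * length, both in units of 2^bblog_shift sectors.  An
                 * all-ones entry terminates the list.
                 */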
1575                 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1576                         u64 bb = le64_to_cpu(*bbp);
1577                         int count = bb & (0x3ff);
1578                         u64 sector = bb >> 10;
1579                         sector <<= sb->bblog_shift;
1580                         count <<= sb->bblog_shift;
1581                         if (bb + 1 == 0)
1582                                 break;
1583                         if (badblocks_set(&rdev->badblocks, sector, count, 1))
1584                                 return -EINVAL;
1585                 }
1586         } else if (sb->bblog_offset != 0)
1587                 rdev->badblocks.shift = 0;
1588
1589         if ((le32_to_cpu(sb->feature_map) &
1590             (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1591                 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1592                 rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1593                 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1594         }
1595
1596         if (!refdev) {
1597                 ret = 1;
1598         } else {
1599                 __u64 ev1, ev2;
1600                 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1601
1602                 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1603                     sb->level != refsb->level ||
1604                     sb->layout != refsb->layout ||
1605                     sb->chunksize != refsb->chunksize) {
1606                         pr_warn("md: %s has strangely different superblock to %s\n",
1607                                 bdevname(rdev->bdev,b),
1608                                 bdevname(refdev->bdev,b2));
1609                         return -EINVAL;
1610                 }
1611                 ev1 = le64_to_cpu(sb->events);
1612                 ev2 = le64_to_cpu(refsb->events);
1613
1614                 if (ev1 > ev2)
1615                         ret = 1;
1616                 else
1617                         ret = 0;
1618         }
1619         if (minor_version) {
1620                 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1621                 sectors -= rdev->data_offset;
1622         } else
1623                 sectors = rdev->sb_start;
1624         if (sectors < le64_to_cpu(sb->data_size))
1625                 return -EINVAL;
1626         rdev->sectors = le64_to_cpu(sb->data_size);
1627         return ret;
1628 }
1629
1630 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1631 {
1632         struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1633         __u64 ev1 = le64_to_cpu(sb->events);
1634
1635         rdev->raid_disk = -1;
1636         clear_bit(Faulty, &rdev->flags);
1637         clear_bit(In_sync, &rdev->flags);
1638         clear_bit(Bitmap_sync, &rdev->flags);
1639         clear_bit(WriteMostly, &rdev->flags);
1640
1641         if (mddev->raid_disks == 0) {
1642                 mddev->major_version = 1;
1643                 mddev->patch_version = 0;
1644                 mddev->external = 0;
1645                 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1646                 mddev->ctime = le64_to_cpu(sb->ctime);
1647                 mddev->utime = le64_to_cpu(sb->utime);
1648                 mddev->level = le32_to_cpu(sb->level);
1649                 mddev->clevel[0] = 0;
1650                 mddev->layout = le32_to_cpu(sb->layout);
1651                 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1652                 mddev->dev_sectors = le64_to_cpu(sb->size);
1653                 mddev->events = ev1;
1654                 mddev->bitmap_info.offset = 0;
1655                 mddev->bitmap_info.space = 0;
1656                 /* Default location for the bitmap is 1K after the
1657                  * superblock, using 3K - a total of 4K.
1658                  */
1659                 mddev->bitmap_info.default_offset = 1024 >> 9;
1660                 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1661                 mddev->reshape_backwards = 0;
1662
1663                 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1664                 memcpy(mddev->uuid, sb->set_uuid, 16);
1665
1666                 mddev->max_disks =  (4096-256)/2;
1667
1668                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1669                     mddev->bitmap_info.file == NULL) {
1670                         mddev->bitmap_info.offset =
1671                                 (__s32)le32_to_cpu(sb->bitmap_offset);
1672                         /* Metadata doesn't record how much space is available.
1673                          * For 1.0, we assume we can use up to the superblock
1674                          * if the bitmap is before it, else up to 4K beyond the
1675                          * superblock.  For others, assume no change is possible.
1676                          */
1677                         if (mddev->minor_version > 0)
1678                                 mddev->bitmap_info.space = 0;
1679                         else if (mddev->bitmap_info.offset > 0)
1680                                 mddev->bitmap_info.space =
1681                                         8 - mddev->bitmap_info.offset;
1682                         else
1683                                 mddev->bitmap_info.space =
1684                                         -mddev->bitmap_info.offset;
1685                 }
1686
1687                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1688                         mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1689                         mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1690                         mddev->new_level = le32_to_cpu(sb->new_level);
1691                         mddev->new_layout = le32_to_cpu(sb->new_layout);
1692                         mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1693                         if (mddev->delta_disks < 0 ||
1694                             (mddev->delta_disks == 0 &&
1695                              (le32_to_cpu(sb->feature_map)
1696                               & MD_FEATURE_RESHAPE_BACKWARDS)))
1697                                 mddev->reshape_backwards = 1;
1698                 } else {
1699                         mddev->reshape_position = MaxSector;
1700                         mddev->delta_disks = 0;
1701                         mddev->new_level = mddev->level;
1702                         mddev->new_layout = mddev->layout;
1703                         mddev->new_chunk_sectors = mddev->chunk_sectors;
1704                 }
1705
1706                 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1707                         set_bit(MD_HAS_JOURNAL, &mddev->flags);
1708
1709                 if (le32_to_cpu(sb->feature_map) &
1710                     (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1711                         if (le32_to_cpu(sb->feature_map) &
1712                             (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1713                                 return -EINVAL;
1714                         if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1715                             (le32_to_cpu(sb->feature_map) &
1716                                             MD_FEATURE_MULTIPLE_PPLS))
1717                                 return -EINVAL;
1718                         set_bit(MD_HAS_PPL, &mddev->flags);
1719                 }
1720         } else if (mddev->pers == NULL) {
1721                 /* Insist on a good event counter while assembling, except for
1722                  * spares (which don't need an event count) */
1723                 ++ev1;
1724                 if (rdev->desc_nr >= 0 &&
1725                     rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1726                     (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1727                      le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1728                         if (ev1 < mddev->events)
1729                                 return -EINVAL;
1730         } else if (mddev->bitmap) {
1731                 /* If adding to array with a bitmap, then we can accept an
1732                  * older device, but not too old.
1733                  */
1734                 if (ev1 < mddev->bitmap->events_cleared)
1735                         return 0;
1736                 if (ev1 < mddev->events)
1737                         set_bit(Bitmap_sync, &rdev->flags);
1738         } else {
1739                 if (ev1 < mddev->events)
1740                         /* just a hot-add of a new device, leave raid_disk at -1 */
1741                         return 0;
1742         }
1743         if (mddev->level != LEVEL_MULTIPATH) {
1744                 int role;
1745                 if (rdev->desc_nr < 0 ||
1746                     rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1747                         role = MD_DISK_ROLE_SPARE;
1748                         rdev->desc_nr = -1;
1749                 } else
1750                         role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1751                 switch(role) {
1752                 case MD_DISK_ROLE_SPARE: /* spare */
1753                         break;
1754                 case MD_DISK_ROLE_FAULTY: /* faulty */
1755                         set_bit(Faulty, &rdev->flags);
1756                         break;
1757                 case MD_DISK_ROLE_JOURNAL: /* journal device */
1758                         if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1759                                 /* journal device without journal feature */
1760                                 pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1761                                 return -EINVAL;
1762                         }
1763                         set_bit(Journal, &rdev->flags);
1764                         rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1765                         rdev->raid_disk = 0;
1766                         break;
1767                 default:
1768                         rdev->saved_raid_disk = role;
1769                         if ((le32_to_cpu(sb->feature_map) &
1770                              MD_FEATURE_RECOVERY_OFFSET)) {
1771                                 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1772                                 if (!(le32_to_cpu(sb->feature_map) &
1773                                       MD_FEATURE_RECOVERY_BITMAP))
1774                                         rdev->saved_raid_disk = -1;
1775                         } else
1776                                 set_bit(In_sync, &rdev->flags);
1777                         rdev->raid_disk = role;
1778                         break;
1779                 }
1780                 if (sb->devflags & WriteMostly1)
1781                         set_bit(WriteMostly, &rdev->flags);
1782                 if (sb->devflags & FailFast1)
1783                         set_bit(FailFast, &rdev->flags);
1784                 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1785                         set_bit(Replacement, &rdev->flags);
1786         } else /* MULTIPATH are always insync */
1787                 set_bit(In_sync, &rdev->flags);
1788
1789         return 0;
1790 }
1791
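/*
 * Rebuild the in-memory 1.x superblock image for rdev so that it
 * reflects the current state of mddev and of every member device,
 * including feature bits, reshape state, bad-block log and dev_roles[].
 */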
1792 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1793 {
1794         struct mdp_superblock_1 *sb;
1795         struct md_rdev *rdev2;
1796         int max_dev, i;
1797         /* make rdev->sb match mddev and rdev data. */
1798
1799         sb = page_address(rdev->sb_page);
1800
1801         sb->feature_map = 0;
1802         sb->pad0 = 0;
1803         sb->recovery_offset = cpu_to_le64(0);
1804         memset(sb->pad3, 0, sizeof(sb->pad3));
1805
1806         sb->utime = cpu_to_le64((__u64)mddev->utime);
1807         sb->events = cpu_to_le64(mddev->events);
1808         if (mddev->in_sync)
1809                 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1810         else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
1811                 sb->resync_offset = cpu_to_le64(MaxSector);
1812         else
1813                 sb->resync_offset = cpu_to_le64(0);
1814
1815         sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1816
1817         sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1818         sb->size = cpu_to_le64(mddev->dev_sectors);
1819         sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1820         sb->level = cpu_to_le32(mddev->level);
1821         sb->layout = cpu_to_le32(mddev->layout);
1822         if (test_bit(FailFast, &rdev->flags))
1823                 sb->devflags |= FailFast1;
1824         else
1825                 sb->devflags &= ~FailFast1;
1826
1827         if (test_bit(WriteMostly, &rdev->flags))
1828                 sb->devflags |= WriteMostly1;
1829         else
1830                 sb->devflags &= ~WriteMostly1;
1831         sb->data_offset = cpu_to_le64(rdev->data_offset);
1832         sb->data_size = cpu_to_le64(rdev->sectors);
1833
1834         if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1835                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1836                 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1837         }
1838
1839         if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1840             !test_bit(In_sync, &rdev->flags)) {
1841                 sb->feature_map |=
1842                         cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1843                 sb->recovery_offset =
1844                         cpu_to_le64(rdev->recovery_offset);
1845                 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1846                         sb->feature_map |=
1847                                 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1848         }
1849         /* Note: recovery_offset and journal_tail share space  */
1850         if (test_bit(Journal, &rdev->flags))
1851                 sb->journal_tail = cpu_to_le64(rdev->journal_tail);
1852         if (test_bit(Replacement, &rdev->flags))
1853                 sb->feature_map |=
1854                         cpu_to_le32(MD_FEATURE_REPLACEMENT);
1855
1856         if (mddev->reshape_position != MaxSector) {
1857                 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1858                 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1859                 sb->new_layout = cpu_to_le32(mddev->new_layout);
1860                 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1861                 sb->new_level = cpu_to_le32(mddev->new_level);
1862                 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1863                 if (mddev->delta_disks == 0 &&
1864                     mddev->reshape_backwards)
1865                         sb->feature_map
1866                                 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1867                 if (rdev->new_data_offset != rdev->data_offset) {
1868                         sb->feature_map
1869                                 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1870                         sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1871                                                              - rdev->data_offset));
1872                 }
1873         }
1874
1875         if (mddev_is_clustered(mddev))
1876                 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
1877
1878         if (rdev->badblocks.count == 0)
1879                 /* Nothing to do for bad blocks */ ;
1880         else if (sb->bblog_offset == 0)
1881                 /* Cannot record bad blocks on this device */
1882                 md_error(mddev, rdev);
1883         else {
1884                 struct badblocks *bb = &rdev->badblocks;
1885                 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1886                 u64 *p = bb->page;
1887                 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1888                 if (bb->changed) {
1889                         unsigned seq;
1890
1891 retry:
1892                         seq = read_seqbegin(&bb->lock);
1893
1894                         memset(bbp, 0xff, PAGE_SIZE);
1895
1896                         for (i = 0 ; i < bb->count ; i++) {
1897                                 u64 internal_bb = p[i];
1898                                 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1899                                                 | BB_LEN(internal_bb));
1900                                 bbp[i] = cpu_to_le64(store_bb);
1901                         }
1902                         bb->changed = 0;
1903                         if (read_seqretry(&bb->lock, seq))
1904                                 goto retry;
1905
1906                         bb->sector = (rdev->sb_start +
1907                                       (int)le32_to_cpu(sb->bblog_offset));
1908                         bb->size = le16_to_cpu(sb->bblog_size);
1909                 }
1910         }
1911
1912         max_dev = 0;
1913         rdev_for_each(rdev2, mddev)
1914                 if (rdev2->desc_nr+1 > max_dev)
1915                         max_dev = rdev2->desc_nr+1;
1916
1917         if (max_dev > le32_to_cpu(sb->max_dev)) {
1918                 int bmask;
1919                 sb->max_dev = cpu_to_le32(max_dev);
1920                 rdev->sb_size = max_dev * 2 + 256;
1921                 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1922                 if (rdev->sb_size & bmask)
1923                         rdev->sb_size = (rdev->sb_size | bmask) + 1;
1924         } else
1925                 max_dev = le32_to_cpu(sb->max_dev);
1926
1927         for (i=0; i<max_dev;i++)
1928                 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1929
1930         if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
1931                 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
1932
1933         if (test_bit(MD_HAS_PPL, &mddev->flags)) {
1934                 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
1935                         sb->feature_map |=
1936                             cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
1937                 else
1938                         sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
1939                 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
1940                 sb->ppl.size = cpu_to_le16(rdev->ppl.size);
1941         }
1942
1943         rdev_for_each(rdev2, mddev) {
1944                 i = rdev2->desc_nr;
1945                 if (test_bit(Faulty, &rdev2->flags))
1946                         sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
1947                 else if (test_bit(In_sync, &rdev2->flags))
1948                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1949                 else if (test_bit(Journal, &rdev2->flags))
1950                         sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
1951                 else if (rdev2->raid_disk >= 0)
1952                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1953                 else
1954                         sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1955         }
1956
1957         sb->sb_csum = calc_sb_1_csum(sb);
1958 }
1959
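/*
 * rdev_size_change for 1.x metadata.  Returns the new usable size in
 * sectors, or 0 if the change is not possible (component smaller than
 * the array, data offset being moved, or an unmovable internal bitmap).
 */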
1960 static unsigned long long
1961 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1962 {
1963         struct mdp_superblock_1 *sb;
1964         sector_t max_sectors;
1965         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1966                 return 0; /* component must fit device */
1967         if (rdev->data_offset != rdev->new_data_offset)
1968                 return 0; /* too confusing */
1969         if (rdev->sb_start < rdev->data_offset) {
1970                 /* minor versions 1 and 2; superblock before data */
1971                 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1972                 max_sectors -= rdev->data_offset;
1973                 if (!num_sectors || num_sectors > max_sectors)
1974                         num_sectors = max_sectors;
1975         } else if (rdev->mddev->bitmap_info.offset) {
1976                 /* minor version 0 with bitmap we can't move */
1977                 return 0;
1978         } else {
1979                 /* minor version 0; superblock after data */
1980                 sector_t sb_start;
1981                 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1982                 sb_start &= ~(sector_t)(4*2 - 1);
1983                 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1984                 if (!num_sectors || num_sectors > max_sectors)
1985                         num_sectors = max_sectors;
1986                 rdev->sb_start = sb_start;
1987         }
1988         sb = page_address(rdev->sb_page);
1989         sb->data_size = cpu_to_le64(num_sectors);
1990         sb->super_offset = cpu_to_le64(rdev->sb_start);
1991         sb->sb_csum = calc_sb_1_csum(sb);
1992         do {
1993                 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1994                                rdev->sb_page);
1995         } while (md_super_wait(rdev->mddev) < 0);
1996         return num_sectors;
1997
1998 }
1999
2000 static int
2001 super_1_allow_new_offset(struct md_rdev *rdev,
2002                          unsigned long long new_offset)
2003 {
2004         /* All necessary checks on new >= old have been done */
2005         struct bitmap *bitmap;
2006         if (new_offset >= rdev->data_offset)
2007                 return 1;
2008
2009         /* with 1.0 metadata, there is no metadata to tread on
2010          * so we can always move back */
2011         if (rdev->mddev->minor_version == 0)
2012                 return 1;
2013
2014         /* otherwise we must be sure not to step on
2015          * any metadata, so stay:
2016          * 36K beyond start of superblock
2017          * beyond end of badblocks
2018          * beyond write-intent bitmap
2019          */
2020         if (rdev->sb_start + (32+4)*2 > new_offset)
2021                 return 0;
2022         bitmap = rdev->mddev->bitmap;
2023         if (bitmap && !rdev->mddev->bitmap_info.file &&
2024             rdev->sb_start + rdev->mddev->bitmap_info.offset +
2025             bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2026                 return 0;
2027         if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2028                 return 0;
2029
2030         return 1;
2031 }
2032
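/*
 * Table of supported on-disk metadata formats, indexed by
 * mddev->major_version: 0 is the legacy 0.90 superblock, 1 covers the
 * 1.0/1.1/1.2 variants (distinguished by minor_version).
 */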
2033 static struct super_type super_types[] = {
2034         [0] = {
2035                 .name   = "0.90.0",
2036                 .owner  = THIS_MODULE,
2037                 .load_super         = super_90_load,
2038                 .validate_super     = super_90_validate,
2039                 .sync_super         = super_90_sync,
2040                 .rdev_size_change   = super_90_rdev_size_change,
2041                 .allow_new_offset   = super_90_allow_new_offset,
2042         },
2043         [1] = {
2044                 .name   = "md-1",
2045                 .owner  = THIS_MODULE,
2046                 .load_super         = super_1_load,
2047                 .validate_super     = super_1_validate,
2048                 .sync_super         = super_1_sync,
2049                 .rdev_size_change   = super_1_rdev_size_change,
2050                 .allow_new_offset   = super_1_allow_new_offset,
2051         },
2052 };
2053
2054 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2055 {
2056         if (mddev->sync_super) {
2057                 mddev->sync_super(mddev, rdev);
2058                 return;
2059         }
2060
2061         BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2062
2063         super_types[mddev->major_version].sync_super(mddev, rdev);
2064 }
2065
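/*
 * Return 1 if the two arrays share any underlying physical device
 * (ignoring faulty, journal and unassigned members), 0 otherwise.
 */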
2066 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2067 {
2068         struct md_rdev *rdev, *rdev2;
2069
2070         rcu_read_lock();
2071         rdev_for_each_rcu(rdev, mddev1) {
2072                 if (test_bit(Faulty, &rdev->flags) ||
2073                     test_bit(Journal, &rdev->flags) ||
2074                     rdev->raid_disk == -1)
2075                         continue;
2076                 rdev_for_each_rcu(rdev2, mddev2) {
2077                         if (test_bit(Faulty, &rdev2->flags) ||
2078                             test_bit(Journal, &rdev2->flags) ||
2079                             rdev2->raid_disk == -1)
2080                                 continue;
2081                         if (rdev->bdev->bd_contains ==
2082                             rdev2->bdev->bd_contains) {
2083                                 rcu_read_unlock();
2084                                 return 1;
2085                         }
2086                 }
2087         }
2088         rcu_read_unlock();
2089         return 0;
2090 }
2091
2092 static LIST_HEAD(pending_raid_disks);
2093
2094 /*
2095  * Try to register data integrity profile for an mddev
2096  *
2097  * This is called when an array is started and after a disk has been kicked
2098  * from the array. It only succeeds if all working and active component devices
2099  * are integrity capable with matching profiles.
2100  */
2101 int md_integrity_register(struct mddev *mddev)
2102 {
2103         struct md_rdev *rdev, *reference = NULL;
2104
2105         if (list_empty(&mddev->disks))
2106                 return 0; /* nothing to do */
2107         if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2108                 return 0; /* shouldn't register, or already is */
2109         rdev_for_each(rdev, mddev) {
2110                 /* skip spares and non-functional disks */
2111                 if (test_bit(Faulty, &rdev->flags))
2112                         continue;
2113                 if (rdev->raid_disk < 0)
2114                         continue;
2115                 if (!reference) {
2116                         /* Use the first rdev as the reference */
2117                         reference = rdev;
2118                         continue;
2119                 }
2120                 /* does this rdev's profile match the reference profile? */
2121                 if (blk_integrity_compare(reference->bdev->bd_disk,
2122                                 rdev->bdev->bd_disk) < 0)
2123                         return -EINVAL;
2124         }
2125         if (!reference || !bdev_get_integrity(reference->bdev))
2126                 return 0;
2127         /*
2128          * All component devices are integrity capable and have matching
2129          * profiles, register the common profile for the md device.
2130          */
2131         blk_integrity_register(mddev->gendisk,
2132                                bdev_get_integrity(reference->bdev));
2133
2134         pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2135         if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
2136                 pr_err("md: failed to create integrity pool for %s\n",
2137                        mdname(mddev));
2138                 return -EINVAL;
2139         }
2140         return 0;
2141 }
2142 EXPORT_SYMBOL(md_integrity_register);
2143
2144 /*
2145  * Attempt to add an rdev, but only if it is consistent with the current
2146  * integrity profile
2147  */
2148 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2149 {
2150         struct blk_integrity *bi_mddev;
2151         char name[BDEVNAME_SIZE];
2152
2153         if (!mddev->gendisk)
2154                 return 0;
2155
2156         bi_mddev = blk_get_integrity(mddev->gendisk);
2157
2158         if (!bi_mddev) /* nothing to do */
2159                 return 0;
2160
2161         if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2162                 pr_err("%s: incompatible integrity profile for %s\n",
2163                        mdname(mddev), bdevname(rdev->bdev, name));
2164                 return -ENXIO;
2165         }
2166
2167         return 0;
2168 }
2169 EXPORT_SYMBOL(md_integrity_add_rdev);
2170
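/*
 * Attach rdev to mddev: reject duplicates and undersized devices, pick a
 * unique desc_nr if needed, create the "dev-<name>" kobject and "block"
 * symlink in sysfs, and add the device to the array's disk list.
 */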
2171 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2172 {
2173         char b[BDEVNAME_SIZE];
2174         struct kobject *ko;
2175         int err;
2176
2177         /* prevent duplicates */
2178         if (find_rdev(mddev, rdev->bdev->bd_dev))
2179                 return -EEXIST;
2180
2181         if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
2182             mddev->pers)
2183                 return -EROFS;
2184
2185         /* make sure rdev->sectors exceeds mddev->dev_sectors */
2186         if (!test_bit(Journal, &rdev->flags) &&
2187             rdev->sectors &&
2188             (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2189                 if (mddev->pers) {
2190                         /* Cannot change size, so fail
2191                          * If mddev->level <= 0, then we don't care
2192                          * about aligning sizes (e.g. linear)
2193                          */
2194                         if (mddev->level > 0)
2195                                 return -ENOSPC;
2196                 } else
2197                         mddev->dev_sectors = rdev->sectors;
2198         }
2199
2200         /* Verify rdev->desc_nr is unique.
2201          * If it is -1, assign a free number, else
2202          * check that the number is not already in use.
2203          */
2204         rcu_read_lock();
2205         if (rdev->desc_nr < 0) {
2206                 int choice = 0;
2207                 if (mddev->pers)
2208                         choice = mddev->raid_disks;
2209                 while (md_find_rdev_nr_rcu(mddev, choice))
2210                         choice++;
2211                 rdev->desc_nr = choice;
2212         } else {
2213                 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2214                         rcu_read_unlock();
2215                         return -EBUSY;
2216                 }
2217         }
2218         rcu_read_unlock();
2219         if (!test_bit(Journal, &rdev->flags) &&
2220             mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2221                 pr_warn("md: %s: array is limited to %d devices\n",
2222                         mdname(mddev), mddev->max_disks);
2223                 return -EBUSY;
2224         }
2225         bdevname(rdev->bdev,b);
2226         strreplace(b, '/', '!');
2227
2228         rdev->mddev = mddev;
2229         pr_debug("md: bind<%s>\n", b);
2230
2231         if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2232                 goto fail;
2233
2234         ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2235         if (sysfs_create_link(&rdev->kobj, ko, "block"))
2236                 /* failure here is OK */;
2237         rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2238
2239         list_add_rcu(&rdev->same_set, &mddev->disks);
2240         bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2241
2242         /* May as well allow recovery to be retried once */
2243         mddev->recovery_disabled++;
2244
2245         return 0;
2246
2247  fail:
2248         pr_warn("md: failed to register dev-%s for %s\n",
2249                 b, mdname(mddev));
2250         return err;
2251 }
2252
2253 static void md_delayed_delete(struct work_struct *ws)
2254 {
2255         struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2256         kobject_del(&rdev->kobj);
2257         kobject_put(&rdev->kobj);
2258 }
2259
2260 static void unbind_rdev_from_array(struct md_rdev *rdev)
2261 {
2262         char b[BDEVNAME_SIZE];
2263
2264         bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2265         list_del_rcu(&rdev->same_set);
2266         pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
2267         rdev->mddev = NULL;
2268         sysfs_remove_link(&rdev->kobj, "block");
2269         sysfs_put(rdev->sysfs_state);
2270         rdev->sysfs_state = NULL;
2271         rdev->badblocks.count = 0;
2272         /* We need to delay this, otherwise we can deadlock when
2273          * writing 'remove' to "dev/state".  We also need
2274          * to delay it due to rcu usage.
2275          */
2276         synchronize_rcu();
2277         INIT_WORK(&rdev->del_work, md_delayed_delete);
2278         kobject_get(&rdev->kobj);
2279         queue_work(md_misc_wq, &rdev->del_work);
2280 }
2281
2282 /*
2283  * prevent the device from being mounted, repartitioned or
2284  * otherwise reused by a RAID array (or any other kernel
2285  * subsystem), by bd_claiming the device.
2286  */
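/*
 * The holder token passed to blkdev_get_by_dev() controls the exclusive
 * claim: for a shared lock a module-wide token (the address of lock_rdev)
 * is used, presumably so that concurrent md probes can share the claim,
 * otherwise the rdev itself is the holder.
 */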
2287 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2288 {
2289         int err = 0;
2290         struct block_device *bdev;
2291         char b[BDEVNAME_SIZE];
2292
2293         bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2294                                  shared ? (struct md_rdev *)lock_rdev : rdev);
2295         if (IS_ERR(bdev)) {
2296                 pr_warn("md: could not open %s.\n", __bdevname(dev, b));
2297                 return PTR_ERR(bdev);
2298         }
2299         rdev->bdev = bdev;
2300         return err;
2301 }
2302
2303 static void unlock_rdev(struct md_rdev *rdev)
2304 {
2305         struct block_device *bdev = rdev->bdev;
2306         rdev->bdev = NULL;
2307         blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2308 }
2309
2310 void md_autodetect_dev(dev_t dev);
2311
2312 static void export_rdev(struct md_rdev *rdev)
2313 {
2314         char b[BDEVNAME_SIZE];
2315
2316         pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
2317         md_rdev_clear(rdev);
2318 #ifndef MODULE
2319         if (test_bit(AutoDetected, &rdev->flags))
2320                 md_autodetect_dev(rdev->bdev->bd_dev);
2321 #endif
2322         unlock_rdev(rdev);
2323         kobject_put(&rdev->kobj);
2324 }
2325
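/*
 * Fully remove rdev from its array: detach it from the mddev's lists and
 * sysfs, then release the exclusive block-device claim and drop its
 * kobject reference.
 */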
2326 void md_kick_rdev_from_array(struct md_rdev *rdev)
2327 {
2328         unbind_rdev_from_array(rdev);
2329         export_rdev(rdev);
2330 }
2331 EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2332
2333 static void export_array(struct mddev *mddev)
2334 {
2335         struct md_rdev *rdev;
2336
2337         while (!list_empty(&mddev->disks)) {
2338                 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2339                                         same_set);
2340                 md_kick_rdev_from_array(rdev);
2341         }
2342         mddev->raid_disks = 0;
2343         mddev->major_version = 0;
2344 }
2345
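/*
 * Try to mark the array clean.  Waits for in-flight writes to drain by
 * switching writes_pending to atomic mode, then sets ->in_sync and flags
 * the superblock for a "clean" update.  Called with mddev->lock held
 * (it is dropped and retaken internally); returns the resulting
 * ->in_sync value.
 */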
2346 static bool set_in_sync(struct mddev *mddev)
2347 {
2348         lockdep_assert_held(&mddev->lock);
2349         if (!mddev->in_sync) {
2350                 mddev->sync_checkers++;
2351                 spin_unlock(&mddev->lock);
2352                 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2353                 spin_lock(&mddev->lock);
2354                 if (!mddev->in_sync &&
2355                     percpu_ref_is_zero(&mddev->writes_pending)) {
2356                         mddev->in_sync = 1;
2357                         /*
2358                          * Ensure ->in_sync is visible before we clear
2359                          * ->sync_checkers.
2360                          */
2361                         smp_mb();
2362                         set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2363                         sysfs_notify_dirent_safe(mddev->sysfs_state);
2364                 }
2365                 if (--mddev->sync_checkers == 0)
2366                         percpu_ref_switch_to_percpu(&mddev->writes_pending);
2367         }
2368         if (mddev->safemode == 1)
2369                 mddev->safemode = 0;
2370         return mddev->in_sync;
2371 }
2372
2373 static void sync_sbs(struct mddev *mddev, int nospares)
2374 {
2375         /* Update each superblock (in-memory image), but
2376          * if we are allowed to, skip spares which already
2377          * have the right event counter, or have one earlier
2378          * (which would mean they aren't being marked as dirty
2379          * with the rest of the array)
2380          */
2381         struct md_rdev *rdev;
2382         rdev_for_each(rdev, mddev) {
2383                 if (rdev->sb_events == mddev->events ||
2384                     (nospares &&
2385                      rdev->raid_disk < 0 &&
2386                      rdev->sb_events+1 == mddev->events)) {
2387                         /* Don't update this superblock */
2388                         rdev->sb_loaded = 2;
2389                 } else {
2390                         sync_super(mddev, rdev);
2391                         rdev->sb_loaded = 1;
2392                 }
2393         }
2394 }
2395
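/*
 * For clustered arrays: inspect one up-to-date member's superblock to
 * decide whether any device role or array geometry has actually changed,
 * so that a needless cluster-wide metadata update can be skipped.
 */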
2396 static bool does_sb_need_changing(struct mddev *mddev)
2397 {
2398         struct md_rdev *rdev;
2399         struct mdp_superblock_1 *sb;
2400         int role;
2401
2402         /* Find a good rdev */
2403         rdev_for_each(rdev, mddev)
2404                 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2405                         break;
2406
2407         /* No good device found. */
2408         if (!rdev)
2409                 return false;
2410
2411         sb = page_address(rdev->sb_page);
2412         /* Check if a device has become faulty or a spare has become active */
2413         rdev_for_each(rdev, mddev) {
2414                 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2415                 /* Device activated? */
2416                 if (role == 0xffff && rdev->raid_disk >= 0 &&
2417                     !test_bit(Faulty, &rdev->flags))
2418                         return true;
2419                 /* Device turned faulty? */
2420                 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2421                         return true;
2422         }
2423
2424         /* Check if any mddev parameters have changed */
2425         if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2426             (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2427             (mddev->layout != le32_to_cpu(sb->layout)) ||
2428             (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2429             (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2430                 return true;
2431
2432         return false;
2433 }
2434
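/*
 * Write the array metadata out to every member device.  Read-only arrays
 * only record that a device-set change is pending; non-persistent arrays
 * just clear the in-memory flags.  Otherwise the event count is advanced
 * (or, for a pure clean<->dirty transition, possibly rolled back), each
 * member's superblock image is refreshed and written, and the whole
 * sequence is repeated until it sticks.
 */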
2435 void md_update_sb(struct mddev *mddev, int force_change)
2436 {
2437         struct md_rdev *rdev;
2438         int sync_req;
2439         int nospares = 0;
2440         int any_badblocks_changed = 0;
2441         int ret = -1;
2442
2443         if (mddev->ro) {
2444                 if (force_change)
2445                         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2446                 return;
2447         }
2448
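        /* We come back here if the flags or the clean/dirty state changed
         * while the previous write was in flight and the metadata has to
         * be written again.
         */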
2449 repeat:
2450         if (mddev_is_clustered(mddev)) {
2451                 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2452                         force_change = 1;
2453                 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2454                         nospares = 1;
2455                 ret = md_cluster_ops->metadata_update_start(mddev);
2456                 /* Has someone else updated the sb? */
2457                 if (!does_sb_need_changing(mddev)) {
2458                         if (ret == 0)
2459                                 md_cluster_ops->metadata_update_cancel(mddev);
2460                         bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2461                                                          BIT(MD_SB_CHANGE_DEVS) |
2462                                                          BIT(MD_SB_CHANGE_CLEAN));
2463                         return;
2464                 }
2465         }
2466
2467         /*
2468          * First make sure individual recovery_offsets are correct.
2469          * curr_resync_completed can only be used during recovery.
2470          * During reshape/resync it might use array addresses rather
2471          * than device addresses.
2472          */
2473         rdev_for_each(rdev, mddev) {
2474                 if (rdev->raid_disk >= 0 &&
2475                     mddev->delta_disks >= 0 &&
2476                     test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2477                     test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2478                     !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2479                     !test_bit(Journal, &rdev->flags) &&
2480                     !test_bit(In_sync, &rdev->flags) &&
2481                     mddev->curr_resync_completed > rdev->recovery_offset)
2482                                 rdev->recovery_offset = mddev->curr_resync_completed;
2483
2484         }
2485         if (!mddev->persistent) {
2486                 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2487                 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2488                 if (!mddev->external) {
2489                         clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2490                         rdev_for_each(rdev, mddev) {
2491                                 if (rdev->badblocks.changed) {
2492                                         rdev->badblocks.changed = 0;
2493                                         ack_all_badblocks(&rdev->badblocks);
2494                                         md_error(mddev, rdev);
2495                                 }
2496                                 clear_bit(Blocked, &rdev->flags);
2497                                 clear_bit(BlockedBadBlocks, &rdev->flags);
2498                                 wake_up(&rdev->blocked_wait);
2499                         }
2500                 }
2501                 wake_up(&mddev->sb_wait);
2502                 return;
2503         }
2504
2505         spin_lock(&mddev->lock);
2506
2507         mddev->utime = ktime_get_real_seconds();
2508
2509         if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2510                 force_change = 1;
2511         if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2512                 /* just a clean <-> dirty transition, possibly leave spares alone,
2513                  * though if 'events' isn't the right even/odd, we will have to do
2514                  * the spares after all
2515                  */
2516                 nospares = 1;
2517         if (force_change)
2518                 nospares = 0;
2519         if (mddev->degraded)
2520                 /* If the array is degraded, then skipping spares is both
2521                  * dangerous and fairly pointless.
2522                  * Dangerous because a device that was removed from the array
2523                  * might have an event_count that still looks up-to-date,
2524                  * so it can be re-added without a resync.
2525                  * Pointless because if there are any spares to skip,
2526                  * then a recovery will happen and soon that array won't
2527                  * be degraded any more and the spare can go back to sleep then.
2528                  */
2529                 nospares = 0;
2530
2531         sync_req = mddev->in_sync;
2532
2533         /* If this is just a dirty<->clean transition, and the array is clean
2534          * and 'events' is odd, we can roll back to the previous clean state */
2535         if (nospares
2536             && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2537             && mddev->can_decrease_events
2538             && mddev->events != 1) {
2539                 mddev->events--;
2540                 mddev->can_decrease_events = 0;
2541         } else {
2542                 /* otherwise we have to go forward and ... */
2543                 mddev->events++;
2544                 mddev->can_decrease_events = nospares;
2545         }
2546
2547         /*
2548          * This 64-bit counter should never wrap.
2549          * Either we are in around ~1 trillion A.C., assuming
2550          * 1 reboot per second, or we have a bug...
2551          */
2552         WARN_ON(mddev->events == 0);
2553
2554         rdev_for_each(rdev, mddev) {
2555                 if (rdev->badblocks.changed)
2556                         any_badblocks_changed++;
2557                 if (test_bit(Faulty, &rdev->flags))
2558                         set_bit(FaultRecorded, &rdev->flags);
2559         }
2560
2561         sync_sbs(mddev, nospares);
2562         spin_unlock(&mddev->lock);
2563
2564         pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2565                  mdname(mddev), mddev->in_sync);
2566
2567         if (mddev->queue)
2568                 blk_add_trace_msg(mddev->queue, "md md_update_sb");
2569 rewrite:
2570         md_bitmap_update_sb(mddev->bitmap);
2571         rdev_for_each(rdev, mddev) {
2572                 char b[BDEVNAME_SIZE];
2573
2574                 if (rdev->sb_loaded != 1)
2575                         continue; /* no noise on spare devices */
2576
2577                 if (!test_bit(Faulty, &rdev->flags)) {
2578                         md_super_write(mddev,rdev,
2579                                        rdev->sb_start, rdev->sb_size,
2580                                        rdev->sb_page);
2581                         pr_debug("md: (write) %s's sb offset: %llu\n",
2582                                  bdevname(rdev->bdev, b),
2583                                  (unsigned long long)rdev->sb_start);
2584                         rdev->sb_events = mddev->events;
2585                         if (rdev->badblocks.size) {
2586                                 md_super_write(mddev, rdev,
2587                                                rdev->badblocks.sector,
2588                                                rdev->badblocks.size << 9,
2589                                                rdev->bb_page);
2590                                 rdev->badblocks.size = 0;
2591                         }
2592
2593                 } else
2594                         pr_debug("md: %s (skipping faulty)\n",
2595                                  bdevname(rdev->bdev, b));
2596
2597                 if (mddev->level == LEVEL_MULTIPATH)
2598                         /* only need to write one superblock... */
2599                         break;
2600         }
2601         if (md_super_wait(mddev) < 0)
2602                 goto rewrite;
2603         /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2604
2605         if (mddev_is_clustered(mddev) && ret == 0)
2606                 md_cluster_ops->metadata_update_finish(mddev);
2607
2608         if (mddev->in_sync != sync_req ||
2609             !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2610                                BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2611                 /* have to write it out again */
2612                 goto repeat;
2613         wake_up(&mddev->sb_wait);
2614         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2615                 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2616
2617         rdev_for_each(rdev, mddev) {
2618                 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2619                         clear_bit(Blocked, &rdev->flags);
2620
2621                 if (any_badblocks_changed)
2622                         ack_all_badblocks(&rdev->badblocks);
2623                 clear_bit(BlockedBadBlocks, &rdev->flags);
2624                 wake_up(&rdev->blocked_wait);
2625         }
2626 }
2627 EXPORT_SYMBOL(md_update_sb);
2628
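/*
 * Called for a device that has just been bound to a running array: when
 * the personality cannot hot-remove disks (or the device is a journal),
 * validate it against the array metadata and hand it to the personality
 * straight away; in any case flag a superblock update and kick the
 * recovery thread.
 */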
2629 static int add_bound_rdev(struct md_rdev *rdev)
2630 {
2631         struct mddev *mddev = rdev->mddev;
2632         int err = 0;
2633         bool add_journal = test_bit(Journal, &rdev->flags);
2634
2635         if (!mddev->pers->hot_remove_disk || add_journal) {
2636                 /* If there is hot_add_disk but no hot_remove_disk,
2637                  * then added disks are for geometry changes
2638                  * and should be added immediately.
2639                  */
2640                 super_types[mddev->major_version].
2641                         validate_super(mddev, rdev);
2642                 if (add_journal)
2643                         mddev_suspend(mddev);
2644                 err = mddev->pers->hot_add_disk(mddev, rdev);
2645                 if (add_journal)
2646                         mddev_resume(mddev);
2647                 if (err) {
2648                         md_kick_rdev_from_array(rdev);
2649                         return err;
2650                 }
2651         }
2652         sysfs_notify_dirent_safe(rdev->sysfs_state);
2653
2654         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2655         if (mddev->degraded)
2656                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2657         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2658         md_new_event(mddev);
2659         md_wakeup_thread(mddev->thread);
2660         return 0;
2661 }
2662
2663 /* Words written to sysfs files may, or may not, be \n terminated.
2664  * We want to accept either case.  For this we use cmd_match.
2665  */
2666 static int cmd_match(const char *cmd, const char *str)
2667 {
2668         /* See if cmd, written into a sysfs file, matches
2669          * str.  They must either be the same, or cmd can
2670          * have a trailing newline
2671          */
2672         while (*cmd && *str && *cmd == *str) {
2673                 cmd++;
2674                 str++;
2675         }
2676         if (*cmd == '\n')
2677                 cmd++;
2678         if (*str || *cmd)
2679                 return 0;
2680         return 1;
2681 }
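/* For example, cmd_match("remove\n", "remove") and
 * cmd_match("remove", "remove") both return 1, while
 * cmd_match("removed", "remove") returns 0.
 */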
2682
2683 struct rdev_sysfs_entry {
2684         struct attribute attr;
2685         ssize_t (*show)(struct md_rdev *, char *);
2686         ssize_t (*store)(struct md_rdev *, const char *, size_t);
2687 };
2688
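/*
 * Format the rdev flags as a comma-separated list for the sysfs "state"
 * attribute, e.g. "in_sync,write_mostly".
 */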
2689 static ssize_t
2690 state_show(struct md_rdev *rdev, char *page)
2691 {
2692         char *sep = ",";
2693         size_t len = 0;
2694         unsigned long flags = READ_ONCE(rdev->flags);
2695
2696         if (test_bit(Faulty, &flags) ||
2697             (!test_bit(ExternalBbl, &flags) &&
2698             rdev->badblocks.unacked_exist))
2699                 len += sprintf(page+len, "faulty%s", sep);
2700         if (test_bit(In_sync, &flags))
2701                 len += sprintf(page+len, "in_sync%s", sep);
2702         if (test_bit(Journal, &flags))
2703                 len += sprintf(page+len, "journal%s", sep);
2704         if (test_bit(WriteMostly, &flags))
2705                 len += sprintf(page+len, "write_mostly%s", sep);
2706         if (test_bit(Blocked, &flags) ||
2707             (rdev->badblocks.unacked_exist
2708              && !test_bit(Faulty, &flags)))
2709                 len += sprintf(page+len, "blocked%s", sep);
2710         if (!test_bit(Faulty, &flags) &&
2711             !test_bit(Journal, &flags) &&
2712             !test_bit(In_sync, &flags))
2713                 len += sprintf(page+len, "spare%s", sep);
2714         if (test_bit(WriteErrorSeen, &flags))
2715                 len += sprintf(page+len, "write_error%s", sep);
2716         if (test_bit(WantReplacement, &flags))
2717                 len += sprintf(page+len, "want_replacement%s", sep);
2718         if (test_bit(Replacement, &flags))
2719                 len += sprintf(page+len, "replacement%s", sep);
2720         if (test_bit(ExternalBbl, &flags))
2721                 len += sprintf(page+len, "external_bbl%s", sep);
2722         if (test_bit(FailFast, &flags))
2723                 len += sprintf(page+len, "failfast%s", sep);
2724
2725         if (len)
2726                 len -= strlen(sep);
2727
2728         return len+sprintf(page+len, "\n");
2729 }
2730
2731 static ssize_t
2732 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2733 {
2734         /* can write
2735          *  faulty  - simulates an error
2736          *  remove  - disconnects the device
2737          *  writemostly - sets write_mostly
2738          *  -writemostly - clears write_mostly
2739          *  blocked - sets the Blocked flag
2740          *  -blocked - clears the Blocked flag and possibly simulates an error
2741          *  insync - sets In_sync provided the device isn't active
2742          *  -insync - clears In_sync for a device with a slot assigned,
2743          *            so that it gets rebuilt based on the bitmap
2744          *  write_error - sets WriteErrorSeen
2745          *  -write_error - clears WriteErrorSeen
2746          *  {,-}failfast - set/clear FailFast
2747          */
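        /* Usage sketch (the path below is illustrative): these words are
         * written from user space to the per-device "state" attribute, e.g.
         *      echo -writemostly > /sys/block/md0/md/dev-sda1/state
         * to clear WriteMostly on that member device.
         */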
2748         int err = -EINVAL;
2749         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2750                 md_error(rdev->mddev, rdev);
2751                 if (test_bit(Faulty, &rdev->flags))
2752                         err = 0;
2753                 else
2754                         err = -EBUSY;
2755         } else if (cmd_match(buf, "remove")) {
2756                 if (rdev->mddev->pers) {
2757                         clear_bit(Blocked, &rdev->flags);
2758                         remove_and_add_spares(rdev->mddev, rdev);
2759                 }
2760                 if (rdev->raid_disk >= 0)
2761                         err = -EBUSY;
2762                 else {
2763                         struct mddev *mddev = rdev->mddev;
2764                         err = 0;
2765                         if (mddev_is_clustered(mddev))
2766                                 err = md_cluster_ops->remove_disk(mddev, rdev);
2767
2768                         if (err == 0) {
2769                                 md_kick_rdev_from_array(rdev);
2770                                 if (mddev->pers) {
2771                                         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2772                                         md_wakeup_thread(mddev->thread);
2773                                 }
2774                                 md_new_event(mddev);
2775                         }
2776                 }
2777         } else if (cmd_match(buf, "writemostly")) {
2778                 set_bit(WriteMostly, &rdev->flags);
2779                 err = 0;
2780         } else if (cmd_match(buf, "-writemostly")) {
2781                 clear_bit(WriteMostly, &rdev->flags);
2782                 err = 0;
2783         } else if (cmd_match(buf, "blocked")) {
2784                 set_bit(Blocked, &rdev->flags);
2785                 err = 0;
2786         } else if (cmd_match(buf, "-blocked")) {
2787                 if (!test_bit(Faulty, &rdev->flags) &&
2788                     !test_bit(ExternalBbl, &rdev->flags) &&
2789                     rdev->badblocks.unacked_exist) {
2790                         /* metadata handler doesn't understand badblocks,
2791                          * so we need to fail the device
2792                          */
2793                         md_error(rdev->mddev, rdev);
2794                 }
2795                 clear_bit(Blocked, &rdev->flags);
2796                 clear_bit(BlockedBadBlocks, &rdev->flags);
2797                 wake_up(&rdev->blocked_wait);
2798                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2799                 md_wakeup_thread(rdev->mddev->thread);
2800
2801                 err = 0;
2802         } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2803                 set_bit(In_sync, &rdev->flags);
2804                 err = 0;
2805         } else if (cmd_match(buf, "failfast")) {
2806                 set_bit(FailFast, &rdev->flags);
2807                 err = 0;
2808         } else if (cmd_match(buf, "-failfast")) {
2809                 clear_bit(FailFast, &rdev->flags);
2810                 err = 0;
2811         } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
2812                    !test_bit(Journal, &rdev->flags)) {
2813                 if (rdev->mddev->pers == NULL) {
2814                         clear_bit(In_sync, &rdev->flags);
2815                         rdev->saved_raid_disk = rdev->raid_disk;
2816                         rdev->raid_disk = -1;
2817                         err = 0;
2818                 }
2819         } else if (cmd_match(buf, "write_error")) {
2820                 set_bit(WriteErrorSeen, &rdev->flags);
2821                 err = 0;
2822         } else if (cmd_match(buf, "-write_error")) {
2823                 clear_bit(WriteErrorSeen, &rdev->flags);
2824                 err = 0;
2825         } else if (cmd_match(buf, "want_replacement")) {
2826                 /* Any non-spare device that is not a replacement can
2827                  * become want_replacement at any time, but we then need to
2828                  * check if recovery is needed.
2829                  */
2830                 if (rdev->raid_disk >= 0 &&
2831                     !test_bit(Journal, &rdev->flags) &&
2832                     !test_bit(Replacement, &rdev->flags))
2833                         set_bit(WantReplacement, &rdev->flags);
2834                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2835                 md_wakeup_thread(rdev->mddev->thread);
2836                 err = 0;
2837         } else if (cmd_match(buf, "-want_replacement")) {
2838                 /* Clearing 'want_replacement' is always allowed.
2839                  * Once replacement starts it is too late, though.
2840                  */
2841                 err = 0;
2842                 clear_bit(WantReplacement, &rdev->flags);
2843         } else if (cmd_match(buf, "replacement")) {
2844                 /* Can only set a device as a replacement when the array has not
2845                  * yet been started.  Once running, replacement is automatic
2846                  * from spares, or by assigning 'slot'.
2847                  */
2848                 if (rdev->mddev->pers)
2849                         err = -EBUSY;
2850                 else {
2851                         set_bit(Replacement, &rdev->flags);
2852                         err = 0;
2853                 }
2854         } else if (cmd_match(buf, "-replacement")) {
2855                 /* Similarly, can only clear Replacement before start */
2856                 if (rdev->mddev->pers)
2857                         err = -EBUSY;
2858                 else {
2859                         clear_bit(Replacement, &rdev->flags);
2860                         err = 0;
2861                 }
2862         } else if (cmd_match(buf, "re-add")) {
2863                 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
2864                         rdev->saved_raid_disk >= 0) {
2865                         /* clear_bit is performed _after_ all the devices
2866                          * have their local Faulty bit cleared. If any writes
2867                          * happen on the local node in the meantime, they
2868                          * will land in the local bitmap, which will be synced
2869                          * by this node eventually.
2870                          */
2871                         if (!mddev_is_clustered(rdev->mddev) ||
2872                             (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
2873                                 clear_bit(Faulty, &rdev->flags);
2874                                 err = add_bound_rdev(rdev);
2875                         }
2876                 } else
2877                         err = -EBUSY;
2878         } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
2879                 set_bit(ExternalBbl, &rdev->flags);
2880                 rdev->badblocks.shift = 0;
2881                 err = 0;
2882         } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
2883                 clear_bit(ExternalBbl, &rdev->flags);
2884                 err = 0;
2885         }
2886         if (!err)
2887                 sysfs_notify_dirent_safe(rdev->sysfs_state);
2888         return err ? err : len;
2889 }
2890 static struct rdev_sysfs_entry rdev_state =
2891 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
2892
2893 static ssize_t
2894 errors_show(struct md_rdev *rdev, char *page)
2895 {
2896         return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2897 }
2898
2899 static ssize_t
2900 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2901 {
2902         unsigned int n;
2903         int rv;
2904
2905         rv = kstrtouint(buf, 10, &n);
2906         if (rv < 0)
2907                 return rv;
2908         atomic_set(&rdev->corrected_errors, n);
2909         return len;
2910 }
2911 static struct rdev_sysfs_entry rdev_errors =
2912 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
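
/* "errors" exposes rdev->corrected_errors, the count of read errors that
 * were detected and subsequently corrected (for example by rewriting from
 * another copy); writing a number simply resets the counter to that value.
 */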
2913
2914 static ssize_t
2915 slot_show(struct md_rdev *rdev, char *page)
2916 {
2917         if (test_bit(Journal, &rdev->flags))
2918                 return sprintf(page, "journal\n");
2919         else if (rdev->raid_disk < 0)
2920                 return sprintf(page, "none\n");
2921         else
2922                 return sprintf(page, "%d\n", rdev->raid_disk);
2923 }
2924
2925 static ssize_t
2926 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2927 {
2928         int slot;
2929         int err;
2930
2931         if (test_bit(Journal, &rdev->flags))
2932                 return -EBUSY;
2933         if (strncmp(buf, "none", 4)==0)
2934                 slot = -1;
2935         else {
2936                 err = kstrtouint(buf, 10, (unsigned int *)&slot);
2937                 if (err < 0)
2938                         return err;
2939         }
2940         if (rdev->mddev->pers && slot == -1) {
2941                 /* Setting 'slot' on an active array also requires
2942                  * updating the 'rd%d' link, and communicating
2943                  * with the personality via ->hot_*_disk.
2944                  * For now we only support removing
2945                  * failed/spare devices.  This normally happens automatically,
2946                  * but not when the metadata is externally managed.
2947                  */
2948                 if (rdev->raid_disk == -1)
2949                         return -EEXIST;
2950                 /* personality does all needed checks */
2951                 if (rdev->mddev->pers->hot_remove_disk == NULL)
2952                         return -EINVAL;
2953                 clear_bit(Blocked, &rdev->flags);
2954                 remove_and_add_spares(rdev->mddev, rdev);
2955                 if (rdev->raid_disk >= 0)
2956                         return -EBUSY;
2957                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2958                 md_wakeup_thread(rdev->mddev->thread);
2959         } else if (rdev->mddev->pers) {
2960                 /* Activating a spare, or possibly reactivating it
2961                  * if we ever get bitmaps working here.
2962                  */
2963                 int err;
2964
2965                 if (rdev->raid_disk != -1)
2966                         return -EBUSY;
2967
2968                 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2969                         return -EBUSY;
2970
2971                 if (rdev->mddev->pers->hot_add_disk == NULL)
2972                         return -EINVAL;
2973
2974                 if (slot >= rdev->mddev->raid_disks &&
2975                     slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2976                         return -ENOSPC;
2977
2978                 rdev->raid_disk = slot;
2979                 if (test_bit(In_sync, &rdev->flags))
2980                         rdev->saved_raid_disk = slot;
2981                 else
2982                         rdev->saved_raid_disk = -1;
2983                 clear_bit(In_sync, &rdev->flags);
2984                 clear_bit(Bitmap_sync, &rdev->flags);
2985                 err = rdev->mddev->pers->
2986                         hot_add_disk(rdev->mddev, rdev);
2987                 if (err) {
2988                         rdev->raid_disk = -1;
2989                         return err;
2990                 } else
2991                         sysfs_notify_dirent_safe(rdev->sysfs_state);
2992                 if (sysfs_link_rdev(rdev->mddev, rdev))
2993                         /* failure here is OK */;
2994                 /* don't wake anyone up; leave that to userspace. */
2995         } else {
2996                 if (slot >= rdev->mddev->raid_disks &&
2997                     slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2998                         return -ENOSPC;
2999                 rdev->raid_disk = slot;
3000                 /* assume it is working */
3001                 clear_bit(Faulty, &rdev->flags);
3002                 clear_bit(WriteMostly, &rdev->flags);
3003                 set_bit(In_sync, &rdev->flags);
3004                 sysfs_notify_dirent_safe(rdev->sysfs_state);
3005         }
3006         return len;
3007 }
3008
3009 static struct rdev_sysfs_entry rdev_slot =
3010 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
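
/* Usage sketch (device names are illustrative): on a running array,
 *      echo 2 > /sys/block/md0/md/dev-sdb1/slot
 * asks the personality to activate this device in role 2, while
 *      echo none > /sys/block/md0/md/dev-sdb1/slot
 * detaches a failed or spare device from its slot, as handled above.
 */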
3011
3012 static ssize_t
3013 offset_show(struct md_rdev *rdev, char *page)
3014 {
3015         return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3016 }
3017
3018 static ssize_t
3019 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3020 {
3021         unsigned long long offset;
3022         if (kstrtoull(buf, 10, &offset) < 0)
3023                 return -EINVAL;
3024         if (rdev->mddev->pers && rdev->raid_disk >= 0)
3025                 return -EBUSY;
3026         if (rdev->sectors && rdev->mddev->external)
3027                 /* Must set offset before size, so overlap checks
3028                  * can be sane */
3029                 return -EBUSY;
3030         rdev->data_offset = offset;
3031         rdev->new_data_offset = offset;
3032         return len;
3033 }
3034
3035 static struct rdev_sysfs_entry rdev_offset =
3036 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
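
/* "offset" is rdev->data_offset: where the data area starts on the member
 * device, in 512-byte sectors.  As enforced above, it cannot be changed
 * while the device is an active member of a running array.
 */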
3037
3038 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3039 {
3040         return sprintf(page, "%llu\n",
3041                        (unsigned long long)rdev->new_data_offset);
3042 }
3043
3044 static ssize_t new_offset_store(struct md_rdev *rdev,
3045                                 const char *buf, size_t len)
3046 {
3047         unsigned long long new_offset;
3048         struct mddev *mddev = rdev->mddev;
3049
3050         if (kstrtoull(buf, 10, &new_offset) < 0)
3051                 return -EINVAL;
3052
3053         if (mddev->sync_thread ||
3054             test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
3055                 return -EBUSY;
3056         if (new_offset == rdev->data_offset)
3057                 /* reset is always permitted */
3058                 ;
3059         else if (new_offset > rdev->data_offset) {
3060                 /* must not push the array size beyond rdev->sectors */
3061                 if (new_offset - rdev->data_offset
3062                     + mddev->dev_sectors > rdev->sectors)
3063                         return -E2BIG;
3064         }
3065         /* The metadata handler worries about other space details. */
3066
3067         /* decreasing the offset is inconsistent with a backwards
3068          * reshape.
3069          */
3070         if (new_offset < rdev->data_offset &&
3071             mddev->reshape_backwards)
3072                 return -EINVAL;
3073         /* Increasing offset is inconsistent with forwards
3074          * reshape.  reshape_direction should be set to
3075          * 'backwards' first.
3076          */
3077         if (new_offset > rdev->data_offset &&
3078             !mddev->reshape_backwards)
3079                 return -EINVAL;
3080
3081         if (mddev->pers && mddev->persistent &&
3082             !super_types[mddev->major_version]
3083             .allow_new_offset(rdev, new_offset))
3084                 return -E2BIG;
3085         rdev->new_data_offset = new_offset;
3086         if (new_offset > rdev->data_offset)
3087                 mddev->reshape_backwards = 1;
3088         else if (new_offset < rdev->data_offset)
3089                 mddev->reshape_backwards = 0;
3090
3091         return len;
3092 }
3093 static struct rdev_sysfs_entry rdev_new_offset =
3094 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
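
/* "new_offset" records where the data area will start once a reshape that
 * relocates data completes; while such a reshape is pending, data_offset
 * and new_data_offset differ, and the checks above keep the direction of
 * the move consistent with mddev->reshape_backwards.
 */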
3095
3096 static ssize_t
3097 rdev_size_show(struct md_rdev *rdev, char *page)
3098 {
3099         return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3100 }
3101
3102 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
3103 {
3104         /* check if two start/length pairs overlap */
3105         if (s1+l1 <= s2)
3106                 return 0;
3107         if (s2+l2 <= s1)
3108                 return 0;
3109         return 1;
3110 }
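
/* Worked example: overlaps(100, 50, 140, 30) returns 1 because [100,150)
 * and [140,170) share sectors, while overlaps(100, 50, 150, 10) returns 0
 * because the first range ends exactly where the second begins.
 */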
3111
3112 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3113 {
3114         unsigned long long blocks;
3115         sector_t new;
3116
3117         if (kstrtoull(buf, 10, &blocks) < 0)
3118                 return -EINVAL;
3119
3120         if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3121                 return -EINVAL; /* sector conversion overflow */
3122
3123         new = blocks * 2;
3124         if (new != blocks * 2)
3125                 return -EINVAL; /* unsigned long long to sector_t overflow */
3126
3127         *sectors = new;
3128         return 0;
3129 }
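
/* Sizes in these sysfs files are expressed in 1 KiB blocks, so the
 * conversion above is simply blocks * 2 sectors: a write of "1048576"
 * (1 GiB) becomes 2097152 512-byte sectors, and any value whose doubled
 * result would overflow is rejected.
 */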
3130
3131 static ssize_t
3132 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3133 {
3134         struct mddev *my_mddev = rdev->mddev;
3135         sector_t oldsectors = rdev->sectors;
3136         sector_t sectors;
3137
3138         if (test_bit(Journal, &rdev->flags))
3139                 return -EBUSY;
3140         if (strict_blocks_to_sectors(buf, &sectors) < 0)
3141                 return -EINVAL;
3142         if (rdev->data_offset != rdev->new_data_offset)
3143                 return -EINVAL; /* too confusing */
3144         if (my_mddev->pers && rdev->raid_disk >= 0) {
3145                 if (my_mddev->persistent) {
3146                         sectors = super_types[my_mddev->major_version].
3147                                 rdev_size_change(rdev, sectors);
3148                         if (!sectors)
3149                                 return -EBUSY;
3150                 } else if (!sectors)
3151                         sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
3152                                 rdev->data_offset;
3153                 if (!my_mddev->pers->resize)
3154                         /* Cannot change size for RAID0 or Linear etc */
3155                         return -EINVAL;
3156         }
3157         if (sectors < my_mddev->dev_sectors)
3158                 return -EINVAL; /* component must fit device */
3159
3160         rdev->sectors = sectors;
3161         if (sectors > oldsectors && my_mddev->external) {
3162                 /* Need to check that all other rdevs with the same
3163                  * ->bdev do not overlap.  'rcu' is sufficient to walk
3164                  * the rdev lists safely.
3165                  * This check does not provide a hard guarantee; it
3166                  * just helps avoid dangerous mistakes.
3167                  */
3168                 struct mddev *mddev;
3169                 int overlap = 0;
3170                 struct list_head *tmp;
3171
3172                 rcu_read_lock();
3173                 for_each_mddev(mddev, tmp) {
3174                         struct md_rdev *rdev2;
3175
3176                         rdev_for_each(rdev2, mddev)
3177                                 if (rdev->bdev == rdev2->bdev &&
3178                                     rdev != rdev2 &&
3179                                     overlaps(rdev->data_offset, rdev->sectors,
3180                                              rdev2->data_offset,
3181                                              rdev2->sectors)) {
3182                                         overlap = 1;
3183                                         break;
3184                                 }
3185                         if (overlap) {
3186                                 mddev_put(mddev);
3187                                 break;
3188                         }
3189                 }