Merge branch 'x86-64'
[sfrench/cifs-2.6.git] / drivers / md / md.c
index f19b874753a9c4aec3802b557ab05100f387ff61..306268ec99ff11572e975145f3f916ee4144cf63 100644 (file)
@@ -44,6 +44,7 @@
 #include <linux/suspend.h>
 #include <linux/poll.h>
 #include <linux/mutex.h>
+#include <linux/ctype.h>
 
 #include <linux/init.h>
 
@@ -72,6 +73,10 @@ static void autostart_arrays (int part);
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
+static void md_print_devices(void);
+
+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
+
 /*
  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  * is 1000 KB/sec, so the extra system load does not show up that much.
@@ -170,7 +175,7 @@ EXPORT_SYMBOL_GPL(md_new_event);
 /* Alternate version that can be called from interrupts
  * when calling sysfs_notify isn't needed.
  */
-void md_new_event_inintr(mddev_t *mddev)
+static void md_new_event_inintr(mddev_t *mddev)
 {
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
@@ -732,6 +737,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 {
        mdp_disk_t *desc;
        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+       __u64 ev1 = md_event(sb);
 
        rdev->raid_disk = -1;
        rdev->flags = 0;
@@ -748,7 +754,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->layout = sb->layout;
                mddev->raid_disks = sb->raid_disks;
                mddev->size = sb->size;
-               mddev->events = md_event(sb);
+               mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 
@@ -797,7 +803,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
-               __u64 ev1 = md_event(sb);
                ++ev1;
                if (ev1 < mddev->events) 
                        return -EINVAL;
@@ -805,19 +810,21 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                /* if adding to array with a bitmap, then we can accept an
                 * older device ... but not too old.
                 */
-               __u64 ev1 = md_event(sb);
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
-       } else /* just a hot-add of a new device, leave raid_disk at -1 */
-               return 0;
+       } else {
+               if (ev1 < mddev->events)
+                       /* just a hot-add of a new device, leave raid_disk at -1 */
+                       return 0;
+       }
 
        if (mddev->level != LEVEL_MULTIPATH) {
                desc = sb->disks + rdev->desc_nr;
 
                if (desc->state & (1<<MD_DISK_FAULTY))
                        set_bit(Faulty, &rdev->flags);
-               else if (desc->state & (1<<MD_DISK_SYNC) &&
-                        desc->raid_disk < mddev->raid_disks) {
+               else if (desc->state & (1<<MD_DISK_SYNC) /* &&
+                           desc->raid_disk < mddev->raid_disks */) {
                        set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = desc->raid_disk;
                }
@@ -1100,6 +1107,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 {
        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+       __u64 ev1 = le64_to_cpu(sb->events);
 
        rdev->raid_disk = -1;
        rdev->flags = 0;
@@ -1115,7 +1123,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->layout = le32_to_cpu(sb->layout);
                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
                mddev->size = le64_to_cpu(sb->size)/2;
-               mddev->events = le64_to_cpu(sb->events);
+               mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = 1024 >> 9;
                
@@ -1149,7 +1157,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
        } else if (mddev->pers == NULL) {
                /* Insist of good event counter while assembling */
-               __u64 ev1 = le64_to_cpu(sb->events);
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
@@ -1157,12 +1164,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                /* If adding to array with a bitmap, then we can accept an
                 * older device, but not too old.
                 */
-               __u64 ev1 = le64_to_cpu(sb->events);
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
-       } else /* just a hot-add of a new device, leave raid_disk at -1 */
-               return 0;
-
+       } else {
+               if (ev1 < mddev->events)
+                       /* just a hot-add of a new device, leave raid_disk at -1 */
+                       return 0;
+       }
        if (mddev->level != LEVEL_MULTIPATH) {
                int role;
                rdev->desc_nr = le32_to_cpu(sb->dev_number);
@@ -1174,7 +1182,11 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                        set_bit(Faulty, &rdev->flags);
                        break;
                default:
-                       set_bit(In_sync, &rdev->flags);
+                       if ((le32_to_cpu(sb->feature_map) &
+                            MD_FEATURE_RECOVERY_OFFSET))
+                               rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+                       else
+                               set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = role;
                        break;
                }
@@ -1198,6 +1210,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
        sb->feature_map = 0;
        sb->pad0 = 0;
+       sb->recovery_offset = cpu_to_le64(0);
        memset(sb->pad1, 0, sizeof(sb->pad1));
        memset(sb->pad2, 0, sizeof(sb->pad2));
        memset(sb->pad3, 0, sizeof(sb->pad3));
@@ -1218,6 +1231,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
        }
+
+       if (rdev->raid_disk >= 0 &&
+           !test_bit(In_sync, &rdev->flags) &&
+           rdev->recovery_offset > 0) {
+               sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+               sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+       }
+
        if (mddev->reshape_position != MaxSector) {
                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
                sb->reshape_position = cpu_to_le64(mddev->reshape_position);
@@ -1242,11 +1263,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
                else if (test_bit(In_sync, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+               else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
+                       sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
                else
                        sb->dev_roles[i] = cpu_to_le16(0xffff);
        }
 
-       sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
        sb->sb_csum = calc_sb_1_csum(sb);
 }
 
@@ -1507,7 +1529,7 @@ static void print_rdev(mdk_rdev_t *rdev)
                printk(KERN_INFO "md: no rdev superblock!\n");
 }
 
-void md_print_devices(void)
+static void md_print_devices(void)
 {
        struct list_head *tmp, *tmp2;
        mdk_rdev_t *rdev;
@@ -1536,15 +1558,30 @@ void md_print_devices(void)
 }
 
 
-static void sync_sbs(mddev_t * mddev)
+static void sync_sbs(mddev_t * mddev, int nospares)
 {
+       /* Update each superblock (in-memory image), but
+        * if we are allowed to, skip spares which already
+        * have the right event counter, or have one earlier
+        * (which would mean they aren't being marked as dirty
+        * with the rest of the array)
+        */
        mdk_rdev_t *rdev;
        struct list_head *tmp;
 
        ITERATE_RDEV(mddev,rdev,tmp) {
-               super_types[mddev->major_version].
-                       sync_super(mddev, rdev);
-               rdev->sb_loaded = 1;
+               if (rdev->sb_events == mddev->events ||
+                   (nospares &&
+                    rdev->raid_disk < 0 &&
+                    (rdev->sb_events&1)==0 &&
+                    rdev->sb_events+1 == mddev->events)) {
+                       /* Don't update this superblock */
+                       rdev->sb_loaded = 2;
+               } else {
+                       super_types[mddev->major_version].
+                               sync_super(mddev, rdev);
+                       rdev->sb_loaded = 1;
+               }
        }
 }
 
@@ -1554,12 +1591,42 @@ void md_update_sb(mddev_t * mddev)
        struct list_head *tmp;
        mdk_rdev_t *rdev;
        int sync_req;
+       int nospares = 0;
 
 repeat:
        spin_lock_irq(&mddev->write_lock);
        sync_req = mddev->in_sync;
        mddev->utime = get_seconds();
-       mddev->events ++;
+       if (mddev->sb_dirty == 3)
+               /* just a clean<-> dirty transition, possibly leave spares alone,
+                * though if events isn't the right even/odd, we will have to do
+                * spares after all
+                */
+               nospares = 1;
+
+       /* If this is just a dirty<->clean transition, and the array is clean
+        * and 'events' is odd, we can roll back to the previous clean state */
+       if (mddev->sb_dirty == 3
+           && (mddev->in_sync && mddev->recovery_cp == MaxSector)
+           && (mddev->events & 1))
+               mddev->events--;
+       else {
+               /* otherwise we have to go forward and ... */
+               mddev->events ++;
+               if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
+                       /* .. if the array isn't clean, insist on an odd 'events' */
+                       if ((mddev->events&1)==0) {
+                               mddev->events++;
+                               nospares = 0;
+                       }
+               } else {
+                       /* otherwise insist on an even 'events' (for clean states) */
+                       if ((mddev->events&1)) {
+                               mddev->events++;
+                               nospares = 0;
+                       }
+               }
+       }
 
        if (!mddev->events) {
                /*
@@ -1571,7 +1638,7 @@ repeat:
                mddev->events --;
        }
        mddev->sb_dirty = 2;
-       sync_sbs(mddev);
+       sync_sbs(mddev, nospares);
 
        /*
         * do not write anything to disk if using
@@ -1593,6 +1660,8 @@ repeat:
        ITERATE_RDEV(mddev,rdev,tmp) {
                char b[BDEVNAME_SIZE];
                dprintk(KERN_INFO "md: ");
+               if (rdev->sb_loaded != 1)
+                       continue; /* no noise on spare devices */
                if (test_bit(Faulty, &rdev->flags))
                        dprintk("(skipping faulty ");
 
@@ -1604,6 +1673,7 @@ repeat:
                        dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
                                bdevname(rdev->bdev,b),
                                (unsigned long long)rdev->sb_offset);
+                       rdev->sb_events = mddev->events;
 
                } else
                        dprintk(")\n");
@@ -1667,6 +1737,10 @@ state_show(mdk_rdev_t *rdev, char *page)
                len += sprintf(page+len, "%sin_sync",sep);
                sep = ",";
        }
+       if (test_bit(WriteMostly, &rdev->flags)) {
+               len += sprintf(page+len, "%swrite_mostly",sep);
+               sep = ",";
+       }
        if (!test_bit(Faulty, &rdev->flags) &&
            !test_bit(In_sync, &rdev->flags)) {
                len += sprintf(page+len, "%sspare", sep);
@@ -1675,8 +1749,40 @@ state_show(mdk_rdev_t *rdev, char *page)
        return len+sprintf(page+len, "\n");
 }
 
+static ssize_t
+state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+       /* can write
+        *  faulty  - simulates and error
+        *  remove  - disconnects the device
+        *  writemostly - sets write_mostly
+        *  -writemostly - clears write_mostly
+        */
+       int err = -EINVAL;
+       if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
+               md_error(rdev->mddev, rdev);
+               err = 0;
+       } else if (cmd_match(buf, "remove")) {
+               if (rdev->raid_disk >= 0)
+                       err = -EBUSY;
+               else {
+                       mddev_t *mddev = rdev->mddev;
+                       kick_rdev_from_array(rdev);
+                       md_update_sb(mddev);
+                       md_new_event(mddev);
+                       err = 0;
+               }
+       } else if (cmd_match(buf, "writemostly")) {
+               set_bit(WriteMostly, &rdev->flags);
+               err = 0;
+       } else if (cmd_match(buf, "-writemostly")) {
+               clear_bit(WriteMostly, &rdev->flags);
+               err = 0;
+       }
+       return err ? err : len;
+}
 static struct rdev_sysfs_entry
-rdev_state = __ATTR_RO(state);
+rdev_state = __ATTR(state, 0644, state_show, state_store);
 
 static ssize_t
 super_show(mdk_rdev_t *rdev, char *page)
@@ -1873,6 +1979,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
        rdev->desc_nr = -1;
        rdev->flags = 0;
        rdev->data_offset = 0;
+       rdev->sb_events = 0;
        atomic_set(&rdev->nr_pending, 0);
        atomic_set(&rdev->read_errors, 0);
        atomic_set(&rdev->corrected_errors, 0);
@@ -1977,6 +2084,54 @@ static void analyze_sbs(mddev_t * mddev)
 
 }
 
+static ssize_t
+safe_delay_show(mddev_t *mddev, char *page)
+{
+       int msec = (mddev->safemode_delay*1000)/HZ;
+       return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
+}
+static ssize_t
+safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
+{
+       int scale=1;
+       int dot=0;
+       int i;
+       unsigned long msec;
+       char buf[30];
+       char *e;
+       /* remove a period, and count digits after it */
+       if (len >= sizeof(buf))
+               return -EINVAL;
+       strlcpy(buf, cbuf, len);
+       buf[len] = 0;
+       for (i=0; i<len; i++) {
+               if (dot) {
+                       if (isdigit(buf[i])) {
+                               buf[i-1] = buf[i];
+                               scale *= 10;
+                       }
+                       buf[i] = 0;
+               } else if (buf[i] == '.') {
+                       dot=1;
+                       buf[i] = 0;
+               }
+       }
+       msec = simple_strtoul(buf, &e, 10);
+       if (e == buf || (*e && *e != '\n'))
+               return -EINVAL;
+       msec = (msec * 1000) / scale;
+       if (msec == 0)
+               mddev->safemode_delay = 0;
+       else {
+               mddev->safemode_delay = (msec*HZ)/1000;
+               if (mddev->safemode_delay == 0)
+                       mddev->safemode_delay = 1;
+       }
+       return len;
+}
+static struct md_sysfs_entry md_safe_delay =
+__ATTR(safe_mode_delay, 0644,safe_delay_show, safe_delay_store);
+
 static ssize_t
 level_show(mddev_t *mddev, char *page)
 {
@@ -2012,6 +2167,32 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_level =
 __ATTR(level, 0644, level_show, level_store);
 
+
+static ssize_t
+layout_show(mddev_t *mddev, char *page)
+{
+       /* just a number, not meaningful for all levels */
+       return sprintf(page, "%d\n", mddev->layout);
+}
+
+static ssize_t
+layout_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       char *e;
+       unsigned long n = simple_strtoul(buf, &e, 10);
+       if (mddev->pers)
+               return -EBUSY;
+
+       if (!*buf || (*e && *e != '\n'))
+               return -EINVAL;
+
+       mddev->layout = n;
+       return len;
+}
+static struct md_sysfs_entry md_layout =
+__ATTR(layout, 0655, layout_show, layout_store);
+
+
 static ssize_t
 raid_disks_show(mddev_t *mddev, char *page)
 {
@@ -2066,6 +2247,200 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_chunk_size =
 __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
 
+static ssize_t
+resync_start_show(mddev_t *mddev, char *page)
+{
+       return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
+}
+
+static ssize_t
+resync_start_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       /* can only set chunk_size if array is not yet active */
+       char *e;
+       unsigned long long n = simple_strtoull(buf, &e, 10);
+
+       if (mddev->pers)
+               return -EBUSY;
+       if (!*buf || (*e && *e != '\n'))
+               return -EINVAL;
+
+       mddev->recovery_cp = n;
+       return len;
+}
+static struct md_sysfs_entry md_resync_start =
+__ATTR(resync_start, 0644, resync_start_show, resync_start_store);
+
+/*
+ * The array state can be:
+ *
+ * clear
+ *     No devices, no size, no level
+ *     Equivalent to STOP_ARRAY ioctl
+ * inactive
+ *     May have some settings, but array is not active
+ *        all IO results in error
+ *     When written, doesn't tear down array, but just stops it
+ * suspended (not supported yet)
+ *     All IO requests will block. The array can be reconfigured.
+ *     Writing this, if accepted, will block until array is quiessent
+ * readonly
+ *     no resync can happen.  no superblocks get written.
+ *     write requests fail
+ * read-auto
+ *     like readonly, but behaves like 'clean' on a write request.
+ *
+ * clean - no pending writes, but otherwise active.
+ *     When written to inactive array, starts without resync
+ *     If a write request arrives then
+ *       if metadata is known, mark 'dirty' and switch to 'active'.
+ *       if not known, block and switch to write-pending
+ *     If written to an active array that has pending writes, then fails.
+ * active
+ *     fully active: IO and resync can be happening.
+ *     When written to inactive array, starts with resync
+ *
+ * write-pending
+ *     clean, but writes are blocked waiting for 'active' to be written.
+ *
+ * active-idle
+ *     like active, but no writes have been seen for a while (100msec).
+ *
+ */
+enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
+                  write_pending, active_idle, bad_word};
+static char *array_states[] = {
+       "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
+       "write-pending", "active-idle", NULL };
+
+static int match_word(const char *word, char **list)
+{
+       int n;
+       for (n=0; list[n]; n++)
+               if (cmd_match(word, list[n]))
+                       break;
+       return n;
+}
+
+static ssize_t
+array_state_show(mddev_t *mddev, char *page)
+{
+       enum array_state st = inactive;
+
+       if (mddev->pers)
+               switch(mddev->ro) {
+               case 1:
+                       st = readonly;
+                       break;
+               case 2:
+                       st = read_auto;
+                       break;
+               case 0:
+                       if (mddev->in_sync)
+                               st = clean;
+                       else if (mddev->safemode)
+                               st = active_idle;
+                       else
+                               st = active;
+               }
+       else {
+               if (list_empty(&mddev->disks) &&
+                   mddev->raid_disks == 0 &&
+                   mddev->size == 0)
+                       st = clear;
+               else
+                       st = inactive;
+       }
+       return sprintf(page, "%s\n", array_states[st]);
+}
+
+static int do_md_stop(mddev_t * mddev, int ro);
+static int do_md_run(mddev_t * mddev);
+static int restart_array(mddev_t *mddev);
+
+static ssize_t
+array_state_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       int err = -EINVAL;
+       enum array_state st = match_word(buf, array_states);
+       switch(st) {
+       case bad_word:
+               break;
+       case clear:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       if (atomic_read(&mddev->active) > 1)
+                               return -EBUSY;
+                       err = do_md_stop(mddev, 0);
+               }
+               break;
+       case inactive:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       if (atomic_read(&mddev->active) > 1)
+                               return -EBUSY;
+                       err = do_md_stop(mddev, 2);
+               }
+               break;
+       case suspended:
+               break; /* not supported yet */
+       case readonly:
+               if (mddev->pers)
+                       err = do_md_stop(mddev, 1);
+               else {
+                       mddev->ro = 1;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case read_auto:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       err = do_md_stop(mddev, 1);
+                       if (err == 0)
+                               mddev->ro = 2; /* FIXME mark devices writable */
+               } else {
+                       mddev->ro = 2;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case clean:
+               if (mddev->pers) {
+                       restart_array(mddev);
+                       spin_lock_irq(&mddev->write_lock);
+                       if (atomic_read(&mddev->writes_pending) == 0) {
+                               mddev->in_sync = 1;
+                               mddev->sb_dirty = 1;
+                       }
+                       spin_unlock_irq(&mddev->write_lock);
+               } else {
+                       mddev->ro = 0;
+                       mddev->recovery_cp = MaxSector;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case active:
+               if (mddev->pers) {
+                       restart_array(mddev);
+                       mddev->sb_dirty = 0;
+                       wake_up(&mddev->sb_wait);
+                       err = 0;
+               } else {
+                       mddev->ro = 0;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case write_pending:
+       case active_idle:
+               /* these cannot be set */
+               break;
+       }
+       if (err)
+               return err;
+       else
+               return len;
+}
+static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store);
+
 static ssize_t
 null_show(mddev_t *mddev, char *page)
 {
@@ -2428,11 +2803,15 @@ __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
 
 static struct attribute *md_default_attrs[] = {
        &md_level.attr,
+       &md_layout.attr,
        &md_raid_disks.attr,
        &md_chunk_size.attr,
        &md_size.attr,
+       &md_resync_start.attr,
        &md_metadata.attr,
        &md_new_device.attr,
+       &md_safe_delay.attr,
+       &md_array_state.attr,
        NULL,
 };
 
@@ -2553,8 +2932,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
        return NULL;
 }
 
-void md_wakeup_thread(mdk_thread_t *thread);
-
 static void md_safemode_timeout(unsigned long data)
 {
        mddev_t *mddev = (mddev_t *) data;
@@ -2708,7 +3085,7 @@ static int do_md_run(mddev_t * mddev)
        mddev->safemode = 0;
        mddev->safemode_timer.function = md_safemode_timeout;
        mddev->safemode_timer.data = (unsigned long) mddev;
-       mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
+       mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
        mddev->in_sync = 1;
 
        ITERATE_RDEV(mddev,rdev,tmp)
@@ -2736,6 +3113,36 @@ static int do_md_run(mddev_t * mddev)
        mddev->queue->queuedata = mddev;
        mddev->queue->make_request_fn = mddev->pers->make_request;
 
+       /* If there is a partially-recovered drive we need to
+        * start recovery here.  If we leave it to md_check_recovery,
+        * it will remove the drives and not do the right thing
+        */
+       if (mddev->degraded) {
+               struct list_head *rtmp;
+               int spares = 0;
+               ITERATE_RDEV(mddev,rdev,rtmp)
+                       if (rdev->raid_disk >= 0 &&
+                           !test_bit(In_sync, &rdev->flags) &&
+                           !test_bit(Faulty, &rdev->flags))
+                               /* complete an interrupted recovery */
+                               spares++;
+               if (spares && mddev->pers->sync_request) {
+                       mddev->recovery = 0;
+                       set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+                       mddev->sync_thread = md_register_thread(md_do_sync,
+                                                               mddev,
+                                                               "%s_resync");
+                       if (!mddev->sync_thread) {
+                               printk(KERN_ERR "%s: could not start resync"
+                                      " thread...\n",
+                                      mdname(mddev));
+                               /* leave the spares where they are, it shouldn't hurt */
+                               mddev->recovery = 0;
+                       } else
+                               md_wakeup_thread(mddev->sync_thread);
+               }
+       }
+
        mddev->changed = 1;
        md_new_event(mddev);
        return 0;
@@ -2769,18 +3176,47 @@ static int restart_array(mddev_t *mddev)
                 */
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
+               md_wakeup_thread(mddev->sync_thread);
                err = 0;
-       } else {
-               printk(KERN_ERR "md: %s has no personality assigned.\n",
-                       mdname(mddev));
+       } else
                err = -EINVAL;
-       }
 
 out:
        return err;
 }
 
-static int do_md_stop(mddev_t * mddev, int ro)
+/* similar to deny_write_access, but accounts for our holding a reference
+ * to the file ourselves */
+static int deny_bitmap_write_access(struct file * file)
+{
+       struct inode *inode = file->f_mapping->host;
+
+       spin_lock(&inode->i_lock);
+       if (atomic_read(&inode->i_writecount) > 1) {
+               spin_unlock(&inode->i_lock);
+               return -ETXTBSY;
+       }
+       atomic_set(&inode->i_writecount, -1);
+       spin_unlock(&inode->i_lock);
+
+       return 0;
+}
+
+static void restore_bitmap_write_access(struct file *file)
+{
+       struct inode *inode = file->f_mapping->host;
+
+       spin_lock(&inode->i_lock);
+       atomic_set(&inode->i_writecount, 1);
+       spin_unlock(&inode->i_lock);
+}
+
+/* mode:
+ *   0 - completely stop and dis-assemble array
+ *   1 - switch to readonly
+ *   2 - stop but do not disassemble array
+ */
+static int do_md_stop(mddev_t * mddev, int mode)
 {
        int err = 0;
        struct gendisk *disk = mddev->gendisk;
@@ -2792,6 +3228,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
                }
 
                if (mddev->sync_thread) {
+                       set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                        md_unregister_thread(mddev->sync_thread);
                        mddev->sync_thread = NULL;
@@ -2801,12 +3238,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
 
                invalidate_partition(disk, 0);
 
-               if (ro) {
+               switch(mode) {
+               case 1: /* readonly */
                        err  = -ENXIO;
                        if (mddev->ro==1)
                                goto out;
                        mddev->ro = 1;
-               } else {
+                       break;
+               case 0: /* disassemble */
+               case 2: /* stop */
                        bitmap_flush(mddev);
                        md_super_wait(mddev);
                        if (mddev->ro)
@@ -2821,19 +3261,20 @@ static int do_md_stop(mddev_t * mddev, int ro)
                        if (mddev->ro)
                                mddev->ro = 0;
                }
-               if (!mddev->in_sync) {
+               if (!mddev->in_sync || mddev->sb_dirty) {
                        /* mark array as shutdown cleanly */
                        mddev->in_sync = 1;
                        md_update_sb(mddev);
                }
-               if (ro)
+               if (mode == 1)
                        set_disk_ro(disk, 1);
+               clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
        }
 
        /*
         * Free resources if final stop
         */
-       if (!ro) {
+       if (mode == 0) {
                mdk_rdev_t *rdev;
                struct list_head *tmp;
                struct gendisk *disk;
@@ -2841,7 +3282,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
 
                bitmap_destroy(mddev);
                if (mddev->bitmap_file) {
-                       atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
+                       restore_bitmap_write_access(mddev->bitmap_file);
                        fput(mddev->bitmap_file);
                        mddev->bitmap_file = NULL;
                }
@@ -2857,11 +3298,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
                export_array(mddev);
 
                mddev->array_size = 0;
+               mddev->size = 0;
+               mddev->raid_disks = 0;
+               mddev->recovery_cp = 0;
+
                disk = mddev->gendisk;
                if (disk)
                        set_capacity(disk, 0);
                mddev->changed = 1;
-       } else
+       } else if (mddev->pers)
                printk(KERN_INFO "md: %s switched to read-only mode.\n",
                        mdname(mddev));
        err = 0;
@@ -3264,6 +3709,17 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 
                rdev->raid_disk = -1;
                err = bind_rdev_to_array(rdev, mddev);
+               if (!err && !mddev->pers->hot_remove_disk) {
+                       /* If there is hot_add_disk but no hot_remove_disk
+                        * then added disks for geometry changes,
+                        * and should be added immediately.
+                        */
+                       super_types[mddev->major_version].
+                               validate_super(mddev, rdev);
+                       err = mddev->pers->hot_add_disk(mddev, rdev);
+                       if (err)
+                               unbind_rdev_from_array(rdev);
+               }
                if (err)
                        export_rdev(rdev);
 
@@ -3434,23 +3890,6 @@ abort_export:
        return err;
 }
 
-/* similar to deny_write_access, but accounts for our holding a reference
- * to the file ourselves */
-static int deny_bitmap_write_access(struct file * file)
-{
-       struct inode *inode = file->f_mapping->host;
-
-       spin_lock(&inode->i_lock);
-       if (atomic_read(&inode->i_writecount) > 1) {
-               spin_unlock(&inode->i_lock);
-               return -ETXTBSY;
-       }
-       atomic_set(&inode->i_writecount, -1);
-       spin_unlock(&inode->i_lock);
-
-       return 0;
-}
-
 static int set_bitmap_file(mddev_t *mddev, int fd)
 {
        int err;
@@ -3491,12 +3930,17 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
                mddev->pers->quiesce(mddev, 1);
                if (fd >= 0)
                        err = bitmap_create(mddev);
-               if (fd < 0 || err)
+               if (fd < 0 || err) {
                        bitmap_destroy(mddev);
+                       fd = -1; /* make sure to put the file */
+               }
                mddev->pers->quiesce(mddev, 0);
-       } else if (fd < 0) {
-               if (mddev->bitmap_file)
+       }
+       if (fd < 0) {
+               if (mddev->bitmap_file) {
+                       restore_bitmap_write_access(mddev->bitmap_file);
                        fput(mddev->bitmap_file);
+               }
                mddev->bitmap_file = NULL;
        }
 
@@ -3977,11 +4421,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
                        goto done_unlock;
 
                default:
-                       if (_IOC_TYPE(cmd) == MD_MAJOR)
-                               printk(KERN_WARNING "md: %s(pid %d) used"
-                                       " obsolete MD ioctl, upgrade your"
-                                       " software to use new ictls.\n",
-                                       current->comm, current->pid);
                        err = -EINVAL;
                        goto abort_unlock;
        }
@@ -4586,7 +5025,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
                spin_lock_irq(&mddev->write_lock);
                if (mddev->in_sync) {
                        mddev->in_sync = 0;
-                       mddev->sb_dirty = 1;
+                       mddev->sb_dirty = 3;
                        md_wakeup_thread(mddev->thread);
                }
                spin_unlock_irq(&mddev->write_lock);
@@ -4599,7 +5038,7 @@ void md_write_end(mddev_t *mddev)
        if (atomic_dec_and_test(&mddev->writes_pending)) {
                if (mddev->safemode == 2)
                        md_wakeup_thread(mddev->thread);
-               else
+               else if (mddev->safemode_delay)
                        mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
        }
 }
@@ -4620,10 +5059,14 @@ void md_do_sync(mddev_t *mddev)
        struct list_head *tmp;
        sector_t last_check;
        int skipped = 0;
+       struct list_head *rtmp;
+       mdk_rdev_t *rdev;
 
        /* just incase thread restarts... */
        if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
                return;
+       if (mddev->ro) /* never try to sync a read-only array */
+               return;
 
        /* we overload curr_resync somewhat here.
         * 0 == not engaged in resync at all
@@ -4682,17 +5125,30 @@ void md_do_sync(mddev_t *mddev)
                }
        } while (mddev->curr_resync < 2);
 
+       j = 0;
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                /* resync follows the size requested by the personality,
                 * which defaults to physical size, but can be virtual size
                 */
                max_sectors = mddev->resync_max_sectors;
                mddev->resync_mismatches = 0;
+               /* we don't use the checkpoint if there's a bitmap */
+               if (!mddev->bitmap &&
+                   !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+                       j = mddev->recovery_cp;
        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
                max_sectors = mddev->size << 1;
-       else
+       else {
                /* recovery follows the physical size of devices */
                max_sectors = mddev->size << 1;
+               j = MaxSector;
+               ITERATE_RDEV(mddev,rdev,rtmp)
+                       if (rdev->raid_disk >= 0 &&
+                           !test_bit(Faulty, &rdev->flags) &&
+                           !test_bit(In_sync, &rdev->flags) &&
+                           rdev->recovery_offset < j)
+                               j = rdev->recovery_offset;
+       }
 
        printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
        printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
@@ -4702,12 +5158,7 @@ void md_do_sync(mddev_t *mddev)
               speed_max(mddev));
 
        is_mddev_idle(mddev); /* this also initializes IO event counters */
-       /* we don't use the checkpoint if there's a bitmap */
-       if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
-           && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
-               j = mddev->recovery_cp;
-       else
-               j = 0;
+
        io_sectors = 0;
        for (m = 0; m < SYNC_MARKS; m++) {
                mark[m] = jiffies;
@@ -4828,15 +5279,28 @@ void md_do_sync(mddev_t *mddev)
        if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
            test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
            !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
-           mddev->curr_resync > 2 &&
-           mddev->curr_resync >= mddev->recovery_cp) {
-               if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
-                       printk(KERN_INFO 
-                               "md: checkpointing recovery of %s.\n",
-                               mdname(mddev));
-                       mddev->recovery_cp = mddev->curr_resync;
-               } else
-                       mddev->recovery_cp = MaxSector;
+           mddev->curr_resync > 2) {
+               if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+                       if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+                               if (mddev->curr_resync >= mddev->recovery_cp) {
+                                       printk(KERN_INFO
+                                              "md: checkpointing recovery of %s.\n",
+                                              mdname(mddev));
+                                       mddev->recovery_cp = mddev->curr_resync;
+                               }
+                       } else
+                               mddev->recovery_cp = MaxSector;
+               } else {
+                       if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+                               mddev->curr_resync = MaxSector;
+                       ITERATE_RDEV(mddev,rdev,rtmp)
+                               if (rdev->raid_disk >= 0 &&
+                                   !test_bit(Faulty, &rdev->flags) &&
+                                   !test_bit(In_sync, &rdev->flags) &&
+                                   rdev->recovery_offset < mddev->curr_resync)
+                                       rdev->recovery_offset = mddev->curr_resync;
+                       mddev->sb_dirty = 1;
+               }
        }
 
  skip:
@@ -4908,7 +5372,7 @@ void md_check_recovery(mddev_t *mddev)
                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
                    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
                        mddev->in_sync = 1;
-                       mddev->sb_dirty = 1;
+                       mddev->sb_dirty = 3;
                }
                if (mddev->safemode == 1)
                        mddev->safemode = 0;
@@ -4957,6 +5421,8 @@ void md_check_recovery(mddev_t *mddev)
                clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
                clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
+               if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+                       goto unlock;
                /* no recovery is running.
                 * remove any failed drives, then
                 * add spares if possible.
@@ -4979,6 +5445,7 @@ void md_check_recovery(mddev_t *mddev)
                        ITERATE_RDEV(mddev,rdev,rtmp)
                                if (rdev->raid_disk < 0
                                    && !test_bit(Faulty, &rdev->flags)) {
+                                       rdev->recovery_offset = 0;
                                        if (mddev->pers->hot_add_disk(mddev,rdev)) {
                                                char nm[20];
                                                sprintf(nm, "rd%d", rdev->raid_disk);
@@ -5216,7 +5683,6 @@ EXPORT_SYMBOL(md_write_end);
 EXPORT_SYMBOL(md_register_thread);
 EXPORT_SYMBOL(md_unregister_thread);
 EXPORT_SYMBOL(md_wakeup_thread);
-EXPORT_SYMBOL(md_print_devices);
 EXPORT_SYMBOL(md_check_recovery);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md");