Merge branch 'x86-64'
[sfrench/cifs-2.6.git] / drivers / md / md.c
index 6f97817d28b905b1ada0baaf94763f1b92246604..306268ec99ff11572e975145f3f916ee4144cf63 100644 (file)
@@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(md_new_event);
 /* Alternate version that can be called from interrupts
  * when calling sysfs_notify isn't needed.
  */
-void md_new_event_inintr(mddev_t *mddev)
+static void md_new_event_inintr(mddev_t *mddev)
 {
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
@@ -1558,15 +1558,30 @@ static void md_print_devices(void)
 }
 
 
-static void sync_sbs(mddev_t * mddev)
+static void sync_sbs(mddev_t * mddev, int nospares)
 {
+       /* Update each superblock (in-memory image), but
+        * if we are allowed to, skip spares which already
+        * have the right event counter, or have one earlier
+        * (which would mean they aren't being marked as dirty
+        * with the rest of the array)
+        */
        mdk_rdev_t *rdev;
        struct list_head *tmp;
 
        ITERATE_RDEV(mddev,rdev,tmp) {
-               super_types[mddev->major_version].
-                       sync_super(mddev, rdev);
-               rdev->sb_loaded = 1;
+               if (rdev->sb_events == mddev->events ||
+                   (nospares &&
+                    rdev->raid_disk < 0 &&
+                    (rdev->sb_events&1)==0 &&
+                    rdev->sb_events+1 == mddev->events)) {
+                       /* Don't update this superblock */
+                       rdev->sb_loaded = 2;
+               } else {
+                       super_types[mddev->major_version].
+                               sync_super(mddev, rdev);
+                       rdev->sb_loaded = 1;
+               }
        }
 }
 
@@ -1576,12 +1591,42 @@ void md_update_sb(mddev_t * mddev)
        struct list_head *tmp;
        mdk_rdev_t *rdev;
        int sync_req;
+       int nospares = 0;
 
 repeat:
        spin_lock_irq(&mddev->write_lock);
        sync_req = mddev->in_sync;
        mddev->utime = get_seconds();
-       mddev->events ++;
+       if (mddev->sb_dirty == 3)
+               /* just a clean<-> dirty transition, possibly leave spares alone,
+                * though if events isn't the right even/odd, we will have to do
+                * spares after all
+                */
+               nospares = 1;
+
+       /* If this is just a dirty<->clean transition, and the array is clean
+        * and 'events' is odd, we can roll back to the previous clean state */
+       if (mddev->sb_dirty == 3
+           && (mddev->in_sync && mddev->recovery_cp == MaxSector)
+           && (mddev->events & 1))
+               mddev->events--;
+       else {
+               /* otherwise we have to go forward and ... */
+               mddev->events ++;
+               if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
+                       /* .. if the array isn't clean, insist on an odd 'events' */
+                       if ((mddev->events&1)==0) {
+                               mddev->events++;
+                               nospares = 0;
+                       }
+               } else {
+                       /* otherwise insist on an even 'events' (for clean states) */
+                       if ((mddev->events&1)) {
+                               mddev->events++;
+                               nospares = 0;
+                       }
+               }
+       }
 
        if (!mddev->events) {
                /*
@@ -1593,7 +1638,7 @@ repeat:
                mddev->events --;
        }
        mddev->sb_dirty = 2;
-       sync_sbs(mddev);
+       sync_sbs(mddev, nospares);
 
        /*
         * do not write anything to disk if using
@@ -1615,6 +1660,8 @@ repeat:
        ITERATE_RDEV(mddev,rdev,tmp) {
                char b[BDEVNAME_SIZE];
                dprintk(KERN_INFO "md: ");
+               if (rdev->sb_loaded != 1)
+                       continue; /* no noise on spare devices */
                if (test_bit(Faulty, &rdev->flags))
                        dprintk("(skipping faulty ");
 
@@ -1626,6 +1673,7 @@ repeat:
                        dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
                                bdevname(rdev->bdev,b),
                                (unsigned long long)rdev->sb_offset);
+                       rdev->sb_events = mddev->events;
 
                } else
                        dprintk(")\n");
@@ -1689,6 +1737,10 @@ state_show(mdk_rdev_t *rdev, char *page)
                len += sprintf(page+len, "%sin_sync",sep);
                sep = ",";
        }
+       if (test_bit(WriteMostly, &rdev->flags)) {
+               len += sprintf(page+len, "%swrite_mostly",sep);
+               sep = ",";
+       }
        if (!test_bit(Faulty, &rdev->flags) &&
            !test_bit(In_sync, &rdev->flags)) {
                len += sprintf(page+len, "%sspare", sep);
@@ -1697,8 +1749,40 @@ state_show(mdk_rdev_t *rdev, char *page)
        return len+sprintf(page+len, "\n");
 }
 
+static ssize_t
+state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+       /* can write
+        *  faulty  - simulates and error
+        *  remove  - disconnects the device
+        *  writemostly - sets write_mostly
+        *  -writemostly - clears write_mostly
+        */
+       int err = -EINVAL;
+       if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
+               md_error(rdev->mddev, rdev);
+               err = 0;
+       } else if (cmd_match(buf, "remove")) {
+               if (rdev->raid_disk >= 0)
+                       err = -EBUSY;
+               else {
+                       mddev_t *mddev = rdev->mddev;
+                       kick_rdev_from_array(rdev);
+                       md_update_sb(mddev);
+                       md_new_event(mddev);
+                       err = 0;
+               }
+       } else if (cmd_match(buf, "writemostly")) {
+               set_bit(WriteMostly, &rdev->flags);
+               err = 0;
+       } else if (cmd_match(buf, "-writemostly")) {
+               clear_bit(WriteMostly, &rdev->flags);
+               err = 0;
+       }
+       return err ? err : len;
+}
 static struct rdev_sysfs_entry
-rdev_state = __ATTR_RO(state);
+rdev_state = __ATTR(state, 0644, state_show, state_store);
 
 static ssize_t
 super_show(mdk_rdev_t *rdev, char *page)
@@ -1895,6 +1979,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
        rdev->desc_nr = -1;
        rdev->flags = 0;
        rdev->data_offset = 0;
+       rdev->sb_events = 0;
        atomic_set(&rdev->nr_pending, 0);
        atomic_set(&rdev->read_errors, 0);
        atomic_set(&rdev->corrected_errors, 0);
@@ -2082,6 +2167,32 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_level =
 __ATTR(level, 0644, level_show, level_store);
 
+
+static ssize_t
+layout_show(mddev_t *mddev, char *page)
+{
+       /* just a number, not meaningful for all levels */
+       return sprintf(page, "%d\n", mddev->layout);
+}
+
+static ssize_t
+layout_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       char *e;
+       unsigned long n = simple_strtoul(buf, &e, 10);
+       if (mddev->pers)
+               return -EBUSY;
+
+       if (!*buf || (*e && *e != '\n'))
+               return -EINVAL;
+
+       mddev->layout = n;
+       return len;
+}
+static struct md_sysfs_entry md_layout =
+__ATTR(layout, 0655, layout_show, layout_store);
+
+
 static ssize_t
 raid_disks_show(mddev_t *mddev, char *page)
 {
@@ -2136,6 +2247,200 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_chunk_size =
 __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
 
+static ssize_t
+resync_start_show(mddev_t *mddev, char *page)
+{
+       return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
+}
+
+static ssize_t
+resync_start_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       /* can only set chunk_size if array is not yet active */
+       char *e;
+       unsigned long long n = simple_strtoull(buf, &e, 10);
+
+       if (mddev->pers)
+               return -EBUSY;
+       if (!*buf || (*e && *e != '\n'))
+               return -EINVAL;
+
+       mddev->recovery_cp = n;
+       return len;
+}
+static struct md_sysfs_entry md_resync_start =
+__ATTR(resync_start, 0644, resync_start_show, resync_start_store);
+
+/*
+ * The array state can be:
+ *
+ * clear
+ *     No devices, no size, no level
+ *     Equivalent to STOP_ARRAY ioctl
+ * inactive
+ *     May have some settings, but array is not active
+ *        all IO results in error
+ *     When written, doesn't tear down array, but just stops it
+ * suspended (not supported yet)
+ *     All IO requests will block. The array can be reconfigured.
+ *     Writing this, if accepted, will block until array is quiessent
+ * readonly
+ *     no resync can happen.  no superblocks get written.
+ *     write requests fail
+ * read-auto
+ *     like readonly, but behaves like 'clean' on a write request.
+ *
+ * clean - no pending writes, but otherwise active.
+ *     When written to inactive array, starts without resync
+ *     If a write request arrives then
+ *       if metadata is known, mark 'dirty' and switch to 'active'.
+ *       if not known, block and switch to write-pending
+ *     If written to an active array that has pending writes, then fails.
+ * active
+ *     fully active: IO and resync can be happening.
+ *     When written to inactive array, starts with resync
+ *
+ * write-pending
+ *     clean, but writes are blocked waiting for 'active' to be written.
+ *
+ * active-idle
+ *     like active, but no writes have been seen for a while (100msec).
+ *
+ */
+enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
+                  write_pending, active_idle, bad_word};
+static char *array_states[] = {
+       "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
+       "write-pending", "active-idle", NULL };
+
+static int match_word(const char *word, char **list)
+{
+       int n;
+       for (n=0; list[n]; n++)
+               if (cmd_match(word, list[n]))
+                       break;
+       return n;
+}
+
+static ssize_t
+array_state_show(mddev_t *mddev, char *page)
+{
+       enum array_state st = inactive;
+
+       if (mddev->pers)
+               switch(mddev->ro) {
+               case 1:
+                       st = readonly;
+                       break;
+               case 2:
+                       st = read_auto;
+                       break;
+               case 0:
+                       if (mddev->in_sync)
+                               st = clean;
+                       else if (mddev->safemode)
+                               st = active_idle;
+                       else
+                               st = active;
+               }
+       else {
+               if (list_empty(&mddev->disks) &&
+                   mddev->raid_disks == 0 &&
+                   mddev->size == 0)
+                       st = clear;
+               else
+                       st = inactive;
+       }
+       return sprintf(page, "%s\n", array_states[st]);
+}
+
+static int do_md_stop(mddev_t * mddev, int ro);
+static int do_md_run(mddev_t * mddev);
+static int restart_array(mddev_t *mddev);
+
+static ssize_t
+array_state_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       int err = -EINVAL;
+       enum array_state st = match_word(buf, array_states);
+       switch(st) {
+       case bad_word:
+               break;
+       case clear:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       if (atomic_read(&mddev->active) > 1)
+                               return -EBUSY;
+                       err = do_md_stop(mddev, 0);
+               }
+               break;
+       case inactive:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       if (atomic_read(&mddev->active) > 1)
+                               return -EBUSY;
+                       err = do_md_stop(mddev, 2);
+               }
+               break;
+       case suspended:
+               break; /* not supported yet */
+       case readonly:
+               if (mddev->pers)
+                       err = do_md_stop(mddev, 1);
+               else {
+                       mddev->ro = 1;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case read_auto:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       err = do_md_stop(mddev, 1);
+                       if (err == 0)
+                               mddev->ro = 2; /* FIXME mark devices writable */
+               } else {
+                       mddev->ro = 2;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case clean:
+               if (mddev->pers) {
+                       restart_array(mddev);
+                       spin_lock_irq(&mddev->write_lock);
+                       if (atomic_read(&mddev->writes_pending) == 0) {
+                               mddev->in_sync = 1;
+                               mddev->sb_dirty = 1;
+                       }
+                       spin_unlock_irq(&mddev->write_lock);
+               } else {
+                       mddev->ro = 0;
+                       mddev->recovery_cp = MaxSector;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case active:
+               if (mddev->pers) {
+                       restart_array(mddev);
+                       mddev->sb_dirty = 0;
+                       wake_up(&mddev->sb_wait);
+                       err = 0;
+               } else {
+                       mddev->ro = 0;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case write_pending:
+       case active_idle:
+               /* these cannot be set */
+               break;
+       }
+       if (err)
+               return err;
+       else
+               return len;
+}
+static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store);
+
 static ssize_t
 null_show(mddev_t *mddev, char *page)
 {
@@ -2498,12 +2803,15 @@ __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
 
 static struct attribute *md_default_attrs[] = {
        &md_level.attr,
+       &md_layout.attr,
        &md_raid_disks.attr,
        &md_chunk_size.attr,
        &md_size.attr,
+       &md_resync_start.attr,
        &md_metadata.attr,
        &md_new_device.attr,
        &md_safe_delay.attr,
+       &md_array_state.attr,
        NULL,
 };
 
@@ -2870,11 +3178,8 @@ static int restart_array(mddev_t *mddev)
                md_wakeup_thread(mddev->thread);
                md_wakeup_thread(mddev->sync_thread);
                err = 0;
-       } else {
-               printk(KERN_ERR "md: %s has no personality assigned.\n",
-                       mdname(mddev));
+       } else
                err = -EINVAL;
-       }
 
 out:
        return err;
@@ -2906,7 +3211,12 @@ static void restore_bitmap_write_access(struct file *file)
        spin_unlock(&inode->i_lock);
 }
 
-static int do_md_stop(mddev_t * mddev, int ro)
+/* mode:
+ *   0 - completely stop and dis-assemble array
+ *   1 - switch to readonly
+ *   2 - stop but do not disassemble array
+ */
+static int do_md_stop(mddev_t * mddev, int mode)
 {
        int err = 0;
        struct gendisk *disk = mddev->gendisk;
@@ -2928,12 +3238,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
 
                invalidate_partition(disk, 0);
 
-               if (ro) {
+               switch(mode) {
+               case 1: /* readonly */
                        err  = -ENXIO;
                        if (mddev->ro==1)
                                goto out;
                        mddev->ro = 1;
-               } else {
+                       break;
+               case 0: /* disassemble */
+               case 2: /* stop */
                        bitmap_flush(mddev);
                        md_super_wait(mddev);
                        if (mddev->ro)
@@ -2953,7 +3266,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
                        mddev->in_sync = 1;
                        md_update_sb(mddev);
                }
-               if (ro)
+               if (mode == 1)
                        set_disk_ro(disk, 1);
                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
        }
@@ -2961,7 +3274,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
        /*
         * Free resources if final stop
         */
-       if (!ro) {
+       if (mode == 0) {
                mdk_rdev_t *rdev;
                struct list_head *tmp;
                struct gendisk *disk;
@@ -2985,6 +3298,10 @@ static int do_md_stop(mddev_t * mddev, int ro)
                export_array(mddev);
 
                mddev->array_size = 0;
+               mddev->size = 0;
+               mddev->raid_disks = 0;
+               mddev->recovery_cp = 0;
+
                disk = mddev->gendisk;
                if (disk)
                        set_capacity(disk, 0);
@@ -4708,7 +5025,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
                spin_lock_irq(&mddev->write_lock);
                if (mddev->in_sync) {
                        mddev->in_sync = 0;
-                       mddev->sb_dirty = 1;
+                       mddev->sb_dirty = 3;
                        md_wakeup_thread(mddev->thread);
                }
                spin_unlock_irq(&mddev->write_lock);
@@ -5055,7 +5372,7 @@ void md_check_recovery(mddev_t *mddev)
                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
                    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
                        mddev->in_sync = 1;
-                       mddev->sb_dirty = 1;
+                       mddev->sb_dirty = 3;
                }
                if (mddev->safemode == 1)
                        mddev->safemode = 0;