Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block
author		Linus Torvalds <torvalds@linux-foundation.org>
		Fri, 4 Jun 2010 22:37:44 +0000 (15:37 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
		Fri, 4 Jun 2010 22:37:44 +0000 (15:37 -0700)
* 'for-linus' of git://git.kernel.dk/linux-2.6-block: (27 commits)
  block: make blk_init_free_list and elevator_init idempotent
  block: avoid unconditionally freeing previously allocated request_queue
  pipe: change /proc/sys/fs/pipe-max-pages to byte sized interface
  pipe: change the privilege required for growing a pipe beyond system max
  pipe: adjust minimum pipe size to 1 page
  block: disable preemption before using sched_clock()
  cciss: call BUG() earlier
  Preparing 8.3.8rc2
  drbd: Reduce verbosity
  drbd: use drbd specific ratelimit instead of global printk_ratelimit
  drbd: fix hang on local read errors while disconnected
  drbd: Removed the now empty w_io_error() function
  drbd: removed duplicated #includes
  drbd: improve usage of MSG_MORE
  drbd: need to set socket bufsize early to take effect
  drbd: improve network latency, TCP_QUICKACK
  drbd: Revert "drbd: Create new current UUID as late as possible"
  brd: support discard
  Revert "writeback: fix WB_SYNC_NONE writeback from umount"
  Revert "writeback: ensure that WB_SYNC_NONE writeback with sb pinned is sync"
  ...

23 files changed:
block/blk-core.c
block/cfq-iosched.c
block/elevator.c
drivers/block/brd.c
drivers/block/cciss_scsi.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
drivers/block/drbd/drbd_req.h
drivers/block/drbd/drbd_worker.c
fs/fs-writeback.c
fs/pipe.c
fs/splice.c
fs/sync.c
include/linux/backing-dev.h
include/linux/blkdev.h
include/linux/drbd.h
include/linux/iocontext.h
include/linux/pipe_fs_i.h
include/linux/writeback.h
kernel/sysctl.c
mm/page-writeback.c

index 3bc5579d6f543fa57783e09268707178de26c4b7..f84cce42fc58da2dc0bc48186cbd9f6bd1d7dfc3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -467,6 +467,9 @@ static int blk_init_free_list(struct request_queue *q)
 {
        struct request_list *rl = &q->rq;
 
+       if (unlikely(rl->rq_pool))
+               return 0;
+
        rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
        rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
        rl->elvpriv = 0;
@@ -570,9 +573,17 @@ EXPORT_SYMBOL(blk_init_queue);
 struct request_queue *
 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 {
-       struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+       struct request_queue *uninit_q, *q;
+
+       uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+       if (!uninit_q)
+               return NULL;
+
+       q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
+       if (!q)
+               blk_cleanup_queue(uninit_q);
 
-       return blk_init_allocated_queue_node(q, rfn, lock, node_id);
+       return q;
 }
 EXPORT_SYMBOL(blk_init_queue_node);
 
@@ -592,10 +603,8 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
                return NULL;
 
        q->node = node_id;
-       if (blk_init_free_list(q)) {
-               kmem_cache_free(blk_requestq_cachep, q);
+       if (blk_init_free_list(q))
                return NULL;
-       }
 
        q->request_fn           = rfn;
        q->prep_rq_fn           = NULL;
@@ -618,7 +627,6 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
                return q;
        }
 
-       blk_put_queue(q);
        return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue_node);
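
The hunks above split queue setup into a separate allocation step and
initialization step, so a failed init no longer tears down a queue behind the
caller's back. A minimal sketch of the resulting driver-side pattern (the
request function and lock names here are illustrative, not from this patch):

    static void my_request_fn(struct request_queue *q);  /* hypothetical */
    static DEFINE_SPINLOCK(my_lock);

    static struct request_queue *my_setup_queue(int node)
    {
            struct request_queue *q;

            q = blk_init_queue_node(my_request_fn, &my_lock, node);
            if (!q)
                    return NULL;  /* half-built queue was already cleaned up */

            /*
             * Because blk_init_free_list() and elevator_init() are now
             * idempotent, blk_init_allocated_queue_node() may also be
             * called on a queue that was partially initialized before.
             */
            return q;
    }
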
index ed897b5ef315dedfe01d0fe8817ef2259b770252..5ff4f4850e717ddb319423e9678e0e44cd7f265c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -64,6 +64,9 @@ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
 static struct completion *ioc_gone;
 static DEFINE_SPINLOCK(ioc_gone_lock);
 
+static DEFINE_SPINLOCK(cic_index_lock);
+static DEFINE_IDA(cic_index_ida);
+
 #define CFQ_PRIO_LISTS         IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)   ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
 #define cfq_class_rt(cfqq)     ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
@@ -271,6 +274,7 @@ struct cfq_data {
        unsigned int cfq_latency;
        unsigned int cfq_group_isolation;
 
+       unsigned int cic_index;
        struct list_head cic_list;
 
        /*
@@ -430,6 +434,24 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
        cic->cfqq[is_sync] = cfqq;
 }
 
+#define CIC_DEAD_KEY   1ul
+#define CIC_DEAD_INDEX_SHIFT   1
+
+static inline void *cfqd_dead_key(struct cfq_data *cfqd)
+{
+       return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
+}
+
+static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
+{
+       struct cfq_data *cfqd = cic->key;
+
+       if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
+               return NULL;
+
+       return cfqd;
+}
+
 /*
  * We regard a request as SYNC, if it's either a read or has the SYNC bit
  * set (in which case it could also be direct WRITE).
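
The CIC_DEAD_KEY scheme above is standard pointer tagging: a kmalloc()ed
cfq_data pointer is at least word-aligned, so bit 0 is always clear on a live
key, and a dead key can carry the mark in bit 0 with cic_index stored in the
bits above it. A self-contained sketch of the trick (names are illustrative):

    #include <assert.h>
    #include <stdint.h>

    #define DEAD_KEY         1UL
    #define DEAD_INDEX_SHIFT 1

    int main(void)
    {
            unsigned long index = 42;   /* stands in for cfqd->cic_index */
            void *key = (void *)(index << DEAD_INDEX_SHIFT | DEAD_KEY);

            /* readers can tell a dead mark from a live pointer... */
            assert((uintptr_t)key & DEAD_KEY);
            /* ...and still recover the index for radix_tree_delete() */
            assert(((uintptr_t)key >> DEAD_INDEX_SHIFT) == index);
            return 0;
    }
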
@@ -2510,11 +2532,12 @@ static void cfq_cic_free(struct cfq_io_context *cic)
 static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
 {
        unsigned long flags;
+       unsigned long dead_key = (unsigned long) cic->key;
 
-       BUG_ON(!cic->dead_key);
+       BUG_ON(!(dead_key & CIC_DEAD_KEY));
 
        spin_lock_irqsave(&ioc->lock, flags);
-       radix_tree_delete(&ioc->radix_root, cic->dead_key);
+       radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
        hlist_del_rcu(&cic->cic_list);
        spin_unlock_irqrestore(&ioc->lock, flags);
 
@@ -2537,15 +2560,10 @@ static void cfq_free_io_context(struct io_context *ioc)
        __call_for_each_cic(ioc, cic_free_func);
 }
 
-static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+static void cfq_put_cooperator(struct cfq_queue *cfqq)
 {
        struct cfq_queue *__cfqq, *next;
 
-       if (unlikely(cfqq == cfqd->active_queue)) {
-               __cfq_slice_expired(cfqd, cfqq, 0);
-               cfq_schedule_dispatch(cfqd);
-       }
-
        /*
         * If this queue was scheduled to merge with another queue, be
         * sure to drop the reference taken on that queue (and others in
@@ -2561,6 +2579,16 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
                cfq_put_queue(__cfqq);
                __cfqq = next;
        }
+}
+
+static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+       if (unlikely(cfqq == cfqd->active_queue)) {
+               __cfq_slice_expired(cfqd, cfqq, 0);
+               cfq_schedule_dispatch(cfqd);
+       }
+
+       cfq_put_cooperator(cfqq);
 
        cfq_put_queue(cfqq);
 }
@@ -2573,11 +2601,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
        list_del_init(&cic->queue_list);
 
        /*
-        * Make sure key == NULL is seen for dead queues
+        * Make sure dead mark is seen for dead queues
         */
        smp_wmb();
-       cic->dead_key = (unsigned long) cic->key;
-       cic->key = NULL;
+       cic->key = cfqd_dead_key(cfqd);
 
        if (ioc->ioc_data == cic)
                rcu_assign_pointer(ioc->ioc_data, NULL);
@@ -2596,7 +2623,7 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
 static void cfq_exit_single_io_context(struct io_context *ioc,
                                       struct cfq_io_context *cic)
 {
-       struct cfq_data *cfqd = cic->key;
+       struct cfq_data *cfqd = cic_to_cfqd(cic);
 
        if (cfqd) {
                struct request_queue *q = cfqd->queue;
@@ -2609,7 +2636,7 @@ static void cfq_exit_single_io_context(struct io_context *ioc,
                 * race between exiting task and queue
                 */
                smp_read_barrier_depends();
-               if (cic->key)
+               if (cic->key == cfqd)
                        __cfq_exit_single_io_context(cfqd, cic);
 
                spin_unlock_irqrestore(q->queue_lock, flags);
@@ -2689,7 +2716,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 
 static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
 {
-       struct cfq_data *cfqd = cic->key;
+       struct cfq_data *cfqd = cic_to_cfqd(cic);
        struct cfq_queue *cfqq;
        unsigned long flags;
 
@@ -2746,7 +2773,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
 {
        struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
-       struct cfq_data *cfqd = cic->key;
+       struct cfq_data *cfqd = cic_to_cfqd(cic);
        unsigned long flags;
        struct request_queue *q;
 
@@ -2883,12 +2910,13 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
        unsigned long flags;
 
        WARN_ON(!list_empty(&cic->queue_list));
+       BUG_ON(cic->key != cfqd_dead_key(cfqd));
 
        spin_lock_irqsave(&ioc->lock, flags);
 
        BUG_ON(ioc->ioc_data == cic);
 
-       radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
+       radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
        hlist_del_rcu(&cic->cic_list);
        spin_unlock_irqrestore(&ioc->lock, flags);
 
@@ -2900,7 +2928,6 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 {
        struct cfq_io_context *cic;
        unsigned long flags;
-       void *k;
 
        if (unlikely(!ioc))
                return NULL;
@@ -2917,13 +2944,11 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
        }
 
        do {
-               cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
+               cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
                rcu_read_unlock();
                if (!cic)
                        break;
-               /* ->key must be copied to avoid race with cfq_exit_queue() */
-               k = cic->key;
-               if (unlikely(!k)) {
+               if (unlikely(cic->key != cfqd)) {
                        cfq_drop_dead_cic(cfqd, ioc, cic);
                        rcu_read_lock();
                        continue;
@@ -2956,7 +2981,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
 
                spin_lock_irqsave(&ioc->lock, flags);
                ret = radix_tree_insert(&ioc->radix_root,
-                                               (unsigned long) cfqd, cic);
+                                               cfqd->cic_index, cic);
                if (!ret)
                        hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
                spin_unlock_irqrestore(&ioc->lock, flags);
@@ -3516,6 +3541,9 @@ split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
        }
 
        cic_set_cfqq(cic, NULL, 1);
+
+       cfq_put_cooperator(cfqq);
+
        cfq_put_queue(cfqq);
        return NULL;
 }
@@ -3708,10 +3736,32 @@ static void cfq_exit_queue(struct elevator_queue *e)
 
        cfq_shutdown_timer_wq(cfqd);
 
+       spin_lock(&cic_index_lock);
+       ida_remove(&cic_index_ida, cfqd->cic_index);
+       spin_unlock(&cic_index_lock);
+
        /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
        call_rcu(&cfqd->rcu, cfq_cfqd_free);
 }
 
+static int cfq_alloc_cic_index(void)
+{
+       int index, error;
+
+       do {
+               if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
+                       return -ENOMEM;
+
+               spin_lock(&cic_index_lock);
+               error = ida_get_new(&cic_index_ida, &index);
+               spin_unlock(&cic_index_lock);
+               if (error && error != -EAGAIN)
+                       return error;
+       } while (error);
+
+       return index;
+}
+
 static void *cfq_init_queue(struct request_queue *q)
 {
        struct cfq_data *cfqd;
@@ -3719,10 +3769,16 @@ static void *cfq_init_queue(struct request_queue *q)
        struct cfq_group *cfqg;
        struct cfq_rb_root *st;
 
+       i = cfq_alloc_cic_index();
+       if (i < 0)
+               return NULL;
+
        cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
        if (!cfqd)
                return NULL;
 
+       cfqd->cic_index = i;
+
        /* Init root service tree */
        cfqd->grp_service_tree = CFQ_RB_ROOT;
 
@@ -3984,6 +4040,7 @@ static void __exit cfq_exit(void)
         */
        if (elv_ioc_count_read(cfq_ioc_count))
                wait_for_completion(&all_gone);
+       ida_destroy(&cic_index_ida);
        cfq_slab_kill();
 }
 
index 6df2b5056b51e1347c10cb7c5f3c50d09aa21650..923a9139106c51cbdb63dca5d1c97ce81a30d316 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -242,9 +242,11 @@ int elevator_init(struct request_queue *q, char *name)
 {
        struct elevator_type *e = NULL;
        struct elevator_queue *eq;
-       int ret = 0;
        void *data;
 
+       if (unlikely(q->elevator))
+               return 0;
+
        INIT_LIST_HEAD(&q->queue_head);
        q->last_merge = NULL;
        q->end_sector = 0;
@@ -284,7 +286,7 @@ int elevator_init(struct request_queue *q, char *name)
        }
 
        elevator_attach(q, eq, data);
-       return ret;
+       return 0;
 }
 EXPORT_SYMBOL(elevator_init);
 
@@ -1097,7 +1099,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
        struct elevator_type *__e;
        int len = 0;
 
-       if (!q->elevator)
+       if (!q->elevator || !blk_queue_stackable(q))
                return sprintf(name, "none\n");
 
        elv = e->elevator_type;
index 6081e81d5738b3fecb90da4350d7325b6b78d28a..f1bf79d9bc0a1c65df988ef4ec0b3c1eab907d93 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -133,6 +133,28 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
        return page;
 }
 
+static void brd_free_page(struct brd_device *brd, sector_t sector)
+{
+       struct page *page;
+       pgoff_t idx;
+
+       spin_lock(&brd->brd_lock);
+       idx = sector >> PAGE_SECTORS_SHIFT;
+       page = radix_tree_delete(&brd->brd_pages, idx);
+       spin_unlock(&brd->brd_lock);
+       if (page)
+               __free_page(page);
+}
+
+static void brd_zero_page(struct brd_device *brd, sector_t sector)
+{
+       struct page *page;
+
+       page = brd_lookup_page(brd, sector);
+       if (page)
+               clear_highpage(page);
+}
+
 /*
  * Free all backing store pages and radix tree. This must only be called when
  * there are no other users of the device.
@@ -189,6 +211,24 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
        return 0;
 }
 
+static void discard_from_brd(struct brd_device *brd,
+                       sector_t sector, size_t n)
+{
+       while (n >= PAGE_SIZE) {
+               /*
+                * Don't want to actually discard pages here because
+                * re-allocating the pages can result in writeback
+                * deadlocks under heavy load.
+                */
+               if (0)
+                       brd_free_page(brd, sector);
+               else
+                       brd_zero_page(brd, sector);
+               sector += PAGE_SIZE >> SECTOR_SHIFT;
+               n -= PAGE_SIZE;
+       }
+}
+
 /*
  * Copy n bytes from src to the brd starting at sector. Does not sleep.
  */
@@ -300,6 +340,12 @@ static int brd_make_request(struct request_queue *q, struct bio *bio)
                                                get_capacity(bdev->bd_disk))
                goto out;
 
+       if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
+               err = 0;
+               discard_from_brd(brd, sector, bio->bi_size);
+               goto out;
+       }
+
        rw = bio_rw(bio);
        if (rw == READA)
                rw = READ;
@@ -320,7 +366,7 @@ out:
 }
 
 #ifdef CONFIG_BLK_DEV_XIP
-static int brd_direct_access (struct block_device *bdev, sector_t sector,
+static int brd_direct_access(struct block_device *bdev, sector_t sector,
                        void **kaddr, unsigned long *pfn)
 {
        struct brd_device *brd = bdev->bd_disk->private_data;
@@ -437,6 +483,11 @@ static struct brd_device *brd_alloc(int i)
        blk_queue_max_hw_sectors(brd->brd_queue, 1024);
        blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
+       brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
+       brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
+       brd->brd_queue->limits.discard_zeroes_data = 1;
+       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
+
        disk = brd->brd_disk = alloc_disk(1 << part_shift);
        if (!disk)
                goto out_free_queue;
index e1d0e2cfec72aaae4d332e693005d5d30833e8ab..3381505c8a6c309c7838cb7d0a4daff7e3e162e7 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -188,11 +188,11 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *cmd)
 
        sa = h->scsi_ctlr;
        stk = &sa->cmd_stack; 
+       stk->top++;
        if (stk->top >= CMD_STACK_SIZE) {
                printk("cciss: scsi_cmd_free called too many times.\n");
                BUG();
        }
-       stk->top++;
        stk->elem[stk->top] = (struct cciss_scsi_cmd_stack_elem_t *) cmd;
 }
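
The reordering matters because the old code tested stk->top before the
increment: freeing into an already-full stack passed the check, bumped top to
CMD_STACK_SIZE, and wrote stk->elem[] one slot past the end before BUG()
could fire. Incrementing first lets the guard catch the overflow before the
out-of-bounds store.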
 
index e9654c8d5b6265f5f125c80185bd54b8034b697a..485ed8c7d623986b1aabbcd19ee50768f6c60c4c 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -943,8 +943,7 @@ struct drbd_conf {
        struct drbd_work  resync_work,
                          unplug_work,
                          md_sync_work,
-                         delay_probe_work,
-                         uuid_work;
+                         delay_probe_work;
        struct timer_list resync_timer;
        struct timer_list md_sync_timer;
        struct timer_list delay_probe_timer;
@@ -1069,7 +1068,6 @@ struct drbd_conf {
        struct timeval dps_time; /* delay-probes-start-time */
        unsigned int dp_volume_last;  /* send_cnt of last delay probe */
        int c_sync_rate; /* current resync rate after delay_probe magic */
-       atomic_t new_c_uuid;
 };
 
 static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1476,7 +1474,6 @@ extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
 extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
 extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
 extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
-extern int w_io_error(struct drbd_conf *, struct drbd_work *, int);
 extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
 extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
 extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
@@ -1542,7 +1539,7 @@ static inline void drbd_tcp_nodelay(struct socket *sock)
 
 static inline void drbd_tcp_quickack(struct socket *sock)
 {
-       int __user val = 1;
+       int __user val = 2;
        (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
                        (char __user *)&val, sizeof(val));
 }
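
For context, the same knob as seen from userspace (a sketch, not drbd code):
TCP_QUICKACK makes the kernel acknowledge immediately instead of delaying
ACKs, and the option is not sticky, so latency-sensitive receivers re-arm it:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    static void quickack(int fd)
    {
            int val = 1;

            /* not persistent: set it again before the next receive */
            (void)setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &val, sizeof(val));
    }
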
@@ -1728,7 +1725,7 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
        switch (mdev->ldev->dc.on_io_error) {
        case EP_PASS_ON:
                if (!forcedetach) {
-                       if (printk_ratelimit())
+                       if (__ratelimit(&drbd_ratelimit_state))
                                dev_err(DEV, "Local IO failed in %s."
                                             "Passing error on...\n", where);
                        break;
@@ -2219,8 +2216,6 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
                return 0;
        if (test_bit(BITMAP_IO, &mdev->flags))
                return 0;
-       if (atomic_read(&mdev->new_c_uuid))
-               return 0;
        return 1;
 }
 
@@ -2241,9 +2236,6 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
         * to avoid races with the reconnect code,
         * we need to atomic_inc within the spinlock. */
 
-       if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1))
-               drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work);
-
        spin_lock_irq(&mdev->req_lock);
        while (!__inc_ap_bio_cond(mdev)) {
                prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
index be2d2da9cdba41db2c09f8bea3b1d8df0474f24e..6b077f93acc620eaf40fa85599dbe83745779c28 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1215,18 +1215,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
             ns.pdsk == D_OUTDATED)) {
                if (get_ldev(mdev)) {
                        if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
-                           mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE &&
-                           !atomic_read(&mdev->new_c_uuid))
-                               atomic_set(&mdev->new_c_uuid, 2);
+                           mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+                               drbd_uuid_new_current(mdev);
+                               drbd_send_uuids(mdev);
+                       }
                        put_ldev(mdev);
                }
        }
 
        if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
-               /* Diskless peer becomes primary or got connected do diskless, primary peer. */
-               if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 &&
-                   !atomic_read(&mdev->new_c_uuid))
-                       atomic_set(&mdev->new_c_uuid, 2);
+               if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
+                       drbd_uuid_new_current(mdev);
 
                /* D_DISKLESS Peer becomes secondary */
                if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1350,24 +1349,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
        drbd_md_sync(mdev);
 }
 
-static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
-       if (get_ldev(mdev)) {
-               if (mdev->ldev->md.uuid[UI_BITMAP] == 0) {
-                       drbd_uuid_new_current(mdev);
-                       if (get_net_conf(mdev)) {
-                               drbd_send_uuids(mdev);
-                               put_net_conf(mdev);
-                       }
-                       drbd_md_sync(mdev);
-               }
-               put_ldev(mdev);
-       }
-       atomic_dec(&mdev->new_c_uuid);
-       wake_up(&mdev->misc_wait);
-
-       return 1;
-}
 
 static int drbd_thread_setup(void *arg)
 {
@@ -2291,9 +2272,9 @@ static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *
  * with page_count == 0 or PageSlab.
  */
 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
-                  int offset, size_t size)
+                  int offset, size_t size, unsigned msg_flags)
 {
-       int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
+       int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
        kunmap(page);
        if (sent == size)
                mdev->send_cnt += size>>9;
@@ -2301,7 +2282,7 @@ static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
 }
 
 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
-                   int offset, size_t size)
+                   int offset, size_t size, unsigned msg_flags)
 {
        mm_segment_t oldfs = get_fs();
        int sent, ok;
@@ -2314,14 +2295,15 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
         * __page_cache_release a page that would actually still be referenced
         * by someone, leading to some obscure delayed Oops somewhere else. */
        if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
-               return _drbd_no_send_page(mdev, page, offset, size);
+               return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
 
+       msg_flags |= MSG_NOSIGNAL;
        drbd_update_congested(mdev);
        set_fs(KERNEL_DS);
        do {
                sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
                                                        offset, len,
-                                                       MSG_NOSIGNAL);
+                                                       msg_flags);
                if (sent == -EAGAIN) {
                        if (we_should_drop_the_connection(mdev,
                                                          mdev->data.socket))
@@ -2350,9 +2332,11 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
 {
        struct bio_vec *bvec;
        int i;
+       /* hint all but last page with MSG_MORE */
        __bio_for_each_segment(bvec, bio, i, 0) {
                if (!_drbd_no_send_page(mdev, bvec->bv_page,
-                                    bvec->bv_offset, bvec->bv_len))
+                                    bvec->bv_offset, bvec->bv_len,
+                                    i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
                        return 0;
        }
        return 1;
@@ -2362,12 +2346,13 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
 {
        struct bio_vec *bvec;
        int i;
+       /* hint all but last page with MSG_MORE */
        __bio_for_each_segment(bvec, bio, i, 0) {
                if (!_drbd_send_page(mdev, bvec->bv_page,
-                                    bvec->bv_offset, bvec->bv_len))
+                                    bvec->bv_offset, bvec->bv_len,
+                                    i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
                        return 0;
        }
-
        return 1;
 }
 
@@ -2375,9 +2360,11 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 {
        struct page *page = e->pages;
        unsigned len = e->size;
+       /* hint all but last page with MSG_MORE */
        page_chain_for_each(page) {
                unsigned l = min_t(unsigned, len, PAGE_SIZE);
-               if (!_drbd_send_page(mdev, page, 0, l))
+               if (!_drbd_send_page(mdev, page, 0, l,
+                               page_chain_next(page) ? MSG_MORE : 0))
                        return 0;
                len -= l;
        }
@@ -2457,11 +2444,11 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
        p.dp_flags = cpu_to_be32(dp_flags);
        set_bit(UNPLUG_REMOTE, &mdev->flags);
        ok = (sizeof(p) ==
-               drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
+               drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
        if (ok && dgs) {
                dgb = mdev->int_dig_out;
                drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
-               ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+               ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
        }
        if (ok) {
                if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
@@ -2510,11 +2497,11 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
                return 0;
 
        ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
-                                       sizeof(p), MSG_MORE);
+                                       sizeof(p), dgs ? MSG_MORE : 0);
        if (ok && dgs) {
                dgb = mdev->int_dig_out;
                drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
-               ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+               ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
        }
        if (ok)
                ok = _drbd_send_zc_ee(mdev, e);
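
All of the MSG_MORE hunks above follow one pattern: flag every chunk except
the last, so the TCP stack coalesces the pieces instead of pushing a short
segment per call. A userspace sketch of the same idea (names illustrative;
partial sends are ignored for brevity):

    #include <sys/socket.h>
    #include <sys/uio.h>

    static ssize_t send_all(int fd, const struct iovec *iov, int n)
    {
            ssize_t total = 0;
            int i;

            for (i = 0; i < n; i++) {
                    /* hint all but the last chunk with MSG_MORE */
                    ssize_t sent = send(fd, iov[i].iov_base, iov[i].iov_len,
                                        i == n - 1 ? 0 : MSG_MORE);
                    if (sent < 0)
                            return -1;
                    total += sent;
            }
            return total;
    }
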
@@ -2708,7 +2695,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
        atomic_set(&mdev->net_cnt, 0);
        atomic_set(&mdev->packet_seq, 0);
        atomic_set(&mdev->pp_in_use, 0);
-       atomic_set(&mdev->new_c_uuid, 0);
 
        mutex_init(&mdev->md_io_mutex);
        mutex_init(&mdev->data.mutex);
@@ -2739,14 +2725,12 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
        INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
        INIT_LIST_HEAD(&mdev->delay_probes);
        INIT_LIST_HEAD(&mdev->delay_probe_work.list);
-       INIT_LIST_HEAD(&mdev->uuid_work.list);
 
        mdev->resync_work.cb  = w_resync_inactive;
        mdev->unplug_work.cb  = w_send_write_hint;
        mdev->md_sync_work.cb = w_md_sync;
        mdev->bm_io_work.w.cb = w_bitmap_io;
        mdev->delay_probe_work.cb = w_delay_probes;
-       mdev->uuid_work.cb = w_new_current_uuid;
        init_timer(&mdev->resync_timer);
        init_timer(&mdev->md_sync_timer);
        init_timer(&mdev->delay_probe_timer);
@@ -3799,7 +3783,7 @@ _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
        if (ret) {
                fault_count++;
 
-               if (printk_ratelimit())
+               if (__ratelimit(&drbd_ratelimit_state))
                        dev_warn(DEV, "***Simulating %s failure\n",
                                _drbd_fault_str(type));
        }
index bc9ab7fb2cc7d6eaeda21fcfa07706cfe5a4c922..dff48701b84d8784f0f85a9202abbaaf16c522fc 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -42,7 +42,6 @@
 #include <linux/unistd.h>
 #include <linux/vmalloc.h>
 #include <linux/random.h>
-#include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/scatterlist.h>
 #include "drbd_int.h"
@@ -571,6 +570,25 @@ static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
        return rv;
 }
 
+/* quoting tcp(7):
+ *   On individual connections, the socket buffer size must be set prior to the
+ *   listen(2) or connect(2) calls in order to have it take effect.
+ * This is our wrapper to do so.
+ */
+static void drbd_setbufsize(struct socket *sock, unsigned int snd,
+               unsigned int rcv)
+{
+       /* open coded SO_SNDBUF, SO_RCVBUF */
+       if (snd) {
+               sock->sk->sk_sndbuf = snd;
+               sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+       }
+       if (rcv) {
+               sock->sk->sk_rcvbuf = rcv;
+               sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+       }
+}
+
 static struct socket *drbd_try_connect(struct drbd_conf *mdev)
 {
        const char *what;
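
The rule quoted from tcp(7) applies to ordinary sockets as well; a userspace
sketch of the pattern drbd_setbufsize() open-codes (illustrative names, error
handling trimmed):

    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    static int connect_with_bufsize(const struct sockaddr_in *peer,
                                    int snd, int rcv)
    {
            int fd = socket(AF_INET, SOCK_STREAM, 0);

            if (fd < 0)
                    return -1;
            /* must happen before connect() to affect window negotiation */
            if (snd)
                    setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd));
            if (rcv)
                    setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv));
            if (connect(fd, (const struct sockaddr *)peer, sizeof(*peer)) < 0) {
                    close(fd);
                    return -1;
            }
            return fd;
    }
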
@@ -592,6 +610,8 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev)
 
        sock->sk->sk_rcvtimeo =
        sock->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;
+       drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
+                       mdev->net_conf->rcvbuf_size);
 
        /* explicitly bind to the configured IP as source IP
        *  for the outgoing connections.
@@ -670,6 +690,8 @@ static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
        s_listen->sk->sk_reuse    = 1; /* SO_REUSEADDR */
        s_listen->sk->sk_rcvtimeo = timeo;
        s_listen->sk->sk_sndtimeo = timeo;
+       drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
+                       mdev->net_conf->rcvbuf_size);
 
        what = "bind before listen";
        err = s_listen->ops->bind(s_listen,
@@ -856,16 +878,6 @@ retry:
        sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
        msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
 
-       if (mdev->net_conf->sndbuf_size) {
-               sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
-               sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
-       }
-
-       if (mdev->net_conf->rcvbuf_size) {
-               sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size;
-               sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
-       }
-
        /* NOT YET ...
         * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
         * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
@@ -1154,17 +1166,6 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
        unsigned n_bios = 0;
        unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
 
-       if (atomic_read(&mdev->new_c_uuid)) {
-               if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) {
-                       drbd_uuid_new_current(mdev);
-                       drbd_md_sync(mdev);
-
-                       atomic_dec(&mdev->new_c_uuid);
-                       wake_up(&mdev->misc_wait);
-               }
-               wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid));
-       }
-
        /* In most cases, we will only need one bio.  But in case the lower
         * level restrictions happen to be different at this offset on this
         * side than those of the sending peer, we may need to submit the
index 3397f11d0ba915fd29ca9cd6f65717d2f6a83f10..654f1ef5cbb0fb6e21c25430e8e2cace1a84630f 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -102,32 +102,7 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
                }
        }
 
-       /* if it was a local io error, we want to notify our
-        * peer about that, and see if we need to
-        * detach the disk and stuff.
-        * to avoid allocating some special work
-        * struct, reuse the request. */
-
-       /* THINK
-        * why do we do this not when we detect the error,
-        * but delay it until it is "done", i.e. possibly
-        * until the next barrier ack? */
-
-       if (rw == WRITE &&
-           ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
-               if (!(req->w.list.next == LIST_POISON1 ||
-                     list_empty(&req->w.list))) {
-                       /* DEBUG ASSERT only; if this triggers, we
-                        * probably corrupt the worker list here */
-                       dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next);
-                       dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev);
-               }
-               req->w.cb = w_io_error;
-               drbd_queue_work(&mdev->data.work, &req->w);
-               /* drbd_req_free() is done in w_io_error */
-       } else {
-               drbd_req_free(req);
-       }
+       drbd_req_free(req);
 }
 
 static void queue_barrier(struct drbd_conf *mdev)
@@ -453,9 +428,6 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
                req->rq_state |= RQ_LOCAL_COMPLETED;
                req->rq_state &= ~RQ_LOCAL_PENDING;
 
-               dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n",
-                     (unsigned long long)req->sector, req->size);
-               /* and now: check how to handle local io error. */
                __drbd_chk_io_error(mdev, FALSE);
                _req_may_be_done(req, m);
                put_ldev(mdev);
@@ -475,22 +447,21 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
                req->rq_state |= RQ_LOCAL_COMPLETED;
                req->rq_state &= ~RQ_LOCAL_PENDING;
 
-               dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
-                     (unsigned long long)req->sector, req->size);
-               /* _req_mod(req,to_be_send); oops, recursion... */
                D_ASSERT(!(req->rq_state & RQ_NET_MASK));
-               req->rq_state |= RQ_NET_PENDING;
-               inc_ap_pending(mdev);
 
                __drbd_chk_io_error(mdev, FALSE);
                put_ldev(mdev);
-               /* NOTE: if we have no connection,
-                * or know the peer has no good data either,
-                * then we don't actually need to "queue_for_net_read",
-                * but we do so anyways, since the drbd_io_error()
-                * and the potential state change to "Diskless"
-                * needs to be done from process context */
 
+               /* no point in retrying if there is no good remote data,
+                * or we have no connection. */
+               if (mdev->state.pdsk != D_UP_TO_DATE) {
+                       _req_may_be_done(req, m);
+                       break;
+               }
+
+               /* _req_mod(req,to_be_send); oops, recursion... */
+               req->rq_state |= RQ_NET_PENDING;
+               inc_ap_pending(mdev);
                /* fall through: _req_mod(req,queue_for_net_read); */
 
        case queue_for_net_read:
@@ -600,6 +571,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
                _req_may_be_done(req, m);
                break;
 
+       case read_retry_remote_canceled:
+               req->rq_state &= ~RQ_NET_QUEUED;
+               /* fall through, in case we raced with drbd_disconnect */
        case connection_lost_while_pending:
                /* transfer log cleanup after connection loss */
                /* assert something? */
index 16119d7056cc65e1eae0e97245eaf7d28f78e884..02d575d245187b301e223cfc8ef5859990a637f6 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -91,6 +91,7 @@ enum drbd_req_event {
        send_failed,
        handed_over_to_network,
        connection_lost_while_pending,
+       read_retry_remote_canceled,
        recv_acked_by_peer,
        write_acked_by_peer,
        write_acked_by_peer_and_sis, /* and set_in_sync */
index 727ff6339754776c389f717ca98276bbdd72714e..b623ceee2a4a6b06a42b9beef06d24f18b28bf71 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -224,9 +224,6 @@ void drbd_endio_pri(struct bio *bio, int error)
        enum drbd_req_event what;
        int uptodate = bio_flagged(bio, BIO_UPTODATE);
 
-       if (error)
-               dev_warn(DEV, "p %s: error=%d\n",
-                        bio_data_dir(bio) == WRITE ? "write" : "read", error);
        if (!error && !uptodate) {
                dev_warn(DEV, "p %s: setting error to -EIO\n",
                         bio_data_dir(bio) == WRITE ? "write" : "read");
@@ -257,20 +254,6 @@ void drbd_endio_pri(struct bio *bio, int error)
                complete_master_bio(mdev, &m);
 }
 
-int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
-       struct drbd_request *req = container_of(w, struct drbd_request, w);
-
-       /* NOTE: mdev->ldev can be NULL by the time we get here! */
-       /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
-
-       /* the only way this callback is scheduled is from _req_may_be_done,
-        * when it is done and had a local write error, see comments there */
-       drbd_req_free(req);
-
-       return TRUE;
-}
-
 int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 {
        struct drbd_request *req = container_of(w, struct drbd_request, w);
@@ -280,12 +263,9 @@ int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
         * to give the disk the chance to relocate that block */
 
        spin_lock_irq(&mdev->req_lock);
-       if (cancel ||
-           mdev->state.conn < C_CONNECTED ||
-           mdev->state.pdsk <= D_INCONSISTENT) {
-               _req_mod(req, send_canceled);
+       if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
+               _req_mod(req, read_retry_remote_canceled);
                spin_unlock_irq(&mdev->req_lock);
-               dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
                return 1;
        }
        spin_unlock_irq(&mdev->req_lock);
index ea8592b906968913499b9847f35389a31e5dc1a3..1d1088f48bc2dfe713d53a9af0fa4d51e1161428 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,7 +45,6 @@ struct wb_writeback_args {
        unsigned int for_kupdate:1;
        unsigned int range_cyclic:1;
        unsigned int for_background:1;
-       unsigned int sb_pinned:1;
 };
 
 /*
@@ -193,8 +192,7 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 }
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-                                struct wb_writeback_args *args,
-                                int wait)
+                                struct wb_writeback_args *args)
 {
        struct bdi_work *work;
 
@@ -206,8 +204,6 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
        if (work) {
                bdi_work_init(work, args);
                bdi_queue_work(bdi, work);
-               if (wait)
-                       bdi_wait_on_work_clear(work);
        } else {
                struct bdi_writeback *wb = &bdi->wb;
 
@@ -234,11 +230,6 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
                .sync_mode      = WB_SYNC_ALL,
                .nr_pages       = LONG_MAX,
                .range_cyclic   = 0,
-               /*
-                * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
-                * lets make it explicitly clear.
-                */
-               .sb_pinned      = 1,
        };
        struct bdi_work work;
 
@@ -254,23 +245,21 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
  * @bdi: the backing device to write from
  * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
- * @sb_locked: caller already holds sb umount sem.
  *
  * Description:
  *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
 *   started when this function returns; we make no guarantees on
- *   completion. Caller specifies whether sb umount sem is held already or not.
+ *   completion. Caller need not hold sb s_umount semaphore.
  *
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-                        long nr_pages, int sb_locked)
+                        long nr_pages)
 {
        struct wb_writeback_args args = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_NONE,
                .nr_pages       = nr_pages,
                .range_cyclic   = 1,
-               .sb_pinned      = sb_locked,
        };
 
        /*
@@ -282,7 +271,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
                args.for_background = 1;
        }
 
-       bdi_alloc_queue_work(bdi, &args, sb_locked);
+       bdi_alloc_queue_work(bdi, &args);
 }
 
 /*
@@ -595,7 +584,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
        /*
         * Caller must already hold the ref for this
         */
-       if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
+       if (wbc->sync_mode == WB_SYNC_ALL) {
                WARN_ON(!rwsem_is_locked(&sb->s_umount));
                return SB_NOT_PINNED;
        }
@@ -769,7 +758,6 @@ static long wb_writeback(struct bdi_writeback *wb,
                .for_kupdate            = args->for_kupdate,
                .for_background         = args->for_background,
                .range_cyclic           = args->range_cyclic,
-               .sb_pinned              = args->sb_pinned,
        };
        unsigned long oldest_jif;
        long wrote = 0;
@@ -912,7 +900,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
        while ((work = get_next_work_item(bdi, wb)) != NULL) {
                struct wb_writeback_args args = work->args;
-               int post_clear;
 
                /*
                 * Override sync mode, in case we must wait for completion
@@ -920,13 +907,11 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
                if (force_wait)
                        work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
-               post_clear = WB_SYNC_ALL || args.sb_pinned;
-
                /*
                 * If this isn't a data integrity operation, just notify
                 * that we have seen this work and we are now starting it.
                 */
-               if (!post_clear)
+               if (args.sync_mode == WB_SYNC_NONE)
                        wb_clear_pending(wb, work);
 
                wrote += wb_writeback(wb, &args);
@@ -935,7 +920,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
                 * This is a data integrity writeback, so only do the
                 * notification when we have completed the work.
                 */
-               if (post_clear)
+               if (args.sync_mode == WB_SYNC_ALL)
                        wb_clear_pending(wb, work);
        }
 
@@ -1011,7 +996,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
                if (!bdi_has_dirty_io(bdi))
                        continue;
 
-               bdi_alloc_queue_work(bdi, &args, 0);
+               bdi_alloc_queue_work(bdi, &args);
        }
 
        rcu_read_unlock();
@@ -1220,18 +1205,6 @@ static void wait_sb_inodes(struct super_block *sb)
        iput(old_inode);
 }
 
-static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
-{
-       unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-       unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-       long nr_to_write;
-
-       nr_to_write = nr_dirty + nr_unstable +
-                       (inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-       bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
-}
-
 /**
  * writeback_inodes_sb -       writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1243,21 +1216,16 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-       __writeback_inodes_sb(sb, 0);
-}
-EXPORT_SYMBOL(writeback_inodes_sb);
+       unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+       unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+       long nr_to_write;
 
-/**
- * writeback_inodes_sb_locked  - writeback dirty inodes from given super_block
- * @sb: the superblock
- *
- * Like writeback_inodes_sb(), except the caller already holds the
- * sb umount sem.
- */
-void writeback_inodes_sb_locked(struct super_block *sb)
-{
-       __writeback_inodes_sb(sb, 1);
+       nr_to_write = nr_dirty + nr_unstable +
+                       (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+       bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
 }
+EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
  * writeback_inodes_sb_if_idle -       start writeback if none underway
index db6eaaba0dd81fb777f2b64f1e385d40eb3f1819..69c4c7c13ea910db965d68a1c892a91791303119 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
 
 /*
  * The max size that a non-root user is allowed to grow the pipe. Can
- * be set by root in /proc/sys/fs/pipe-max-pages
+ * be set by root in /proc/sys/fs/pipe-max-size
  */
-unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
+unsigned int pipe_max_size = 1048576;
+
+/*
+ * Minimum pipe size, as required by POSIX
+ */
+unsigned int pipe_min_size = PAGE_SIZE;
 
 /*
  * We use a start+len construction, which provides full use of the 
@@ -1118,26 +1123,20 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
  * Allocate a new array of pipe buffers and copy the info over. Returns the
  * pipe size if successful, or return -ERROR on error.
  */
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 {
        struct pipe_buffer *bufs;
 
-       /*
-        * Must be a power-of-2 currently
-        */
-       if (!is_power_of_2(arg))
-               return -EINVAL;
-
        /*
         * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
         * expect a lot of shrink+grow operations, just free and allocate
         * again like we would do for growing. If the pipe currently
         * contains more buffers than arg, then return busy.
         */
-       if (arg < pipe->nrbufs)
+       if (nr_pages < pipe->nrbufs)
                return -EBUSY;
 
-       bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
+       bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
        if (unlikely(!bufs))
                return -ENOMEM;
 
@@ -1158,8 +1157,37 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
        pipe->curbuf = 0;
        kfree(pipe->bufs);
        pipe->bufs = bufs;
-       pipe->buffers = arg;
-       return arg;
+       pipe->buffers = nr_pages;
+       return nr_pages * PAGE_SIZE;
+}
+
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+       unsigned long nr_pages;
+
+       nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * will return an error.
+ */
+int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
+                size_t *lenp, loff_t *ppos)
+{
+       int ret;
+
+       ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+       if (ret < 0 || !write)
+               return ret;
+
+       pipe_max_size = round_pipe_size(pipe_max_size);
+       return ret;
 }
 
 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
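
A worked example of the rounding, assuming 4 KiB pages: round_pipe_size(70000)
computes nr_pages = ceil(70000 / 4096) = 18, rounds that up to the next power
of two (32 pages), and returns 32 << PAGE_SHIFT = 131072. A request for 70000
bytes therefore yields a 128 KiB pipe, and that rounded byte count is what
pipe_set_size() returns and F_GETPIPE_SZ reports.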
@@ -1174,23 +1202,24 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
        mutex_lock(&pipe->inode->i_mutex);
 
        switch (cmd) {
-       case F_SETPIPE_SZ:
-               if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) {
-                       ret = -EINVAL;
+       case F_SETPIPE_SZ: {
+               unsigned int size, nr_pages;
+
+               size = round_pipe_size(arg);
+               nr_pages = size >> PAGE_SHIFT;
+
+               if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
+                       ret = -EPERM;
                        goto out;
-               }
-               /*
-                * The pipe needs to be at least 2 pages large to
-                * guarantee POSIX behaviour.
-                */
-               if (arg < 2) {
+               } else if (nr_pages < 1) {
                        ret = -EINVAL;
                        goto out;
                }
-               ret = pipe_set_size(pipe, arg);
+               ret = pipe_set_size(pipe, nr_pages);
                break;
+               }
        case F_GETPIPE_SZ:
-               ret = pipe->buffers;
+               ret = pipe->buffers * PAGE_SIZE;
                break;
        default:
                ret = -EINVAL;
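
Seen from userspace, the fcntl interface is now byte-based end to end. A
minimal usage sketch (needs a kernel with this merge; _GNU_SOURCE exposes the
F_*PIPE_SZ constants):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fds[2];

            if (pipe(fds))
                    return 1;
            /* sizes above /proc/sys/fs/pipe-max-size need CAP_SYS_RESOURCE */
            if (fcntl(fds[0], F_SETPIPE_SZ, 70000) < 0)
                    perror("F_SETPIPE_SZ");
            /* reports bytes after power-of-two page rounding, e.g. 131072 */
            printf("pipe size: %d\n", fcntl(fds[0], F_GETPIPE_SZ));
            return 0;
    }
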
index ac22b00d86c3fab51fe3f39cc6cb50a8e7bbcc59..740e6b9faf7ab2e10e0ebfc68725d0d05e0b547f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -354,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
                                break;
 
                        error = add_to_page_cache_lru(page, mapping, index,
-                                               mapping_gfp_mask(mapping));
+                                               GFP_KERNEL);
                        if (unlikely(error)) {
                                page_cache_release(page);
                                if (error == -EEXIST)
index c9f83f480ec55ec751d41021c73c9b3aa8c0b99a..15aa6f03b2da11e5282c564d8d674374d4882a49 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
        if (wait)
                sync_inodes_sb(sb);
        else
-               writeback_inodes_sb_locked(sb);
+               writeback_inodes_sb(sb);
 
        if (sb->s_op->sync_fs)
                sb->s_op->sync_fs(sb, wait);
index e6e0cb5437e6ce6d00a7fcebe7756b9fdac92480..aee5f6ce166e9cc1f186e5e76cab77bb536edfd5 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -106,7 +106,7 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-                               long nr_pages, int sb_locked);
+                               long nr_pages);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_arm_supers_timer(void);
index 8b7f5e0914add2b751346224270a9e297d89f5c9..09a840264d6fdf54b0170970eb9c3c49acc9def6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1211,14 +1211,23 @@ struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 
 #ifdef CONFIG_BLK_CGROUP
+/*
+ * This should not be using sched_clock(). A real patch is in progress
+ * to fix this up, until that is in place we need to disable preemption
+ * around sched_clock() in this function and set_io_start_time_ns().
+ */
 static inline void set_start_time_ns(struct request *req)
 {
+       preempt_disable();
        req->start_time_ns = sched_clock();
+       preempt_enable();
 }
 
 static inline void set_io_start_time_ns(struct request *req)
 {
+       preempt_disable();
        req->io_start_time_ns = sched_clock();
+       preempt_enable();
 }
 
 static inline uint64_t rq_start_time_ns(struct request *req)
index 68530521ad0056edcda138183f8246b625b24c3f..30da4ae489724197a220a83a6e39d1cba0a264c4 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,7 +53,7 @@
 
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.8rc1"
+#define REL_VERSION "8.3.8rc2"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 94
index a0bb301afac0812998370e6af2a778b08f14dc1c..64d5291330312ac718ba7f649e3d428063127f39 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -7,7 +7,6 @@
 struct cfq_queue;
 struct cfq_io_context {
        void *key;
-       unsigned long dead_key;
 
        struct cfq_queue *cfqq[2];
 
index 16de3933c45ed32b6c4e51a00514f00fe1667d97..445796945ac9d1c491c1d740f0f2ca7bfa1ae22b 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -139,7 +139,9 @@ void pipe_lock(struct pipe_inode_info *);
 void pipe_unlock(struct pipe_inode_info *);
 void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
 
-extern unsigned int pipe_max_pages;
+extern unsigned int pipe_max_size, pipe_min_size;
+int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+
 
 /* Drop the inode semaphore and wait for a pipe event, atomically */
 void pipe_wait(struct pipe_inode_info *pipe);
index cc97d6caf2b32af24505ebbc73d7f98265efbde1..f64134653a8c782caf9211c88a784d818b4c1e5b 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -65,15 +65,6 @@ struct writeback_control {
         * so we use a single control to update them
         */
        unsigned no_nrwrite_index_update:1;
-
-       /*
-        * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE,
-        * the writeback code will pin the sb for the caller. However,
-        * for eg umount, the caller does WB_SYNC_NONE but already has
-        * the sb pinned. If the below is set, caller already has the
-        * sb pinned.
-        */
-       unsigned sb_pinned:1;
 };
 
 /*
@@ -82,7 +73,6 @@ struct writeback_control {
 struct bdi_writeback;
 int inode_wait(void *);
 void writeback_inodes_sb(struct super_block *);
-void writeback_inodes_sb_locked(struct super_block *);
 int writeback_inodes_sb_if_idle(struct super_block *);
 void sync_inodes_sb(struct super_block *);
 void writeback_inodes_wbc(struct writeback_control *wbc);
index 997080f00e0bcbfca3669af93a97eb5951c17811..d24f761f48769d925692efcbb233a276dad01905 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1471,12 +1471,12 @@ static struct ctl_table fs_table[] = {
        },
 #endif
        {
-               .procname       = "pipe-max-pages",
-               .data           = &pipe_max_pages,
+               .procname       = "pipe-max-size",
+               .data           = &pipe_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = &proc_dointvec_minmax,
-               .extra1         = &two,
+               .proc_handler   = &pipe_proc_fn,
+               .extra1         = &pipe_min_size,
        },
 /*
  * NOTE: do not add new entries to this table unless you have read
index b289310e2c899ba472ea1f7a92a01b908f549f18..5fa63bdf52e491990dd89f0ec4348fb0b0d0ca00 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping,
            (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
                               + global_page_state(NR_UNSTABLE_NFS))
                                          > background_thresh)))
-               bdi_start_writeback(bdi, NULL, 0, 0);
+               bdi_start_writeback(bdi, NULL, 0);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -707,7 +707,7 @@ void laptop_mode_timer_fn(unsigned long data)
         */
 
        if (bdi_has_dirty_io(&q->backing_dev_info))
-               bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0);
+               bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages);
 }
 
 /*