drivers/block/drbd/drbd_worker.c
1 /*
2    drbd_worker.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    drbd is free software; you can redistribute it and/or modify
11    it under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 2, or (at your option)
13    any later version.
14
15    drbd is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with drbd; see the file COPYING.  If not, write to
22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24  */
25
26 #include <linux/module.h>
27 #include <linux/drbd.h>
28 #include <linux/sched.h>
29 #include <linux/smp_lock.h>
30 #include <linux/wait.h>
31 #include <linux/mm.h>
32 #include <linux/memcontrol.h>
33 #include <linux/mm_inline.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/string.h>
37 #include <linux/scatterlist.h>
38
39 #include "drbd_int.h"
40 #include "drbd_req.h"
41
42 #define SLEEP_TIME (HZ/10)
43
44 static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
45
46
47
48 /* defined here:
49    drbd_md_io_complete
50    drbd_endio_sec
51    drbd_endio_pri
52
53  * more endio handlers:
54    atodb_endio in drbd_actlog.c
55    drbd_bm_async_io_complete in drbd_bitmap.c
56
57  * For all these callbacks, note the following:
58  * The callbacks will be called in irq context by the IDE drivers,
59  * and in Softirqs/Tasklets/BH context by the SCSI drivers.
60  * Try to get the locking right :)
61  *
62  */
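/* Rough map of the completion paths implemented below: drbd_endio_sec() runs
 * in the bio completion context described above, does only refcounting and
 * flag bookkeeping, and defers to drbd_endio_read_sec_final() /
 * drbd_endio_write_sec_final(); completed reads are then queued for the worker
 * via drbd_queue_work(), completed writes are moved to done_ee and the asender
 * is woken to send the ack. */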
63
64
65 /* About the global_state_lock
66    Each state transition on a device holds a read lock. In case we have
67    to evaluate the sync after dependencies, we grab a write lock, because
68    we need stable states on all devices for that.  */
69 rwlock_t global_state_lock;
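/* In this file only the writer side is visible: resume_next_sg(),
 * suspend_other_sg(), drbd_alter_sa() and drbd_start_resync() below all take
 * write_lock_irq(&global_state_lock) while they walk or change the sync-after
 * dependencies; the per-device state transitions mentioned above take the
 * read lock elsewhere. */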
70
71 /* used for synchronous meta data and bitmap IO
72  * submitted by drbd_md_sync_page_io()
73  */
74 void drbd_md_io_complete(struct bio *bio, int error)
75 {
76         struct drbd_md_io *md_io;
77
78         md_io = (struct drbd_md_io *)bio->bi_private;
79         md_io->error = error;
80
81         complete(&md_io->event);
82 }
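/* Hedged sketch (not from this file) of how a synchronous submitter such as
 * drbd_md_sync_page_io() is expected to pair with this completion handler:
 *
 *	struct drbd_md_io md_io;
 *	init_completion(&md_io.event);
 *	bio->bi_private = &md_io;
 *	bio->bi_end_io  = drbd_md_io_complete;
 *	submit_bio(rw, bio);
 *	wait_for_completion(&md_io.event);
 *	if (md_io.error)
 *		... handle the error ...
 */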
83
84 /* reads on behalf of the partner,
85  * "submitted" by the receiver
86  */
87 void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
88 {
89         unsigned long flags = 0;
90         struct drbd_conf *mdev = e->mdev;
91
92         D_ASSERT(e->block_id != ID_VACANT);
93
94         spin_lock_irqsave(&mdev->req_lock, flags);
95         mdev->read_cnt += e->size >> 9;
96         list_del(&e->w.list);
97         if (list_empty(&mdev->read_ee))
98                 wake_up(&mdev->ee_wait);
99         if (test_bit(__EE_WAS_ERROR, &e->flags))
100                 __drbd_chk_io_error(mdev, FALSE);
101         spin_unlock_irqrestore(&mdev->req_lock, flags);
102
103         drbd_queue_work(&mdev->data.work, &e->w);
104         put_ldev(mdev);
105 }
106
107 static int is_failed_barrier(int ee_flags)
108 {
109         return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
110                         == (EE_IS_BARRIER|EE_WAS_ERROR);
111 }
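/* The mask trick above reads: EE_IS_BARRIER and EE_WAS_ERROR must both be set
 * while EE_RESUBMITTED must still be clear, so a failed barrier write is
 * retried at most once via this path (the resubmission code below sets
 * EE_RESUBMITTED). */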
112
113 /* writes on behalf of the partner, or resync writes,
114  * "submitted" by the receiver, final stage.  */
115 static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
116 {
117         unsigned long flags = 0;
118         struct drbd_conf *mdev = e->mdev;
119         sector_t e_sector;
120         int do_wake;
121         int is_syncer_req;
122         int do_al_complete_io;
123
124         /* if this is a failed barrier request, disable use of barriers,
125          * and schedule for resubmission */
126         if (is_failed_barrier(e->flags)) {
127                 drbd_bump_write_ordering(mdev, WO_bdev_flush);
128                 spin_lock_irqsave(&mdev->req_lock, flags);
129                 list_del(&e->w.list);
130                 e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
131                 e->w.cb = w_e_reissue;
132                 /* put_ldev actually happens below, once we come here again. */
133                 __release(local);
134                 spin_unlock_irqrestore(&mdev->req_lock, flags);
135                 drbd_queue_work(&mdev->data.work, &e->w);
136                 return;
137         }
138
139         D_ASSERT(e->block_id != ID_VACANT);
140
141         /* after we moved e to done_ee,
142          * we may no longer access it,
143          * it may be freed/reused already!
144          * (as soon as we release the req_lock) */
145         e_sector = e->sector;
146         do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
147         is_syncer_req = is_syncer_block_id(e->block_id);
148
149         spin_lock_irqsave(&mdev->req_lock, flags);
150         mdev->writ_cnt += e->size >> 9;
151         list_del(&e->w.list); /* has been on active_ee or sync_ee */
152         list_add_tail(&e->w.list, &mdev->done_ee);
153
154         /* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
155          * neither did we wake possibly waiting conflicting requests.
156          * done from "drbd_process_done_ee" within the appropriate w.cb
157          * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
158
159         do_wake = is_syncer_req
160                 ? list_empty(&mdev->sync_ee)
161                 : list_empty(&mdev->active_ee);
162
163         if (test_bit(__EE_WAS_ERROR, &e->flags))
164                 __drbd_chk_io_error(mdev, FALSE);
165         spin_unlock_irqrestore(&mdev->req_lock, flags);
166
167         if (is_syncer_req)
168                 drbd_rs_complete_io(mdev, e_sector);
169
170         if (do_wake)
171                 wake_up(&mdev->ee_wait);
172
173         if (do_al_complete_io)
174                 drbd_al_complete_io(mdev, e_sector);
175
176         wake_asender(mdev);
177         put_ldev(mdev);
178 }
179
180 /* writes on behalf of the partner, or resync writes,
181  * "submitted" by the receiver.
182  */
183 void drbd_endio_sec(struct bio *bio, int error)
184 {
185         struct drbd_epoch_entry *e = bio->bi_private;
186         struct drbd_conf *mdev = e->mdev;
187         int uptodate = bio_flagged(bio, BIO_UPTODATE);
188         int is_write = bio_data_dir(bio) == WRITE;
189
190         if (error)
191                 dev_warn(DEV, "%s: error=%d s=%llus\n",
192                                 is_write ? "write" : "read", error,
193                                 (unsigned long long)e->sector);
194         if (!error && !uptodate) {
195                 dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
196                                 is_write ? "write" : "read",
197                                 (unsigned long long)e->sector);
198                 /* strange behavior of some lower level drivers...
199                  * fail the request by clearing the uptodate flag,
200                  * but do not return any error?! */
201                 error = -EIO;
202         }
203
204         if (error)
205                 set_bit(__EE_WAS_ERROR, &e->flags);
206
207         bio_put(bio); /* no need for the bio anymore */
208         if (atomic_dec_and_test(&e->pending_bios)) {
209                 if (is_write)
210                         drbd_endio_write_sec_final(e);
211                 else
212                         drbd_endio_read_sec_final(e);
213         }
214 }
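/* Note: e->pending_bios allows one epoch entry to be backed by several bios
 * (see the multi-bio EE remark in w_make_resync_request()); whichever bio
 * completes last triggers exactly one call of the *_final handler above. */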
215
216 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
217  */
218 void drbd_endio_pri(struct bio *bio, int error)
219 {
220         unsigned long flags;
221         struct drbd_request *req = bio->bi_private;
222         struct drbd_conf *mdev = req->mdev;
223         struct bio_and_error m;
224         enum drbd_req_event what;
225         int uptodate = bio_flagged(bio, BIO_UPTODATE);
226
227         if (error)
228                 dev_warn(DEV, "p %s: error=%d\n",
229                          bio_data_dir(bio) == WRITE ? "write" : "read", error);
230         if (!error && !uptodate) {
231                 dev_warn(DEV, "p %s: setting error to -EIO\n",
232                          bio_data_dir(bio) == WRITE ? "write" : "read");
233                 /* strange behavior of some lower level drivers...
234                  * fail the request by clearing the uptodate flag,
235                  * but do not return any error?! */
236                 error = -EIO;
237         }
238
239         /* to avoid recursion in __req_mod */
240         if (unlikely(error)) {
241                 what = (bio_data_dir(bio) == WRITE)
242                         ? write_completed_with_error
243                         : (bio_rw(bio) == READ)
244                           ? read_completed_with_error
245                           : read_ahead_completed_with_error;
246         } else
247                 what = completed_ok;
248
249         bio_put(req->private_bio);
250         req->private_bio = ERR_PTR(error);
251
252         spin_lock_irqsave(&mdev->req_lock, flags);
253         __req_mod(req, what, &m);
254         spin_unlock_irqrestore(&mdev->req_lock, flags);
255
256         if (m.bio)
257                 complete_master_bio(mdev, &m);
258 }
259
260 int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
261 {
262         struct drbd_request *req = container_of(w, struct drbd_request, w);
263
264         /* NOTE: mdev->ldev can be NULL by the time we get here! */
265         /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
266
267         /* the only way this callback is scheduled is from _req_may_be_done,
268          * when it is done and had a local write error, see comments there */
269         drbd_req_free(req);
270
271         return TRUE;
272 }
273
274 int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
275 {
276         struct drbd_request *req = container_of(w, struct drbd_request, w);
277
278         /* We should not detach for read io-error,
279          * but try to WRITE the P_DATA_REPLY to the failed location,
280          * to give the disk the chance to relocate that block */
281
282         spin_lock_irq(&mdev->req_lock);
283         if (cancel ||
284             mdev->state.conn < C_CONNECTED ||
285             mdev->state.pdsk <= D_INCONSISTENT) {
286                 _req_mod(req, send_canceled);
287                 spin_unlock_irq(&mdev->req_lock);
288                 dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
289                 return 1;
290         }
291         spin_unlock_irq(&mdev->req_lock);
292
293         return w_send_read_req(mdev, w, 0);
294 }
295
296 int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
297 {
298         ERR_IF(cancel) return 1;
299         dev_err(DEV, "resync inactive, but callback triggered??\n");
300         return 1; /* Simply ignore this! */
301 }
302
303 void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
304 {
305         struct hash_desc desc;
306         struct scatterlist sg;
307         struct page *page = e->pages;
308         struct page *tmp;
309         unsigned len;
310
311         desc.tfm = tfm;
312         desc.flags = 0;
313
314         sg_init_table(&sg, 1);
315         crypto_hash_init(&desc);
316
317         while ((tmp = page_chain_next(page))) {
318                 /* all but the last page will be fully used */
319                 sg_set_page(&sg, page, PAGE_SIZE, 0);
320                 crypto_hash_update(&desc, &sg, sg.length);
321                 page = tmp;
322         }
323         /* and now the last, possibly only partially used page */
324         len = e->size & (PAGE_SIZE - 1);
325         sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
326         crypto_hash_update(&desc, &sg, sg.length);
327         crypto_hash_final(&desc, digest);
328 }
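/* Illustration with made-up numbers, assuming 4 KiB pages: for e->size == 9216
 * the chain holds three pages; the first two are hashed with a full PAGE_SIZE
 * and the last with 9216 & (PAGE_SIZE - 1) == 1024 bytes.  The "len ?:
 * PAGE_SIZE" covers sizes that are an exact multiple of PAGE_SIZE. */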
329
330 void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
331 {
332         struct hash_desc desc;
333         struct scatterlist sg;
334         struct bio_vec *bvec;
335         int i;
336
337         desc.tfm = tfm;
338         desc.flags = 0;
339
340         sg_init_table(&sg, 1);
341         crypto_hash_init(&desc);
342
343         __bio_for_each_segment(bvec, bio, i, 0) {
344                 sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
345                 crypto_hash_update(&desc, &sg, sg.length);
346         }
347         crypto_hash_final(&desc, digest);
348 }
349
350 static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
351 {
352         struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
353         int digest_size;
354         void *digest;
355         int ok;
356
357         D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
358
359         if (unlikely(cancel)) {
360                 drbd_free_ee(mdev, e);
361                 return 1;
362         }
363
364         if (likely((e->flags & EE_WAS_ERROR) == 0)) {
365                 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
366                 digest = kmalloc(digest_size, GFP_NOIO);
367                 if (digest) {
368                         drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
369
370                         inc_rs_pending(mdev);
371                         ok = drbd_send_drequest_csum(mdev,
372                                                      e->sector,
373                                                      e->size,
374                                                      digest,
375                                                      digest_size,
376                                                      P_CSUM_RS_REQUEST);
377                         kfree(digest);
378                 } else {
379                         dev_err(DEV, "kmalloc() of digest failed.\n");
380                         ok = 0;
381                 }
382         } else
383                 ok = 1;
384
385         drbd_free_ee(mdev, e);
386
387         if (unlikely(!ok))
388                 dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
389         return ok;
390 }
391
392 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
393
394 static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
395 {
396         struct drbd_epoch_entry *e;
397
398         if (!get_ldev(mdev))
399                 return 0;
400
401         /* GFP_TRY, because if there is no memory available right now, this may
402          * be rescheduled for later. It is "only" background resync, after all. */
403         e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
404         if (!e)
405                 goto fail;
406
407         spin_lock_irq(&mdev->req_lock);
408         list_add(&e->w.list, &mdev->read_ee);
409         spin_unlock_irq(&mdev->req_lock);
410
411         e->w.cb = w_e_send_csum;
412         if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
413                 return 1;
414
415         drbd_free_ee(mdev, e);
416 fail:
417         put_ldev(mdev);
418         return 2;
419 }
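/* Return value convention, as consumed by the switch in
 * w_make_resync_request(): 1 == request submitted, 0 == local disk could not
 * be referenced (treated as disk failure by the caller), 2 == allocation or
 * submission failed (the caller retries later). */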
420
421 void resync_timer_fn(unsigned long data)
422 {
423         unsigned long flags;
424         struct drbd_conf *mdev = (struct drbd_conf *) data;
425         int queue;
426
427         spin_lock_irqsave(&mdev->req_lock, flags);
428
429         if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
430                 queue = 1;
431                 if (mdev->state.conn == C_VERIFY_S)
432                         mdev->resync_work.cb = w_make_ov_request;
433                 else
434                         mdev->resync_work.cb = w_make_resync_request;
435         } else {
436                 queue = 0;
437                 mdev->resync_work.cb = w_resync_inactive;
438         }
439
440         spin_unlock_irqrestore(&mdev->req_lock, flags);
441
442         /* harmless race: list_empty outside data.work.q_lock */
443         if (list_empty(&mdev->resync_work.list) && queue)
444                 drbd_queue_work(&mdev->data.work, &mdev->resync_work);
445 }
446
447 static int calc_resync_rate(struct drbd_conf *mdev)
448 {
449         int d = mdev->data_delay / 1000; /* us -> ms */
450         int td = mdev->sync_conf.throttle_th * 100;  /* 0.1s -> ms */
451         int hd = mdev->sync_conf.hold_off_th * 100;  /* 0.1s -> ms */
452         int cr = mdev->sync_conf.rate;
453
454         return d <= td ? cr :
455                 d >= hd ? 0 :
456                 cr + (cr * (td - d) / (hd - td));
457 }
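/* Example with made-up numbers: rate cr = 10000, throttle threshold
 * td = 2000 ms, hold-off threshold hd = 10000 ms.  A measured data_delay of
 * d = 6000 ms gives 10000 + 10000 * (2000 - 6000) / (10000 - 2000) = 5000,
 * i.e. the rate ramps down linearly from cr at d == td to 0 at d >= hd. */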
458
459 int w_make_resync_request(struct drbd_conf *mdev,
460                 struct drbd_work *w, int cancel)
461 {
462         unsigned long bit;
463         sector_t sector;
464         const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
465         int max_segment_size;
466         int number, i, size, pe, mx;
467         int align, queued, sndbuf;
468
469         if (unlikely(cancel))
470                 return 1;
471
472         if (unlikely(mdev->state.conn < C_CONNECTED)) {
473                 dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
474                 return 0;
475         }
476
477         if (mdev->state.conn != C_SYNC_TARGET)
478                 dev_err(DEV, "%s in w_make_resync_request\n",
479                         drbd_conn_str(mdev->state.conn));
480
481         if (!get_ldev(mdev)) {
482                 /* Since we only need to access mdev->resync, a
483                    get_ldev_if_state(mdev, D_FAILED) would be sufficient; but
484                    continuing resync with a broken disk makes no sense at
485                    all */
486                 dev_err(DEV, "Disk broke down during resync!\n");
487                 mdev->resync_work.cb = w_resync_inactive;
488                 return 1;
489         }
490
491         /* starting with drbd 8.3.8, we can handle multi-bio EEs,
492          * if it should be necessary */
493         max_segment_size = mdev->agreed_pro_version < 94 ?
494                 queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
495
496         mdev->c_sync_rate = calc_resync_rate(mdev);
497         number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
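        /* Assuming the usual 4 KiB BM_BLOCK_SIZE and SLEEP_TIME == HZ/10, this
         * works out to c_sync_rate / 40 requests per timer tick: ten ticks per
         * second times number times 4 KiB is roughly c_sync_rate KiB per
         * second. */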
498         pe = atomic_read(&mdev->rs_pending_cnt);
499
500         mutex_lock(&mdev->data.mutex);
501         if (mdev->data.socket)
502                 mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
503         else
504                 mx = 1;
505         mutex_unlock(&mdev->data.mutex);
506
507         /* For resync rates >160MB/sec, allow more pending RS requests */
508         if (number > mx)
509                 mx = number;
510
511         /* Limit the number of pending RS requests to no more than the peer's receive buffer */
512         if ((pe + number) > mx) {
513                 number = mx - pe;
514         }
515
516         for (i = 0; i < number; i++) {
517                 /* Stop generating RS requests, when half of the send buffer is filled */
518                 mutex_lock(&mdev->data.mutex);
519                 if (mdev->data.socket) {
520                         queued = mdev->data.socket->sk->sk_wmem_queued;
521                         sndbuf = mdev->data.socket->sk->sk_sndbuf;
522                 } else {
523                         queued = 1;
524                         sndbuf = 0;
525                 }
526                 mutex_unlock(&mdev->data.mutex);
527                 if (queued > sndbuf / 2)
528                         goto requeue;
529
530 next_sector:
531                 size = BM_BLOCK_SIZE;
532                 bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
533
534                 if (bit == -1UL) {
535                         mdev->bm_resync_fo = drbd_bm_bits(mdev);
536                         mdev->resync_work.cb = w_resync_inactive;
537                         put_ldev(mdev);
538                         return 1;
539                 }
540
541                 sector = BM_BIT_TO_SECT(bit);
542
543                 if (drbd_try_rs_begin_io(mdev, sector)) {
544                         mdev->bm_resync_fo = bit;
545                         goto requeue;
546                 }
547                 mdev->bm_resync_fo = bit + 1;
548
549                 if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
550                         drbd_rs_complete_io(mdev, sector);
551                         goto next_sector;
552                 }
553
554 #if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
555                 /* try to find some adjacent bits.
556                  * we stop if we have already the maximum req size.
557                  *
558                  * Additionally always align bigger requests, in order to
559                  * be prepared for all stripe sizes of software RAIDs.
560                  */
561                 align = 1;
562                 for (;;) {
563                         if (size + BM_BLOCK_SIZE > max_segment_size)
564                                 break;
565
566                         /* Be always aligned */
567                         if (sector & ((1<<(align+3))-1))
568                                 break;
569
570                         /* do not cross extent boundaries */
571                         if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
572                                 break;
573                         /* now, is it actually dirty, after all?
574                          * caution, drbd_bm_test_bit is tri-state for some
575                          * obscure reason; ( b == 0 ) would get the out-of-band
576                          * only accidentally right because of the "oddly sized"
577                          * adjustment below */
578                         if (drbd_bm_test_bit(mdev, bit+1) != 1)
579                                 break;
580                         bit++;
581                         size += BM_BLOCK_SIZE;
582                         if ((BM_BLOCK_SIZE << align) <= size)
583                                 align++;
584                         i++;
585                 }
586                 /* if we merged some,
587                  * reset the offset to start the next drbd_bm_find_next from */
588                 if (size > BM_BLOCK_SIZE)
589                         mdev->bm_resync_fo = bit + 1;
590 #endif
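                /* The alignment test above, assuming 512-byte sectors: a
                 * request is only grown while its start sector stays aligned
                 * to 1 << (align + 3) sectors, i.e. (1 << align) * 4 KiB,
                 * which matches the stripe-size preparation mentioned in the
                 * comment before the loop. */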
591
592                 /* adjust very last sectors, in case we are oddly sized */
593                 if (sector + (size>>9) > capacity)
594                         size = (capacity-sector)<<9;
595                 if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
596                         switch (read_for_csum(mdev, sector, size)) {
597                         case 0: /* Disk failure*/
598                                 put_ldev(mdev);
599                                 return 0;
600                         case 2: /* Allocation failed */
601                                 drbd_rs_complete_io(mdev, sector);
602                                 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
603                                 goto requeue;
604                         /* case 1: everything ok */
605                         }
606                 } else {
607                         inc_rs_pending(mdev);
608                         if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
609                                                sector, size, ID_SYNCER)) {
610                                 dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
611                                 dec_rs_pending(mdev);
612                                 put_ldev(mdev);
613                                 return 0;
614                         }
615                 }
616         }
617
618         if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
619                 /* last syncer _request_ was sent,
620                  * but the P_RS_DATA_REPLY not yet received.  sync will end (and
621                  * next sync group will resume), as soon as we receive the last
622                  * resync data block, and the last bit is cleared.
623                  * until then resync "work" is "inactive" ...
624                  */
625                 mdev->resync_work.cb = w_resync_inactive;
626                 put_ldev(mdev);
627                 return 1;
628         }
629
630  requeue:
631         mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
632         put_ldev(mdev);
633         return 1;
634 }
635
636 static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
637 {
638         int number, i, size;
639         sector_t sector;
640         const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
641
642         if (unlikely(cancel))
643                 return 1;
644
645         if (unlikely(mdev->state.conn < C_CONNECTED)) {
646                 dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
647                 return 0;
648         }
649
650         number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
651         if (atomic_read(&mdev->rs_pending_cnt) > number)
652                 goto requeue;
653
654         number -= atomic_read(&mdev->rs_pending_cnt);
655
656         sector = mdev->ov_position;
657         for (i = 0; i < number; i++) {
658                 if (sector >= capacity) {
659                         mdev->resync_work.cb = w_resync_inactive;
660                         return 1;
661                 }
662
663                 size = BM_BLOCK_SIZE;
664
665                 if (drbd_try_rs_begin_io(mdev, sector)) {
666                         mdev->ov_position = sector;
667                         goto requeue;
668                 }
669
670                 if (sector + (size>>9) > capacity)
671                         size = (capacity-sector)<<9;
672
673                 inc_rs_pending(mdev);
674                 if (!drbd_send_ov_request(mdev, sector, size)) {
675                         dec_rs_pending(mdev);
676                         return 0;
677                 }
678                 sector += BM_SECT_PER_BIT;
679         }
680         mdev->ov_position = sector;
681
682  requeue:
683         mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
684         return 1;
685 }
686
687
688 int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
689 {
690         kfree(w);
691         ov_oos_print(mdev);
692         drbd_resync_finished(mdev);
693
694         return 1;
695 }
696
697 static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
698 {
699         kfree(w);
700
701         drbd_resync_finished(mdev);
702
703         return 1;
704 }
705
706 int drbd_resync_finished(struct drbd_conf *mdev)
707 {
708         unsigned long db, dt, dbdt;
709         unsigned long n_oos;
710         union drbd_state os, ns;
711         struct drbd_work *w;
712         char *khelper_cmd = NULL;
713
714         /* Remove all elements from the resync LRU. Since future actions
715          * might set bits in the (main) bitmap, the entries in the
716          * resync LRU would otherwise be wrong. */
717         if (drbd_rs_del_all(mdev)) {
718                 /* In case this is not possible now, most probably because
719                  * there are P_RS_DATA_REPLY packets lingering on the worker's
720                  * queue (or even the read operations for those packets
721                  * are not finished by now).  Retry in 100ms. */
722
723                 drbd_kick_lo(mdev);
724                 __set_current_state(TASK_INTERRUPTIBLE);
725                 schedule_timeout(HZ / 10);
726                 w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
727                 if (w) {
728                         w->cb = w_resync_finished;
729                         drbd_queue_work(&mdev->data.work, w);
730                         return 1;
731                 }
732                 dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
733         }
734
735         dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
736         if (dt <= 0)
737                 dt = 1;
738         db = mdev->rs_total;
739         dbdt = Bit2KB(db/dt);
740         mdev->rs_paused /= HZ;
741
742         if (!get_ldev(mdev))
743                 goto out;
744
745         spin_lock_irq(&mdev->req_lock);
746         os = mdev->state;
747
748         /* This protects us against multiple calls (that can happen in the presence
749            of application IO), and against connectivity loss just before we arrive here. */
750         if (os.conn <= C_CONNECTED)
751                 goto out_unlock;
752
753         ns = os;
754         ns.conn = C_CONNECTED;
755
756         dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
757              (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
758              "Online verify " : "Resync",
759              dt + mdev->rs_paused, mdev->rs_paused, dbdt);
760
761         n_oos = drbd_bm_total_weight(mdev);
762
763         if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
764                 if (n_oos) {
765                         dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
766                               n_oos, Bit2KB(1));
767                         khelper_cmd = "out-of-sync";
768                 }
769         } else {
770                 D_ASSERT((n_oos - mdev->rs_failed) == 0);
771
772                 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
773                         khelper_cmd = "after-resync-target";
774
775                 if (mdev->csums_tfm && mdev->rs_total) {
776                         const unsigned long s = mdev->rs_same_csum;
777                         const unsigned long t = mdev->rs_total;
778                         const int ratio =
779                                 (t == 0)     ? 0 :
780                         (t < 100000) ? ((s*100)/t) : (s/(t/100));
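                        /* Both branches compute s * 100 / t, the percentage of
                         * blocks with equal checksums; the t < 100000 guard
                         * presumably keeps s * 100 within unsigned long range
                         * on 32-bit, while the second form trades a little
                         * precision for avoiding that multiplication. */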
781                         dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
782                              "transferred %luK total %luK\n",
783                              ratio,
784                              Bit2KB(mdev->rs_same_csum),
785                              Bit2KB(mdev->rs_total - mdev->rs_same_csum),
786                              Bit2KB(mdev->rs_total));
787                 }
788         }
789
790         if (mdev->rs_failed) {
791                 dev_info(DEV, "            %lu failed blocks\n", mdev->rs_failed);
792
793                 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
794                         ns.disk = D_INCONSISTENT;
795                         ns.pdsk = D_UP_TO_DATE;
796                 } else {
797                         ns.disk = D_UP_TO_DATE;
798                         ns.pdsk = D_INCONSISTENT;
799                 }
800         } else {
801                 ns.disk = D_UP_TO_DATE;
802                 ns.pdsk = D_UP_TO_DATE;
803
804                 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
805                         if (mdev->p_uuid) {
806                                 int i;
807                                 for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
808                                         _drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
809                                 drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
810                                 _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
811                         } else {
812                                 dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
813                         }
814                 }
815
816                 drbd_uuid_set_bm(mdev, 0UL);
817
818                 if (mdev->p_uuid) {
819                         /* Now the two UUID sets are equal, update what we
820                          * know of the peer. */
821                         int i;
822                         for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
823                                 mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
824                 }
825         }
826
827         _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
828 out_unlock:
829         spin_unlock_irq(&mdev->req_lock);
830         put_ldev(mdev);
831 out:
832         mdev->rs_total  = 0;
833         mdev->rs_failed = 0;
834         mdev->rs_paused = 0;
835         mdev->ov_start_sector = 0;
836
837         if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
838                 dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
839                 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
840         }
841
842         if (khelper_cmd)
843                 drbd_khelper(mdev, khelper_cmd);
844
845         return 1;
846 }
847
848 /* helper */
849 static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
850 {
851         if (drbd_ee_has_active_page(e)) {
852                 /* This might happen if sendpage() has not finished */
853                 spin_lock_irq(&mdev->req_lock);
854                 list_add_tail(&e->w.list, &mdev->net_ee);
855                 spin_unlock_irq(&mdev->req_lock);
856         } else
857                 drbd_free_ee(mdev, e);
858 }
859
860 /**
861  * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
862  * @mdev:       DRBD device.
863  * @w:          work object.
864  * @cancel:     The connection will be closed anyways
865  */
866 int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
867 {
868         struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
869         int ok;
870
871         if (unlikely(cancel)) {
872                 drbd_free_ee(mdev, e);
873                 dec_unacked(mdev);
874                 return 1;
875         }
876
877         if (likely((e->flags & EE_WAS_ERROR) == 0)) {
878                 ok = drbd_send_block(mdev, P_DATA_REPLY, e);
879         } else {
880                 if (__ratelimit(&drbd_ratelimit_state))
881                         dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
882                             (unsigned long long)e->sector);
883
884                 ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
885         }
886
887         dec_unacked(mdev);
888
889         move_to_net_ee_or_free(mdev, e);
890
891         if (unlikely(!ok))
892                 dev_err(DEV, "drbd_send_block() failed\n");
893         return ok;
894 }
895
896 /**
897  * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
898  * @mdev:       DRBD device.
899  * @w:          work object.
900  * @cancel:     The connection will be closed anyways
901  */
902 int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
903 {
904         struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
905         int ok;
906
907         if (unlikely(cancel)) {
908                 drbd_free_ee(mdev, e);
909                 dec_unacked(mdev);
910                 return 1;
911         }
912
913         if (get_ldev_if_state(mdev, D_FAILED)) {
914                 drbd_rs_complete_io(mdev, e->sector);
915                 put_ldev(mdev);
916         }
917
918         if (likely((e->flags & EE_WAS_ERROR) == 0)) {
919                 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
920                         inc_rs_pending(mdev);
921                         ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
922                 } else {
923                         if (__ratelimit(&drbd_ratelimit_state))
924                                 dev_err(DEV, "Not sending RSDataReply, "
925                                     "partner DISKLESS!\n");
926                         ok = 1;
927                 }
928         } else {
929                 if (__ratelimit(&drbd_ratelimit_state))
930                         dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
931                             (unsigned long long)e->sector);
932
933                 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
934
935                 /* update resync data with failure */
936                 drbd_rs_failed_io(mdev, e->sector, e->size);
937         }
938
939         dec_unacked(mdev);
940
941         move_to_net_ee_or_free(mdev, e);
942
943         if (unlikely(!ok))
944                 dev_err(DEV, "drbd_send_block() failed\n");
945         return ok;
946 }
947
948 int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
949 {
950         struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
951         struct digest_info *di;
952         int digest_size;
953         void *digest = NULL;
954         int ok, eq = 0;
955
956         if (unlikely(cancel)) {
957                 drbd_free_ee(mdev, e);
958                 dec_unacked(mdev);
959                 return 1;
960         }
961
962         drbd_rs_complete_io(mdev, e->sector);
963
964         di = (struct digest_info *)(unsigned long)e->block_id;
965
966         if (likely((e->flags & EE_WAS_ERROR) == 0)) {
967                 /* quick hack to try to avoid a race against reconfiguration.
968                  * a real fix would be much more involved,
969                  * introducing more locking mechanisms */
970                 if (mdev->csums_tfm) {
971                         digest_size = crypto_hash_digestsize(mdev->csums_tfm);
972                         D_ASSERT(digest_size == di->digest_size);
973                         digest = kmalloc(digest_size, GFP_NOIO);
974                 }
975                 if (digest) {
976                         drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
977                         eq = !memcmp(digest, di->digest, digest_size);
978                         kfree(digest);
979                 }
980
981                 if (eq) {
982                         drbd_set_in_sync(mdev, e->sector, e->size);
983                         /* rs_same_csums unit is BM_BLOCK_SIZE */
984                         mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
985                         ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
986                 } else {
987                         inc_rs_pending(mdev);
988                         e->block_id = ID_SYNCER;
989                         ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
990                 }
991         } else {
992                 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
993                 if (__ratelimit(&drbd_ratelimit_state))
994                         dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
995         }
996
997         dec_unacked(mdev);
998
999         kfree(di);
1000
1001         move_to_net_ee_or_free(mdev, e);
1002
1003         if (unlikely(!ok))
1004                 dev_err(DEV, "drbd_send_block/ack() failed\n");
1005         return ok;
1006 }
1007
1008 int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1009 {
1010         struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1011         int digest_size;
1012         void *digest;
1013         int ok = 1;
1014
1015         if (unlikely(cancel))
1016                 goto out;
1017
1018         if (unlikely((e->flags & EE_WAS_ERROR) != 0))
1019                 goto out;
1020
1021         digest_size = crypto_hash_digestsize(mdev->verify_tfm);
1022         /* FIXME if this allocation fails, online verify will not terminate! */
1023         digest = kmalloc(digest_size, GFP_NOIO);
1024         if (digest) {
1025                 drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
1026                 inc_rs_pending(mdev);
1027                 ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
1028                                              digest, digest_size, P_OV_REPLY);
1029                 if (!ok)
1030                         dec_rs_pending(mdev);
1031                 kfree(digest);
1032         }
1033
1034 out:
1035         drbd_free_ee(mdev, e);
1036
1037         dec_unacked(mdev);
1038
1039         return ok;
1040 }
1041
1042 void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
1043 {
1044         if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
1045                 mdev->ov_last_oos_size += size>>9;
1046         } else {
1047                 mdev->ov_last_oos_start = sector;
1048                 mdev->ov_last_oos_size = size>>9;
1049         }
1050         drbd_set_out_of_sync(mdev, sector, size);
1051         set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
1052 }
1053
1054 int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1055 {
1056         struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1057         struct digest_info *di;
1058         int digest_size;
1059         void *digest;
1060         int ok, eq = 0;
1061
1062         if (unlikely(cancel)) {
1063                 drbd_free_ee(mdev, e);
1064                 dec_unacked(mdev);
1065                 return 1;
1066         }
1067
1068         /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1069          * the resync lru has been cleaned up already */
1070         drbd_rs_complete_io(mdev, e->sector);
1071
1072         di = (struct digest_info *)(unsigned long)e->block_id;
1073
1074         if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1075                 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
1076                 digest = kmalloc(digest_size, GFP_NOIO);
1077                 if (digest) {
1078                         drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
1079
1080                         D_ASSERT(digest_size == di->digest_size);
1081                         eq = !memcmp(digest, di->digest, digest_size);
1082                         kfree(digest);
1083                 }
1084         } else {
1085                 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
1086                 if (__ratelimit(&drbd_ratelimit_state))
1087                         dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
1088         }
1089
1090         dec_unacked(mdev);
1091
1092         kfree(di);
1093
1094         if (!eq)
1095                 drbd_ov_oos_found(mdev, e->sector, e->size);
1096         else
1097                 ov_oos_print(mdev);
1098
1099         ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
1100                               eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1101
1102         drbd_free_ee(mdev, e);
1103
1104         if (--mdev->ov_left == 0) {
1105                 ov_oos_print(mdev);
1106                 drbd_resync_finished(mdev);
1107         }
1108
1109         return ok;
1110 }
1111
1112 int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1113 {
1114         struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
1115         complete(&b->done);
1116         return 1;
1117 }
1118
1119 int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1120 {
1121         struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
1122         struct p_barrier *p = &mdev->data.sbuf.barrier;
1123         int ok = 1;
1124
1125         /* really avoid racing with tl_clear.  w.cb may have been referenced
1126          * just before it was reassigned and re-queued, so double check that.
1127          * actually, this race was harmless, since we only try to send the
1128          * barrier packet here, and otherwise do nothing with the object.
1129          * but compare with the head of w_clear_epoch */
1130         spin_lock_irq(&mdev->req_lock);
1131         if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
1132                 cancel = 1;
1133         spin_unlock_irq(&mdev->req_lock);
1134         if (cancel)
1135                 return 1;
1136
1137         if (!drbd_get_data_sock(mdev))
1138                 return 0;
1139         p->barrier = b->br_number;
1140         /* inc_ap_pending was done where this was queued.
1141          * dec_ap_pending will be done in got_BarrierAck
1142          * or (on connection loss) in w_clear_epoch.  */
1143         ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
1144                                 (struct p_header *)p, sizeof(*p), 0);
1145         drbd_put_data_sock(mdev);
1146
1147         return ok;
1148 }
1149
1150 int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1151 {
1152         if (cancel)
1153                 return 1;
1154         return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
1155 }
1156
1157 /**
1158  * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1159  * @mdev:       DRBD device.
1160  * @w:          work object.
1161  * @cancel:     The connection will be closed anyways
1162  */
1163 int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1164 {
1165         struct drbd_request *req = container_of(w, struct drbd_request, w);
1166         int ok;
1167
1168         if (unlikely(cancel)) {
1169                 req_mod(req, send_canceled);
1170                 return 1;
1171         }
1172
1173         ok = drbd_send_dblock(mdev, req);
1174         req_mod(req, ok ? handed_over_to_network : send_failed);
1175
1176         return ok;
1177 }
1178
1179 /**
1180  * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
1181  * @mdev:       DRBD device.
1182  * @w:          work object.
1183  * @cancel:     The connection will be closed anyways
1184  */
1185 int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1186 {
1187         struct drbd_request *req = container_of(w, struct drbd_request, w);
1188         int ok;
1189
1190         if (unlikely(cancel)) {
1191                 req_mod(req, send_canceled);
1192                 return 1;
1193         }
1194
1195         ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
1196                                 (unsigned long)req);
1197
1198         if (!ok) {
1199                 /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
1200                  * so this is probably redundant */
1201                 if (mdev->state.conn >= C_CONNECTED)
1202                         drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
1203         }
1204         req_mod(req, ok ? handed_over_to_network : send_failed);
1205
1206         return ok;
1207 }
1208
1209 static int _drbd_may_sync_now(struct drbd_conf *mdev)
1210 {
1211         struct drbd_conf *odev = mdev;
1212
1213         while (1) {
1214                 if (odev->sync_conf.after == -1)
1215                         return 1;
1216                 odev = minor_to_mdev(odev->sync_conf.after);
1217                 ERR_IF(!odev) return 1;
1218                 if ((odev->state.conn >= C_SYNC_SOURCE &&
1219                      odev->state.conn <= C_PAUSED_SYNC_T) ||
1220                     odev->state.aftr_isp || odev->state.peer_isp ||
1221                     odev->state.user_isp)
1222                         return 0;
1223         }
1224 }
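/* In short: follow the sync-after chain of minors (sync_conf.after == -1
 * terminates it) and report 0 as soon as any device in the chain is between
 * C_SYNC_SOURCE and C_PAUSED_SYNC_T (i.e. resyncing or paused) or has one of
 * the *_isp suspend flags set.  Cycles in the chain are rejected up front by
 * sync_after_error() below. */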
1225
1226 /**
1227  * _drbd_pause_after() - Pause resync on all devices that may not resync now
1228  * @mdev:       DRBD device.
1229  *
1230  * Called from process context only (admin command and after_state_ch).
1231  */
1232 static int _drbd_pause_after(struct drbd_conf *mdev)
1233 {
1234         struct drbd_conf *odev;
1235         int i, rv = 0;
1236
1237         for (i = 0; i < minor_count; i++) {
1238                 odev = minor_to_mdev(i);
1239                 if (!odev)
1240                         continue;
1241                 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1242                         continue;
1243                 if (!_drbd_may_sync_now(odev))
1244                         rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
1245                                != SS_NOTHING_TO_DO);
1246         }
1247
1248         return rv;
1249 }
1250
1251 /**
1252  * _drbd_resume_next() - Resume resync on all devices that may resync now
1253  * @mdev:       DRBD device.
1254  *
1255  * Called from process context only (admin command and worker).
1256  */
1257 static int _drbd_resume_next(struct drbd_conf *mdev)
1258 {
1259         struct drbd_conf *odev;
1260         int i, rv = 0;
1261
1262         for (i = 0; i < minor_count; i++) {
1263                 odev = minor_to_mdev(i);
1264                 if (!odev)
1265                         continue;
1266                 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1267                         continue;
1268                 if (odev->state.aftr_isp) {
1269                         if (_drbd_may_sync_now(odev))
1270                                 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
1271                                                         CS_HARD, NULL)
1272                                        != SS_NOTHING_TO_DO) ;
1273                 }
1274         }
1275         return rv;
1276 }
1277
1278 void resume_next_sg(struct drbd_conf *mdev)
1279 {
1280         write_lock_irq(&global_state_lock);
1281         _drbd_resume_next(mdev);
1282         write_unlock_irq(&global_state_lock);
1283 }
1284
1285 void suspend_other_sg(struct drbd_conf *mdev)
1286 {
1287         write_lock_irq(&global_state_lock);
1288         _drbd_pause_after(mdev);
1289         write_unlock_irq(&global_state_lock);
1290 }
1291
1292 static int sync_after_error(struct drbd_conf *mdev, int o_minor)
1293 {
1294         struct drbd_conf *odev;
1295
1296         if (o_minor == -1)
1297                 return NO_ERROR;
1298         if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
1299                 return ERR_SYNC_AFTER;
1300
1301         /* check for loops */
1302         odev = minor_to_mdev(o_minor);
1303         while (1) {
1304                 if (odev == mdev)
1305                         return ERR_SYNC_AFTER_CYCLE;
1306
1307                 /* dependency chain ends here, no cycles. */
1308                 if (odev->sync_conf.after == -1)
1309                         return NO_ERROR;
1310
1311                 /* follow the dependency chain */
1312                 odev = minor_to_mdev(odev->sync_conf.after);
1313         }
1314 }
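/* Hypothetical example: if minor 2 is already configured to sync after
 * minor 1, then a request to make minor 1 sync after minor 2 walks the chain
 * 2 -> 1 == mdev and is rejected with ERR_SYNC_AFTER_CYCLE. */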
1315
1316 int drbd_alter_sa(struct drbd_conf *mdev, int na)
1317 {
1318         int changes;
1319         int retcode;
1320
1321         write_lock_irq(&global_state_lock);
1322         retcode = sync_after_error(mdev, na);
1323         if (retcode == NO_ERROR) {
1324                 mdev->sync_conf.after = na;
1325                 do {
1326                         changes  = _drbd_pause_after(mdev);
1327                         changes |= _drbd_resume_next(mdev);
1328                 } while (changes);
1329         }
1330         write_unlock_irq(&global_state_lock);
1331         return retcode;
1332 }
1333
1334 static void ping_peer(struct drbd_conf *mdev)
1335 {
1336         clear_bit(GOT_PING_ACK, &mdev->flags);
1337         request_ping(mdev);
1338         wait_event(mdev->misc_wait,
1339                    test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
1340 }
1341
1342 /**
1343  * drbd_start_resync() - Start the resync process
1344  * @mdev:       DRBD device.
1345  * @side:       Either C_SYNC_SOURCE or C_SYNC_TARGET
1346  *
1347  * This function might bring you directly into one of the
1348  * C_PAUSED_SYNC_* states.
1349  */
1350 void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1351 {
1352         union drbd_state ns;
1353         int r;
1354
1355         if (mdev->state.conn >= C_SYNC_SOURCE) {
1356                 dev_err(DEV, "Resync already running!\n");
1357                 return;
1358         }
1359
1360         /* In case a previous resync run was aborted by an IO error/detach on the peer. */
1361         drbd_rs_cancel_all(mdev);
1362
1363         if (side == C_SYNC_TARGET) {
1364                 /* Since application IO was locked out during C_WF_BITMAP_T and
1365                    C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1366                    we check whether we are allowed to make the data inconsistent. */
1367                 r = drbd_khelper(mdev, "before-resync-target");
1368                 r = (r >> 8) & 0xff;
1369                 if (r > 0) {
1370                         dev_info(DEV, "before-resync-target handler returned %d, "
1371                              "dropping connection.\n", r);
1372                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1373                         return;
1374                 }
1375         }
1376
1377         drbd_state_lock(mdev);
1378
1379         if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
1380                 drbd_state_unlock(mdev);
1381                 return;
1382         }
1383
1384         if (side == C_SYNC_TARGET) {
1385                 mdev->bm_resync_fo = 0;
1386         } else /* side == C_SYNC_SOURCE */ {
1387                 u64 uuid;
1388
1389                 get_random_bytes(&uuid, sizeof(u64));
1390                 drbd_uuid_set(mdev, UI_BITMAP, uuid);
1391                 drbd_send_sync_uuid(mdev, uuid);
1392
1393                 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
1394         }
1395
1396         write_lock_irq(&global_state_lock);
1397         ns = mdev->state;
1398
1399         ns.aftr_isp = !_drbd_may_sync_now(mdev);
1400
1401         ns.conn = side;
1402
1403         if (side == C_SYNC_TARGET)
1404                 ns.disk = D_INCONSISTENT;
1405         else /* side == C_SYNC_SOURCE */
1406                 ns.pdsk = D_INCONSISTENT;
1407
1408         r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1409         ns = mdev->state;
1410
1411         if (ns.conn < C_CONNECTED)
1412                 r = SS_UNKNOWN_ERROR;
1413
1414         if (r == SS_SUCCESS) {
1415                 mdev->rs_total     =
1416                 mdev->rs_mark_left = drbd_bm_total_weight(mdev);
1417                 mdev->rs_failed    = 0;
1418                 mdev->rs_paused    = 0;
1419                 mdev->rs_start     =
1420                 mdev->rs_mark_time = jiffies;
1421                 mdev->rs_same_csum = 0;
1422                 _drbd_pause_after(mdev);
1423         }
1424         write_unlock_irq(&global_state_lock);
1425         put_ldev(mdev);
1426
1427         if (r == SS_SUCCESS) {
1428                 dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1429                      drbd_conn_str(ns.conn),
1430                      (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
1431                      (unsigned long) mdev->rs_total);
1432
1433                 if (mdev->rs_total == 0) {
1434                         /* Peer still reachable? Beware of failing before-resync-target handlers! */
1435                         ping_peer(mdev);
1436                         drbd_resync_finished(mdev);
1437                 }
1438
1439                 /* ns.conn may already be != mdev->state.conn,
1440                  * we may have been paused in between, or become paused until
1441                  * the timer triggers.
1442                  * No matter, that is handled in resync_timer_fn() */
1443                 if (ns.conn == C_SYNC_TARGET)
1444                         mod_timer(&mdev->resync_timer, jiffies);
1445
1446                 drbd_md_sync(mdev);
1447         }
1448         drbd_state_unlock(mdev);
1449 }
1450
1451 int drbd_worker(struct drbd_thread *thi)
1452 {
1453         struct drbd_conf *mdev = thi->mdev;
1454         struct drbd_work *w = NULL;
1455         LIST_HEAD(work_list);
1456         int intr = 0, i;
1457
1458         sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
1459
1460         while (get_t_state(thi) == Running) {
1461                 drbd_thread_current_set_cpu(mdev);
1462
1463                 if (down_trylock(&mdev->data.work.s)) {
1464                         mutex_lock(&mdev->data.mutex);
1465                         if (mdev->data.socket && !mdev->net_conf->no_cork)
1466                                 drbd_tcp_uncork(mdev->data.socket);
1467                         mutex_unlock(&mdev->data.mutex);
1468
1469                         intr = down_interruptible(&mdev->data.work.s);
1470
1471                         mutex_lock(&mdev->data.mutex);
1472                         if (mdev->data.socket  && !mdev->net_conf->no_cork)
1473                                 drbd_tcp_cork(mdev->data.socket);
1474                         mutex_unlock(&mdev->data.mutex);
1475                 }
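                /* The trylock/cork dance above: when the work queue runs
                 * empty, uncork the data socket so buffered packets actually
                 * go out, sleep until new work is signalled, then cork again
                 * so the next batch of sends can be aggregated into fewer TCP
                 * segments (unless no_cork is configured). */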
1476
1477                 if (intr) {
1478                         D_ASSERT(intr == -EINTR);
1479                         flush_signals(current);
1480                         ERR_IF (get_t_state(thi) == Running)
1481                                 continue;
1482                         break;
1483                 }
1484
1485                 if (get_t_state(thi) != Running)
1486                         break;
1487                 /* With this break, we have done a down() but not consumed
1488                    the entry from the list. The cleanup code takes care of
1489                    this...   */
1490
1491                 w = NULL;
1492                 spin_lock_irq(&mdev->data.work.q_lock);
1493                 ERR_IF(list_empty(&mdev->data.work.q)) {
1494                         /* something terribly wrong in our logic.
1495                          * we were able to down() the semaphore,
1496                          * but the list is empty... doh.
1497                          *
1498                          * what is the best thing to do now?
1499                          * try again from scratch, restarting the receiver,
1500                          * asender, whatnot? could break even more ugly,
1501                          * e.g. when we are primary, but no good local data.
1502                          *
1503                          * I'll try to get away just starting over this loop.
1504                          */
1505                         spin_unlock_irq(&mdev->data.work.q_lock);
1506                         continue;
1507                 }
1508                 w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
1509                 list_del_init(&w->list);
1510                 spin_unlock_irq(&mdev->data.work.q_lock);
1511
1512                 if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
1513                         /* dev_warn(DEV, "worker: a callback failed! \n"); */
1514                         if (mdev->state.conn >= C_CONNECTED)
1515                                 drbd_force_state(mdev,
1516                                                 NS(conn, C_NETWORK_FAILURE));
1517                 }
1518         }
1519         D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
1520         D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
1521
1522         spin_lock_irq(&mdev->data.work.q_lock);
1523         i = 0;
1524         while (!list_empty(&mdev->data.work.q)) {
1525                 list_splice_init(&mdev->data.work.q, &work_list);
1526                 spin_unlock_irq(&mdev->data.work.q_lock);
1527
1528                 while (!list_empty(&work_list)) {
1529                         w = list_entry(work_list.next, struct drbd_work, list);
1530                         list_del_init(&w->list);
1531                         w->cb(mdev, w, 1);
1532                         i++; /* dead debugging code */
1533                 }
1534
1535                 spin_lock_irq(&mdev->data.work.q_lock);
1536         }
1537         sema_init(&mdev->data.work.s, 0);
1538         /* DANGEROUS race: if someone queued their work while holding the
1539          * spinlock, but called up() outside of it, we could get an up() on
1540          * the semaphore without a corresponding list entry.
1541          * So don't do that.
1542          */
1543         spin_unlock_irq(&mdev->data.work.q_lock);
1544
1545         D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
1546         /* _drbd_set_state only uses stop_nowait.
1547          * wait here for the Exiting receiver. */
1548         drbd_thread_stop(&mdev->receiver);
1549         drbd_mdev_cleanup(mdev);
1550
1551         dev_info(DEV, "worker terminated\n");
1552
1553         clear_bit(DEVICE_DYING, &mdev->flags);
1554         clear_bit(CONFIG_PENDING, &mdev->flags);
1555         wake_up(&mdev->state_wait);
1556
1557         return 0;
1558 }