Merge branch 'CVE-2014-7970' of git://git.kernel.org/pub/scm/linux/kernel/git/luto...
[sfrench/cifs-2.6.git] / drivers / block / rbd.c
1
2 /*
3    rbd.c -- Export ceph rados objects as a Linux block device
4
5
6    based on drivers/block/osdblk.c:
7
8    Copyright 2009 Red Hat, Inc.
9
10    This program is free software; you can redistribute it and/or modify
11    it under the terms of the GNU General Public License as published by
12    the Free Software Foundation.
13
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18
19    You should have received a copy of the GNU General Public License
20    along with this program; see the file COPYING.  If not, write to
21    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22
23
24
25    For usage instructions, please refer to:
26
27                  Documentation/ABI/testing/sysfs-bus-rbd
28
29  */
30
31 #include <linux/ceph/libceph.h>
32 #include <linux/ceph/osd_client.h>
33 #include <linux/ceph/mon_client.h>
34 #include <linux/ceph/decode.h>
35 #include <linux/parser.h>
36 #include <linux/bsearch.h>
37
38 #include <linux/kernel.h>
39 #include <linux/device.h>
40 #include <linux/module.h>
41 #include <linux/fs.h>
42 #include <linux/blkdev.h>
43 #include <linux/slab.h>
44 #include <linux/idr.h>
45 #include <linux/workqueue.h>
46
47 #include "rbd_types.h"
48
49 #define RBD_DEBUG       /* Activate rbd_assert() calls */
50
51 /*
52  * The basic unit of block I/O is a sector.  It is interpreted in a
53  * number of contexts in Linux (blk, bio, genhd), but the default is
54  * universally 512 bytes.  These symbols are just slightly more
55  * meaningful than the bare numbers they represent.
56  */
57 #define SECTOR_SHIFT    9
58 #define SECTOR_SIZE     (1ULL << SECTOR_SHIFT)
59
60 /*
61  * Increment the given counter and return its updated value.
62  * If the counter is already 0 it will not be incremented.
63  * If the counter is already at its maximum value returns
64  * -EINVAL without updating it.
65  */
66 static int atomic_inc_return_safe(atomic_t *v)
67 {
68         unsigned int counter;
69
70         counter = (unsigned int)__atomic_add_unless(v, 1, 0);
71         if (counter <= (unsigned int)INT_MAX)
72                 return (int)counter;
73
74         atomic_dec(v);
75
76         return -EINVAL;
77 }
78
79 /* Decrement the counter.  Return the resulting value, or -EINVAL */
80 static int atomic_dec_return_safe(atomic_t *v)
81 {
82         int counter;
83
84         counter = atomic_dec_return(v);
85         if (counter >= 0)
86                 return counter;
87
88         atomic_inc(v);
89
90         return -EINVAL;
91 }
92
93 #define RBD_DRV_NAME "rbd"
94
95 #define RBD_MINORS_PER_MAJOR            256
96 #define RBD_SINGLE_MAJOR_PART_SHIFT     4
97
98 #define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
99 #define RBD_MAX_SNAP_NAME_LEN   \
100                         (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
101
102 #define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */
103
104 #define RBD_SNAP_HEAD_NAME      "-"
105
106 #define BAD_SNAP_INDEX  U32_MAX         /* invalid index into snap array */
107
108 /* This allows a single page to hold an image name sent by OSD */
109 #define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof (__le32) - 1)
110 #define RBD_IMAGE_ID_LEN_MAX    64
111
112 #define RBD_OBJ_PREFIX_LEN_MAX  64
113
114 /* Feature bits */
115
116 #define RBD_FEATURE_LAYERING    (1<<0)
117 #define RBD_FEATURE_STRIPINGV2  (1<<1)
118 #define RBD_FEATURES_ALL \
119             (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
120
121 /* Features supported by this (client software) implementation. */
122
123 #define RBD_FEATURES_SUPPORTED  (RBD_FEATURES_ALL)
124
125 /*
126  * An RBD device name will be "rbd#", where the "rbd" comes from
127  * RBD_DRV_NAME above, and # is a unique integer identifier.
128  * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
129  * enough to hold all possible device names.
130  */
131 #define DEV_NAME_LEN            32
132 #define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)
133
134 /*
135  * block device image metadata (in-memory version)
136  */
137 struct rbd_image_header {
138         /* These seven fields never change for a given rbd image */
139         char *object_prefix;    /* NULL until the header is first filled in */
140         __u8 obj_order;
141         __u8 crypt_type;
142         __u8 comp_type;
143         u64 stripe_unit;
144         u64 stripe_count;
145         u64 features;           /* Might be changeable someday? */
146
147         /* The remaining fields need to be updated occasionally */
148         u64 image_size;
149         struct ceph_snap_context *snapc;
150         char *snap_names;       /* format 1 only */
151         u64 *snap_sizes;        /* format 1 only */
152 };
153
154 /*
155  * An rbd image specification.
156  *
157  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
158  * identify an image.  Each rbd_dev structure includes a pointer to
159  * an rbd_spec structure that encapsulates this identity.
160  *
161  * Each of the id's in an rbd_spec has an associated name.  For a
162  * user-mapped image, the names are supplied and the id's associated
163  * with them are looked up.  For a layered image, a parent image is
164  * defined by the tuple, and the names are looked up.
165  *
166  * An rbd_dev structure contains a parent_spec pointer which is
167  * non-null if the image it represents is a child in a layered
168  * image.  This pointer will refer to the rbd_spec structure used
169  * by the parent rbd_dev for its own identity (i.e., the structure
170  * is shared between the parent and child).
171  *
172  * Since these structures are populated once, during the discovery
173  * phase of image construction, they are effectively immutable so
174  * we make no effort to synchronize access to them.
175  *
176  * Note that code herein does not assume the image name is known (it
177  * could be a null pointer).
178  */
179 struct rbd_spec {
180         u64             pool_id;
181         const char      *pool_name;
182
183         const char      *image_id;
184         const char      *image_name;
185
186         u64             snap_id;
187         const char      *snap_name;
188
189         struct kref     kref;
190 };
191
192 /*
193  * an instance of the client.  multiple devices may share an rbd client.
194  */
195 struct rbd_client {
196         struct ceph_client      *client;        /* from ceph_create_client() */
197         struct kref             kref;           /* dropped via rbd_put_client() */
198         struct list_head        node;           /* entry on rbd_client_list */
199 };
200
201 struct rbd_img_request;
202 typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
203
204 #define BAD_WHICH       U32_MAX         /* Good which or bad which, which? */
205
206 struct rbd_obj_request;
207 typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
208
209 enum obj_request_type {
210         OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
211 };
212
213 enum obj_req_flags {
214         OBJ_REQ_DONE,           /* completion flag: not done = 0, done = 1 */
215         OBJ_REQ_IMG_DATA,       /* object usage: standalone = 0, image = 1 */
216         OBJ_REQ_KNOWN,          /* EXISTS flag valid: no = 0, yes = 1 */
217         OBJ_REQ_EXISTS,         /* target exists: no = 0, yes = 1 */
218 };
219
220 struct rbd_obj_request {
221         const char              *object_name;
222         u64                     offset;         /* object start byte */
223         u64                     length;         /* bytes from offset */
224         unsigned long           flags;
225
226         /*
227          * An object request associated with an image will have its
228          * img_data flag set; a standalone object request will not.
229          *
230          * A standalone object request will have which == BAD_WHICH
231          * and a null obj_request pointer.
232          *
233          * An object request initiated in support of a layered image
234          * object (to check for its existence before a write) will
235          * have which == BAD_WHICH and a non-null obj_request pointer.
236          *
237          * Finally, an object request for rbd image data will have
238          * which != BAD_WHICH, and will have a non-null img_request
239          * pointer.  The value of which will be in the range
240          * 0..(img_request->obj_request_count-1).
241          */
242         union {
243                 struct rbd_obj_request  *obj_request;   /* STAT op */
244                 struct {
245                         struct rbd_img_request  *img_request;
246                         u64                     img_offset;
247                         /* links for img_request->obj_requests list */
248                         struct list_head        links;
249                 };
250         };
251         u32                     which;          /* posn image request list */
252
253         enum obj_request_type   type;
254         union {
255                 struct bio      *bio_list;
256                 struct {
257                         struct page     **pages;
258                         u32             page_count;
259                 };
260         };
261         struct page             **copyup_pages;
262         u32                     copyup_page_count;
263
264         struct ceph_osd_request *osd_req;
265
266         u64                     xferred;        /* bytes transferred */
267         int                     result;
268
269         rbd_obj_callback_t      callback;
270         struct completion       completion;
271
272         struct kref             kref;
273 };
274
275 enum img_req_flags {
276         IMG_REQ_WRITE,          /* I/O direction: read = 0, write = 1 */
277         IMG_REQ_CHILD,          /* initiator: block = 0, child image = 1 */
278         IMG_REQ_LAYERED,        /* ENOENT handling: normal = 0, layered = 1 */
279 };
280
281 struct rbd_img_request {
282         struct rbd_device       *rbd_dev;
283         u64                     offset; /* starting image byte offset */
284         u64                     length; /* byte count from offset */
285         unsigned long           flags;
286         union {
287                 u64                     snap_id;        /* for reads */
288                 struct ceph_snap_context *snapc;        /* for writes */
289         };
290         union {
291                 struct request          *rq;            /* block request */
292                 struct rbd_obj_request  *obj_request;   /* obj req initiator */
293         };
294         struct page             **copyup_pages;
295         u32                     copyup_page_count;
296         spinlock_t              completion_lock;/* protects next_completion */
297         u32                     next_completion;
298         rbd_img_callback_t      callback;
299         u64                     xferred;/* aggregate bytes transferred */
300         int                     result; /* first nonzero obj_request result */
301
302         u32                     obj_request_count;
303         struct list_head        obj_requests;   /* rbd_obj_request structs */
304
305         struct kref             kref;
306 };
307
/*
 * Iterators over the rbd_obj_request structures linked (through their
 * "links" member) on an image request's obj_requests list.
 *
 * for_each_obj_request()       walks the whole list from the head;
 * for_each_obj_request_from()  continues from the current value of oreq;
 * for_each_obj_request_safe()  uses a "safe" list iterator with n as the
 *                              lookahead cursor -- note that it walks
 *                              the list in REVERSE order.
 */
308 #define for_each_obj_request(ireq, oreq) \
309         list_for_each_entry(oreq, &(ireq)->obj_requests, links)
310 #define for_each_obj_request_from(ireq, oreq) \
311         list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
312 #define for_each_obj_request_safe(ireq, oreq, n) \
313         list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
314
/* Per-mapping state of an rbd device (embedded in struct rbd_device). */
315 struct rbd_mapping {
316         u64                     size;
317         u64                     features;
318         bool                    read_only;      /* checked in rbd_open(), set by BLKROSET */
319 };
320
321 /*
322  * a single device
323  */
324 struct rbd_device {
325         int                     dev_id;         /* blkdev unique id */
326
327         int                     major;          /* blkdev assigned major */
328         int                     minor;
329         struct gendisk          *disk;          /* blkdev's gendisk and rq */
330
331         u32                     image_format;   /* Either 1 or 2 */
332         struct rbd_client       *rbd_client;
333
334         char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
335
336         struct list_head        rq_queue;       /* incoming rq queue */
337         spinlock_t              lock;           /* queue, flags, open_count */
338         struct workqueue_struct *rq_wq;
339         struct work_struct      rq_work;
340
341         struct rbd_image_header header;
342         unsigned long           flags;          /* possibly lock protected */
343         struct rbd_spec         *spec;
344
345         char                    *header_name;
346
347         struct ceph_file_layout layout;
348
349         struct ceph_osd_event   *watch_event;
350         struct rbd_obj_request  *watch_request;
351
352         struct rbd_spec         *parent_spec;
353         u64                     parent_overlap;
354         atomic_t                parent_ref;
355         struct rbd_device       *parent;
356
357         /* protects updating the header */
358         struct rw_semaphore     header_rwsem;
359
360         struct rbd_mapping      mapping;
361
362         struct list_head        node;
363
364         /* sysfs related */
365         struct device           dev;
366         unsigned long           open_count;     /* protected by lock */
367 };
368
369 /*
370  * Flag bits for rbd_dev->flags.  If atomicity is required,
371  * rbd_dev->lock is used to protect access.
372  *
373  * Currently, only the "removing" flag (which is coupled with the
374  * "open_count" field) requires atomic access.
375  */
376 enum rbd_dev_flags {
377         RBD_DEV_FLAG_EXISTS,    /* mapped snapshot has not been deleted */
378         RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
379 };
380
381 static DEFINE_MUTEX(client_mutex);      /* Serialize client creation */
382
383 static LIST_HEAD(rbd_dev_list);    /* devices */
384 static DEFINE_SPINLOCK(rbd_dev_list_lock);
385
386 static LIST_HEAD(rbd_client_list);              /* clients */
387 static DEFINE_SPINLOCK(rbd_client_list_lock);
388
389 /* Slab caches for frequently-allocated structures */
390
391 static struct kmem_cache        *rbd_img_request_cache;
392 static struct kmem_cache        *rbd_obj_request_cache;
393 static struct kmem_cache        *rbd_segment_name_cache;
394
395 static int rbd_major;
396 static DEFINE_IDA(rbd_dev_id_ida);
397
398 /*
399  * Default to false for now, as single-major requires >= 0.75 version of
400  * userspace rbd utility.
401  */
402 static bool single_major = false;
403 module_param(single_major, bool, S_IRUGO);
404 MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
405
406 static int rbd_img_request_submit(struct rbd_img_request *img_request);
407
408 static void rbd_dev_device_release(struct device *dev);
409
410 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
411                        size_t count);
412 static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
413                           size_t count);
414 static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
415                                     size_t count);
416 static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
417                                        size_t count);
418 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
419 static void rbd_spec_put(struct rbd_spec *spec);
420
421 static int rbd_dev_id_to_minor(int dev_id)
422 {
423         return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
424 }
425
426 static int minor_to_rbd_dev_id(int minor)
427 {
428         return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
429 }
430
431 static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
432 static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
433 static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
434 static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
435
436 static struct attribute *rbd_bus_attrs[] = {
437         &bus_attr_add.attr,
438         &bus_attr_remove.attr,
439         &bus_attr_add_single_major.attr,
440         &bus_attr_remove_single_major.attr,
441         NULL,
442 };
443
444 static umode_t rbd_bus_is_visible(struct kobject *kobj,
445                                   struct attribute *attr, int index)
446 {
447         if (!single_major &&
448             (attr == &bus_attr_add_single_major.attr ||
449              attr == &bus_attr_remove_single_major.attr))
450                 return 0;
451
452         return attr->mode;
453 }
454
455 static const struct attribute_group rbd_bus_group = {
456         .attrs = rbd_bus_attrs,
457         .is_visible = rbd_bus_is_visible,
458 };
459 __ATTRIBUTE_GROUPS(rbd_bus);
460
461 static struct bus_type rbd_bus_type = {
462         .name           = "rbd",
463         .bus_groups     = rbd_bus_groups,
464 };
465
/*
 * Release callback installed on the statically defined rbd_root_dev.
 * It does nothing.
 */
static void rbd_root_dev_release(struct device *dev)
{
	/* no-op */
}
469
470 static struct device rbd_root_dev = {
471         .init_name =    "rbd",
472         .release =      rbd_root_dev_release,
473 };
474
475 static __printf(2, 3)
476 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
477 {
478         struct va_format vaf;
479         va_list args;
480
481         va_start(args, fmt);
482         vaf.fmt = fmt;
483         vaf.va = &args;
484
485         if (!rbd_dev)
486                 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
487         else if (rbd_dev->disk)
488                 printk(KERN_WARNING "%s: %s: %pV\n",
489                         RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
490         else if (rbd_dev->spec && rbd_dev->spec->image_name)
491                 printk(KERN_WARNING "%s: image %s: %pV\n",
492                         RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
493         else if (rbd_dev->spec && rbd_dev->spec->image_id)
494                 printk(KERN_WARNING "%s: id %s: %pV\n",
495                         RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
496         else    /* punt */
497                 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
498                         RBD_DRV_NAME, rbd_dev, &vaf);
499         va_end(args);
500 }
501
/*
 * rbd_assert(expr): with RBD_DEBUG defined, log the failed expression
 * together with the function name and line number, then BUG().
 * Without RBD_DEBUG the macro expands to a no-op and expr is not
 * evaluated.
 *
 * The debug variant is wrapped in do { } while (0) so that it behaves
 * as a single statement: "if (x) rbd_assert(y); else ..." parses as
 * written instead of tripping over a bare if-block.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)      ((void) 0)
#endif /* !RBD_DEBUG */
514
515 static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
516 static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
517 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
518
519 static int rbd_dev_refresh(struct rbd_device *rbd_dev);
520 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
521 static int rbd_dev_header_info(struct rbd_device *rbd_dev);
522 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
523 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
524                                         u64 snap_id);
525 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
526                                 u8 *order, u64 *snap_size);
527 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
528                 u64 *snap_features);
529 static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
530
531 static int rbd_open(struct block_device *bdev, fmode_t mode)
532 {
533         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
534         bool removing = false;
535
536         if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
537                 return -EROFS;
538
539         spin_lock_irq(&rbd_dev->lock);
540         if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
541                 removing = true;
542         else
543                 rbd_dev->open_count++;
544         spin_unlock_irq(&rbd_dev->lock);
545         if (removing)
546                 return -ENOENT;
547
548         (void) get_device(&rbd_dev->dev);
549
550         return 0;
551 }
552
553 static void rbd_release(struct gendisk *disk, fmode_t mode)
554 {
555         struct rbd_device *rbd_dev = disk->private_data;
556         unsigned long open_count_before;
557
558         spin_lock_irq(&rbd_dev->lock);
559         open_count_before = rbd_dev->open_count--;
560         spin_unlock_irq(&rbd_dev->lock);
561         rbd_assert(open_count_before > 0);
562
563         put_device(&rbd_dev->dev);
564 }
565
566 static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
567 {
568         int ret = 0;
569         int val;
570         bool ro;
571         bool ro_changed = false;
572
573         /* get_user() may sleep, so call it before taking rbd_dev->lock */
574         if (get_user(val, (int __user *)(arg)))
575                 return -EFAULT;
576
577         ro = val ? true : false;
578         /* Snapshot doesn't allow to write*/
579         if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
580                 return -EROFS;
581
582         spin_lock_irq(&rbd_dev->lock);
583         /* prevent others open this device */
584         if (rbd_dev->open_count > 1) {
585                 ret = -EBUSY;
586                 goto out;
587         }
588
589         if (rbd_dev->mapping.read_only != ro) {
590                 rbd_dev->mapping.read_only = ro;
591                 ro_changed = true;
592         }
593
594 out:
595         spin_unlock_irq(&rbd_dev->lock);
596         /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
597         if (ret == 0 && ro_changed)
598                 set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
599
600         return ret;
601 }
602
603 static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
604                         unsigned int cmd, unsigned long arg)
605 {
606         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
607         int ret = 0;
608
609         switch (cmd) {
610         case BLKROSET:
611                 ret = rbd_ioctl_set_ro(rbd_dev, arg);
612                 break;
613         default:
614                 ret = -ENOTTY;
615         }
616
617         return ret;
618 }
619
#ifdef CONFIG_COMPAT
/* Compat ioctl entry point: forwards all arguments unchanged to rbd_ioctl(). */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
627
628 static const struct block_device_operations rbd_bd_ops = {
629         .owner                  = THIS_MODULE,
630         .open                   = rbd_open,
631         .release                = rbd_release,
632         .ioctl                  = rbd_ioctl,
633 #ifdef CONFIG_COMPAT
634         .compat_ioctl           = rbd_compat_ioctl,
635 #endif
636 };
637
638 /*
639  * Initialize an rbd client instance.  Success or not, this function
640  * consumes ceph_opts.  Caller holds client_mutex.
641  */
642 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
643 {
644         struct rbd_client *rbdc;
645         int ret = -ENOMEM;
646
647         dout("%s:\n", __func__);
648         rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
649         if (!rbdc)
650                 goto out_opt;
651
652         kref_init(&rbdc->kref);
653         INIT_LIST_HEAD(&rbdc->node);
654
655         rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
656         if (IS_ERR(rbdc->client))
657                 goto out_rbdc;
658         ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
659
660         ret = ceph_open_session(rbdc->client);
661         if (ret < 0)
662                 goto out_client;
663
664         spin_lock(&rbd_client_list_lock);
665         list_add_tail(&rbdc->node, &rbd_client_list);
666         spin_unlock(&rbd_client_list_lock);
667
668         dout("%s: rbdc %p\n", __func__, rbdc);
669
670         return rbdc;
671 out_client:
672         ceph_destroy_client(rbdc->client);
673 out_rbdc:
674         kfree(rbdc);
675 out_opt:
676         if (ceph_opts)
677                 ceph_destroy_options(ceph_opts);
678         dout("%s: error %d\n", __func__, ret);
679
680         return ERR_PTR(ret);
681 }
682
683 static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
684 {
685         kref_get(&rbdc->kref);
686
687         return rbdc;
688 }
689
690 /*
691  * Find a ceph client with specific addr and configuration.  If
692  * found, bump its reference count.
693  */
694 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
695 {
696         struct rbd_client *client_node;
697         bool found = false;
698
699         if (ceph_opts->flags & CEPH_OPT_NOSHARE)
700                 return NULL;
701
702         spin_lock(&rbd_client_list_lock);
703         list_for_each_entry(client_node, &rbd_client_list, node) {
704                 if (!ceph_compare_options(ceph_opts, client_node->client)) {
705                         __rbd_get_client(client_node);
706
707                         found = true;
708                         break;
709                 }
710         }
711         spin_unlock(&rbd_client_list_lock);
712
713         return found ? client_node : NULL;
714 }
715
716 /*
717  * mount options
     *
     * The Opt_last_* enumerators are range markers rather than real
     * options: parse_rbd_opts_token() classifies a matched token as an
     * int, string or Boolean option by comparing it against them.
     * Only Boolean options are currently defined.
718  */
719 enum {
720         Opt_last_int,
721         /* int args above */
722         Opt_last_string,
723         /* string args above */
724         Opt_read_only,
725         Opt_read_write,
726         /* Boolean args above */
727         Opt_last_bool,
728 };
729
    /* Option spellings accepted by match_token() in parse_rbd_opts_token() */
730 static match_table_t rbd_opts_tokens = {
731         /* int args above */
732         /* string args above */
733         {Opt_read_only, "read_only"},
734         {Opt_read_only, "ro"},          /* Alternate spelling */
735         {Opt_read_write, "read_write"},
736         {Opt_read_write, "rw"},         /* Alternate spelling */
737         /* Boolean args above */
738         {-1, NULL}
739 };
740
    /* Parsed rbd options; filled in by parse_rbd_opts_token() */
741 struct rbd_options {
742         bool    read_only;      /* set by "read_only"/"ro", cleared by "read_write"/"rw" */
743 };
744
745 #define RBD_READ_ONLY_DEFAULT   false
746
747 static int parse_rbd_opts_token(char *c, void *private)
748 {
749         struct rbd_options *rbd_opts = private;
750         substring_t argstr[MAX_OPT_ARGS];
751         int token, intval, ret;
752
753         token = match_token(c, rbd_opts_tokens, argstr);
754         if (token < 0)
755                 return -EINVAL;
756
757         if (token < Opt_last_int) {
758                 ret = match_int(&argstr[0], &intval);
759                 if (ret < 0) {
760                         pr_err("bad mount option arg (not int) "
761                                "at '%s'\n", c);
762                         return ret;
763                 }
764                 dout("got int token %d val %d\n", token, intval);
765         } else if (token > Opt_last_int && token < Opt_last_string) {
766                 dout("got string token %d val %s\n", token,
767                      argstr[0].from);
768         } else if (token > Opt_last_string && token < Opt_last_bool) {
769                 dout("got Boolean token %d\n", token);
770         } else {
771                 dout("got token %d\n", token);
772         }
773
774         switch (token) {
775         case Opt_read_only:
776                 rbd_opts->read_only = true;
777                 break;
778         case Opt_read_write:
779                 rbd_opts->read_only = false;
780                 break;
781         default:
782                 rbd_assert(false);
783                 break;
784         }
785         return 0;
786 }
787
788 /*
789  * Get a ceph client with specific addr and configuration, if one does
790  * not exist create it.  Either way, ceph_opts is consumed by this
791  * function.
792  */
793 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
794 {
795         struct rbd_client *rbdc;
796
797         mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
798         rbdc = rbd_client_find(ceph_opts);
799         if (rbdc)       /* using an existing client */
800                 ceph_destroy_options(ceph_opts);
801         else
802                 rbdc = rbd_client_create(ceph_opts);
803         mutex_unlock(&client_mutex);
804
805         return rbdc;
806 }
807
808 /*
809  * Destroy ceph client
810  *
811  * Caller must hold rbd_client_list_lock.
812  */
813 static void rbd_client_release(struct kref *kref)
814 {
815         struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
816
817         dout("%s: rbdc %p\n", __func__, rbdc);
818         spin_lock(&rbd_client_list_lock);
819         list_del(&rbdc->node);
820         spin_unlock(&rbd_client_list_lock);
821
822         ceph_destroy_client(rbdc->client);
823         kfree(rbdc);
824 }
825
826 /*
827  * Drop reference to ceph client node. If it's not referenced anymore, release
828  * it.
829  */
830 static void rbd_put_client(struct rbd_client *rbdc)
831 {
832         if (rbdc)
833                 kref_put(&rbdc->kref, rbd_client_release);
834 }
835
836 static bool rbd_image_format_valid(u32 image_format)
837 {
838         return image_format == 1 || image_format == 2;
839 }
840
841 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
842 {
843         size_t size;
844         u32 snap_count;
845
846         /* The header has to start with the magic rbd header text */
847         if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
848                 return false;
849
850         /* The bio layer requires at least sector-sized I/O */
851
852         if (ondisk->options.order < SECTOR_SHIFT)
853                 return false;
854
855         /* If we use u64 in a few spots we may be able to loosen this */
856
857         if (ondisk->options.order > 8 * sizeof (int) - 1)
858                 return false;
859
860         /*
861          * The size of a snapshot header has to fit in a size_t, and
862          * that limits the number of snapshots.
863          */
864         snap_count = le32_to_cpu(ondisk->snap_count);
865         size = SIZE_MAX - sizeof (struct ceph_snap_context);
866         if (snap_count > size / sizeof (__le64))
867                 return false;
868
869         /*
870          * Not only that, but the size of the entire the snapshot
871          * header must also be representable in a size_t.
872          */
873         size -= snap_count * sizeof (__le64);
874         if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
875                 return false;
876
877         return true;
878 }
879
880 /*
881  * Fill an rbd image header with information from the given format 1
882  * on-disk header.
883  */
884 static int rbd_header_from_disk(struct rbd_device *rbd_dev,
885                                  struct rbd_image_header_ondisk *ondisk)
886 {
887         struct rbd_image_header *header = &rbd_dev->header;
888         bool first_time = header->object_prefix == NULL;
889         struct ceph_snap_context *snapc;
890         char *object_prefix = NULL;
891         char *snap_names = NULL;
892         u64 *snap_sizes = NULL;
893         u32 snap_count;
894         size_t size;
895         int ret = -ENOMEM;
896         u32 i;
897
898         /* Allocate this now to avoid having to handle failure below */
899
900         if (first_time) {
901                 size_t len;
902
903                 len = strnlen(ondisk->object_prefix,
904                                 sizeof (ondisk->object_prefix));
905                 object_prefix = kmalloc(len + 1, GFP_KERNEL);
906                 if (!object_prefix)
907                         return -ENOMEM;
908                 memcpy(object_prefix, ondisk->object_prefix, len);
909                 object_prefix[len] = '\0';
910         }
911
912         /* Allocate the snapshot context and fill it in */
913
914         snap_count = le32_to_cpu(ondisk->snap_count);
915         snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
916         if (!snapc)
917                 goto out_err;
918         snapc->seq = le64_to_cpu(ondisk->snap_seq);
919         if (snap_count) {
920                 struct rbd_image_snap_ondisk *snaps;
921                 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
922
923                 /* We'll keep a copy of the snapshot names... */
924
925                 if (snap_names_len > (u64)SIZE_MAX)
926                         goto out_2big;
927                 snap_names = kmalloc(snap_names_len, GFP_KERNEL);
928                 if (!snap_names)
929                         goto out_err;
930
931                 /* ...as well as the array of their sizes. */
932
933                 size = snap_count * sizeof (*header->snap_sizes);
934                 snap_sizes = kmalloc(size, GFP_KERNEL);
935                 if (!snap_sizes)
936                         goto out_err;
937
938                 /*
939                  * Copy the names, and fill in each snapshot's id
940                  * and size.
941                  *
942                  * Note that rbd_dev_v1_header_info() guarantees the
943                  * ondisk buffer we're working with has
944                  * snap_names_len bytes beyond the end of the
945                  * snapshot id array, this memcpy() is safe.
946                  */
947                 memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
948                 snaps = ondisk->snaps;
949                 for (i = 0; i < snap_count; i++) {
950                         snapc->snaps[i] = le64_to_cpu(snaps[i].id);
951                         snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
952                 }
953         }
954
955         /* We won't fail any more, fill in the header */
956
957         if (first_time) {
958                 header->object_prefix = object_prefix;
959                 header->obj_order = ondisk->options.order;
960                 header->crypt_type = ondisk->options.crypt_type;
961                 header->comp_type = ondisk->options.comp_type;
962                 /* The rest aren't used for format 1 images */
963                 header->stripe_unit = 0;
964                 header->stripe_count = 0;
965                 header->features = 0;
966         } else {
967                 ceph_put_snap_context(header->snapc);
968                 kfree(header->snap_names);
969                 kfree(header->snap_sizes);
970         }
971
972         /* The remaining fields always get updated (when we refresh) */
973
974         header->image_size = le64_to_cpu(ondisk->image_size);
975         header->snapc = snapc;
976         header->snap_names = snap_names;
977         header->snap_sizes = snap_sizes;
978
979         return 0;
980 out_2big:
981         ret = -EIO;
982 out_err:
983         kfree(snap_sizes);
984         kfree(snap_names);
985         ceph_put_snap_context(snapc);
986         kfree(object_prefix);
987
988         return ret;
989 }
990
991 static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
992 {
993         const char *snap_name;
994
995         rbd_assert(which < rbd_dev->header.snapc->num_snaps);
996
997         /* Skip over names until we find the one we are looking for */
998
999         snap_name = rbd_dev->header.snap_names;
1000         while (which--)
1001                 snap_name += strlen(snap_name) + 1;
1002
1003         return kstrdup(snap_name, GFP_KERNEL);
1004 }
1005
1006 /*
1007  * Snapshot id comparison function for use with qsort()/bsearch().
1008  * Note that result is for snapshots in *descending* order.
1009  */
1010 static int snapid_compare_reverse(const void *s1, const void *s2)
1011 {
1012         u64 snap_id1 = *(u64 *)s1;
1013         u64 snap_id2 = *(u64 *)s2;
1014
1015         if (snap_id1 < snap_id2)
1016                 return 1;
1017         return snap_id1 == snap_id2 ? 0 : -1;
1018 }
1019
1020 /*
1021  * Search a snapshot context to see if the given snapshot id is
1022  * present.
1023  *
1024  * Returns the position of the snapshot id in the array if it's found,
1025  * or BAD_SNAP_INDEX otherwise.
1026  *
1027  * Note: The snapshot array is in kept sorted (by the osd) in
1028  * reverse order, highest snapshot id first.
1029  */
1030 static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1031 {
1032         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
1033         u64 *found;
1034
1035         found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1036                                 sizeof (snap_id), snapid_compare_reverse);
1037
1038         return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
1039 }
1040
1041 static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1042                                         u64 snap_id)
1043 {
1044         u32 which;
1045         const char *snap_name;
1046
1047         which = rbd_dev_snap_index(rbd_dev, snap_id);
1048         if (which == BAD_SNAP_INDEX)
1049                 return ERR_PTR(-ENOENT);
1050
1051         snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1052         return snap_name ? snap_name : ERR_PTR(-ENOMEM);
1053 }
1054
1055 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1056 {
1057         if (snap_id == CEPH_NOSNAP)
1058                 return RBD_SNAP_HEAD_NAME;
1059
1060         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1061         if (rbd_dev->image_format == 1)
1062                 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
1063
1064         return rbd_dev_v2_snap_name(rbd_dev, snap_id);
1065 }
1066
1067 static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1068                                 u64 *snap_size)
1069 {
1070         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1071         if (snap_id == CEPH_NOSNAP) {
1072                 *snap_size = rbd_dev->header.image_size;
1073         } else if (rbd_dev->image_format == 1) {
1074                 u32 which;
1075
1076                 which = rbd_dev_snap_index(rbd_dev, snap_id);
1077                 if (which == BAD_SNAP_INDEX)
1078                         return -ENOENT;
1079
1080                 *snap_size = rbd_dev->header.snap_sizes[which];
1081         } else {
1082                 u64 size = 0;
1083                 int ret;
1084
1085                 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1086                 if (ret)
1087                         return ret;
1088
1089                 *snap_size = size;
1090         }
1091         return 0;
1092 }
1093
1094 static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1095                         u64 *snap_features)
1096 {
1097         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1098         if (snap_id == CEPH_NOSNAP) {
1099                 *snap_features = rbd_dev->header.features;
1100         } else if (rbd_dev->image_format == 1) {
1101                 *snap_features = 0;     /* No features for format 1 */
1102         } else {
1103                 u64 features = 0;
1104                 int ret;
1105
1106                 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1107                 if (ret)
1108                         return ret;
1109
1110                 *snap_features = features;
1111         }
1112         return 0;
1113 }
1114
1115 static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1116 {
1117         u64 snap_id = rbd_dev->spec->snap_id;
1118         u64 size = 0;
1119         u64 features = 0;
1120         int ret;
1121
1122         ret = rbd_snap_size(rbd_dev, snap_id, &size);
1123         if (ret)
1124                 return ret;
1125         ret = rbd_snap_features(rbd_dev, snap_id, &features);
1126         if (ret)
1127                 return ret;
1128
1129         rbd_dev->mapping.size = size;
1130         rbd_dev->mapping.features = features;
1131
1132         return 0;
1133 }
1134
1135 static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1136 {
1137         rbd_dev->mapping.size = 0;
1138         rbd_dev->mapping.features = 0;
1139 }
1140
1141 static void rbd_segment_name_free(const char *name)
1142 {
1143         /* The explicit cast here is needed to drop the const qualifier */
1144
1145         kmem_cache_free(rbd_segment_name_cache, (void *)name);
1146 }
1147
1148 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1149 {
1150         char *name;
1151         u64 segment;
1152         int ret;
1153         char *name_format;
1154
1155         name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
1156         if (!name)
1157                 return NULL;
1158         segment = offset >> rbd_dev->header.obj_order;
1159         name_format = "%s.%012llx";
1160         if (rbd_dev->image_format == 2)
1161                 name_format = "%s.%016llx";
1162         ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
1163                         rbd_dev->header.object_prefix, segment);
1164         if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
1165                 pr_err("error formatting segment name for #%llu (%d)\n",
1166                         segment, ret);
1167                 rbd_segment_name_free(name);
1168                 name = NULL;
1169         }
1170
1171         return name;
1172 }
1173
1174 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1175 {
1176         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1177
1178         return offset & (segment_size - 1);
1179 }
1180
1181 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1182                                 u64 offset, u64 length)
1183 {
1184         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1185
1186         offset &= segment_size - 1;
1187
1188         rbd_assert(length <= U64_MAX - offset);
1189         if (offset + length > segment_size)
1190                 length = segment_size - offset;
1191
1192         return length;
1193 }
1194
1195 /*
1196  * returns the size of an object in the image
1197  */
1198 static u64 rbd_obj_bytes(struct rbd_image_header *header)
1199 {
1200         return 1 << header->obj_order;
1201 }
1202
1203 /*
1204  * bio helpers
1205  */
1206
1207 static void bio_chain_put(struct bio *chain)
1208 {
1209         struct bio *tmp;
1210
1211         while (chain) {
1212                 tmp = chain;
1213                 chain = chain->bi_next;
1214                 bio_put(tmp);
1215         }
1216 }
1217
1218 /*
1219  * zeros a bio chain, starting at specific offset
1220  */
1221 static void zero_bio_chain(struct bio *chain, int start_ofs)
1222 {
1223         struct bio_vec bv;
1224         struct bvec_iter iter;
1225         unsigned long flags;
1226         void *buf;
1227         int pos = 0;
1228
1229         while (chain) {
1230                 bio_for_each_segment(bv, chain, iter) {
1231                         if (pos + bv.bv_len > start_ofs) {
1232                                 int remainder = max(start_ofs - pos, 0);
1233                                 buf = bvec_kmap_irq(&bv, &flags);
1234                                 memset(buf + remainder, 0,
1235                                        bv.bv_len - remainder);
1236                                 flush_dcache_page(bv.bv_page);
1237                                 bvec_kunmap_irq(buf, &flags);
1238                         }
1239                         pos += bv.bv_len;
1240                 }
1241
1242                 chain = chain->bi_next;
1243         }
1244 }
1245
1246 /*
1247  * similar to zero_bio_chain(), zeros data defined by a page array,
1248  * starting at the given byte offset from the start of the array and
1249  * continuing up to the given end offset.  The pages array is
1250  * assumed to be big enough to hold all bytes up to the end.
1251  */
1252 static void zero_pages(struct page **pages, u64 offset, u64 end)
1253 {
1254         struct page **page = &pages[offset >> PAGE_SHIFT];
1255
1256         rbd_assert(end > offset);
1257         rbd_assert(end - offset <= (u64)SIZE_MAX);
1258         while (offset < end) {
1259                 size_t page_offset;
1260                 size_t length;
1261                 unsigned long flags;
1262                 void *kaddr;
1263
1264                 page_offset = offset & ~PAGE_MASK;
1265                 length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1266                 local_irq_save(flags);
1267                 kaddr = kmap_atomic(*page);
1268                 memset(kaddr + page_offset, 0, length);
1269                 flush_dcache_page(*page);
1270                 kunmap_atomic(kaddr);
1271                 local_irq_restore(flags);
1272
1273                 offset += length;
1274                 page++;
1275         }
1276 }
1277
1278 /*
1279  * Clone a portion of a bio, starting at the given byte offset
1280  * and continuing for the number of bytes indicated.
1281  */
1282 static struct bio *bio_clone_range(struct bio *bio_src,
1283                                         unsigned int offset,
1284                                         unsigned int len,
1285                                         gfp_t gfpmask)
1286 {
1287         struct bio *bio;
1288
1289         bio = bio_clone(bio_src, gfpmask);
1290         if (!bio)
1291                 return NULL;    /* ENOMEM */
1292
1293         bio_advance(bio, offset);
1294         bio->bi_iter.bi_size = len;
1295
1296         return bio;
1297 }
1298
1299 /*
1300  * Clone a portion of a bio chain, starting at the given byte offset
1301  * into the first bio in the source chain and continuing for the
1302  * number of bytes indicated.  The result is another bio chain of
1303  * exactly the given length, or a null pointer on error.
1304  *
1305  * The bio_src and offset parameters are both in-out.  On entry they
1306  * refer to the first source bio and the offset into that bio where
1307  * the start of data to be cloned is located.
1308  *
1309  * On return, bio_src is updated to refer to the bio in the source
1310  * chain that contains first un-cloned byte, and *offset will
1311  * contain the offset of that byte within that bio.
1312  */
1313 static struct bio *bio_chain_clone_range(struct bio **bio_src,
1314                                         unsigned int *offset,
1315                                         unsigned int len,
1316                                         gfp_t gfpmask)
1317 {
1318         struct bio *bi = *bio_src;
1319         unsigned int off = *offset;
1320         struct bio *chain = NULL;
1321         struct bio **end;
1322
1323         /* Build up a chain of clone bios up to the limit */
1324
1325         if (!bi || off >= bi->bi_iter.bi_size || !len)
1326                 return NULL;            /* Nothing to clone */
1327
1328         end = &chain;
1329         while (len) {
1330                 unsigned int bi_size;
1331                 struct bio *bio;
1332
1333                 if (!bi) {
1334                         rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1335                         goto out_err;   /* EINVAL; ran out of bio's */
1336                 }
1337                 bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1338                 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1339                 if (!bio)
1340                         goto out_err;   /* ENOMEM */
1341
1342                 *end = bio;
1343                 end = &bio->bi_next;
1344
1345                 off += bi_size;
1346                 if (off == bi->bi_iter.bi_size) {
1347                         bi = bi->bi_next;
1348                         off = 0;
1349                 }
1350                 len -= bi_size;
1351         }
1352         *bio_src = bi;
1353         *offset = off;
1354
1355         return chain;
1356 out_err:
1357         bio_chain_put(chain);
1358
1359         return NULL;
1360 }
1361
1362 /*
1363  * The default/initial value for all object request flags is 0.  For
1364  * each flag, once its value is set to 1 it is never reset to 0
1365  * again.
1366  */
1367 static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1368 {
1369         if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
1370                 struct rbd_device *rbd_dev;
1371
1372                 rbd_dev = obj_request->img_request->rbd_dev;
1373                 rbd_warn(rbd_dev, "obj_request %p already marked img_data",
1374                         obj_request);
1375         }
1376 }
1377
1378 static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1379 {
1380         smp_mb();
1381         return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1382 }
1383
1384 static void obj_request_done_set(struct rbd_obj_request *obj_request)
1385 {
1386         if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1387                 struct rbd_device *rbd_dev = NULL;
1388
1389                 if (obj_request_img_data_test(obj_request))
1390                         rbd_dev = obj_request->img_request->rbd_dev;
1391                 rbd_warn(rbd_dev, "obj_request %p already marked done",
1392                         obj_request);
1393         }
1394 }
1395
1396 static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1397 {
1398         smp_mb();
1399         return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1400 }
1401
1402 /*
1403  * This sets the KNOWN flag after (possibly) setting the EXISTS
1404  * flag.  The latter is set based on the "exists" value provided.
1405  *
1406  * Note that for our purposes once an object exists it never goes
1407  * away again.  It's possible that the response from two existence
1408  * checks are separated by the creation of the target object, and
1409  * the first ("doesn't exist") response arrives *after* the second
1410  * ("does exist").  In that case we ignore the second one.
1411  */
1412 static void obj_request_existence_set(struct rbd_obj_request *obj_request,
1413                                 bool exists)
1414 {
1415         if (exists)
1416                 set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
1417         set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
1418         smp_mb();
1419 }
1420
1421 static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1422 {
1423         smp_mb();
1424         return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1425 }
1426
1427 static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1428 {
1429         smp_mb();
1430         return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1431 }
1432
1433 static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
1434 {
1435         struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1436
1437         return obj_request->img_offset <
1438             round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
1439 }
1440
1441 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1442 {
1443         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1444                 atomic_read(&obj_request->kref.refcount));
1445         kref_get(&obj_request->kref);
1446 }
1447
1448 static void rbd_obj_request_destroy(struct kref *kref);
1449 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1450 {
1451         rbd_assert(obj_request != NULL);
1452         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1453                 atomic_read(&obj_request->kref.refcount));
1454         kref_put(&obj_request->kref, rbd_obj_request_destroy);
1455 }
1456
1457 static void rbd_img_request_get(struct rbd_img_request *img_request)
1458 {
1459         dout("%s: img %p (was %d)\n", __func__, img_request,
1460              atomic_read(&img_request->kref.refcount));
1461         kref_get(&img_request->kref);
1462 }
1463
1464 static bool img_request_child_test(struct rbd_img_request *img_request);
1465 static void rbd_parent_request_destroy(struct kref *kref);
1466 static void rbd_img_request_destroy(struct kref *kref);
1467 static void rbd_img_request_put(struct rbd_img_request *img_request)
1468 {
1469         rbd_assert(img_request != NULL);
1470         dout("%s: img %p (was %d)\n", __func__, img_request,
1471                 atomic_read(&img_request->kref.refcount));
1472         if (img_request_child_test(img_request))
1473                 kref_put(&img_request->kref, rbd_parent_request_destroy);
1474         else
1475                 kref_put(&img_request->kref, rbd_img_request_destroy);
1476 }
1477
1478 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1479                                         struct rbd_obj_request *obj_request)
1480 {
1481         rbd_assert(obj_request->img_request == NULL);
1482
1483         /* Image request now owns object's original reference */
1484         obj_request->img_request = img_request;
1485         obj_request->which = img_request->obj_request_count;
1486         rbd_assert(!obj_request_img_data_test(obj_request));
1487         obj_request_img_data_set(obj_request);
1488         rbd_assert(obj_request->which != BAD_WHICH);
1489         img_request->obj_request_count++;
1490         list_add_tail(&obj_request->links, &img_request->obj_requests);
1491         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1492                 obj_request->which);
1493 }
1494
1495 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1496                                         struct rbd_obj_request *obj_request)
1497 {
1498         rbd_assert(obj_request->which != BAD_WHICH);
1499
1500         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1501                 obj_request->which);
1502         list_del(&obj_request->links);
1503         rbd_assert(img_request->obj_request_count > 0);
1504         img_request->obj_request_count--;
1505         rbd_assert(obj_request->which == img_request->obj_request_count);
1506         obj_request->which = BAD_WHICH;
1507         rbd_assert(obj_request_img_data_test(obj_request));
1508         rbd_assert(obj_request->img_request == img_request);
1509         obj_request->img_request = NULL;
1510         obj_request->callback = NULL;
1511         rbd_obj_request_put(obj_request);
1512 }
1513
1514 static bool obj_request_type_valid(enum obj_request_type type)
1515 {
1516         switch (type) {
1517         case OBJ_REQUEST_NODATA:
1518         case OBJ_REQUEST_BIO:
1519         case OBJ_REQUEST_PAGES:
1520                 return true;
1521         default:
1522                 return false;
1523         }
1524 }
1525
1526 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1527                                 struct rbd_obj_request *obj_request)
1528 {
1529         dout("%s %p\n", __func__, obj_request);
1530         return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1531 }
1532
1533 static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
1534 {
1535         dout("%s %p\n", __func__, obj_request);
1536         ceph_osdc_cancel_request(obj_request->osd_req);
1537 }
1538
1539 /*
1540  * Wait for an object request to complete.  If interrupted, cancel the
1541  * underlying osd request.
1542  */
1543 static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1544 {
1545         int ret;
1546
1547         dout("%s %p\n", __func__, obj_request);
1548
1549         ret = wait_for_completion_interruptible(&obj_request->completion);
1550         if (ret < 0) {
1551                 dout("%s %p interrupted\n", __func__, obj_request);
1552                 rbd_obj_request_end(obj_request);
1553                 return ret;
1554         }
1555
1556         dout("%s %p done\n", __func__, obj_request);
1557         return 0;
1558 }
1559
1560 static void rbd_img_request_complete(struct rbd_img_request *img_request)
1561 {
1562
1563         dout("%s: img %p\n", __func__, img_request);
1564
1565         /*
1566          * If no error occurred, compute the aggregate transfer
1567          * count for the image request.  We could instead use
1568          * atomic64_cmpxchg() to update it as each object request
1569          * completes; not clear which way is better off hand.
1570          */
1571         if (!img_request->result) {
1572                 struct rbd_obj_request *obj_request;
1573                 u64 xferred = 0;
1574
1575                 for_each_obj_request(img_request, obj_request)
1576                         xferred += obj_request->xferred;
1577                 img_request->xferred = xferred;
1578         }
1579
1580         if (img_request->callback)
1581                 img_request->callback(img_request);
1582         else
1583                 rbd_img_request_put(img_request);
1584 }
1585
1586 /*
1587  * The default/initial value for all image request flags is 0.  Each
1588  * is conditionally set to 1 at image request initialization time
1589  * and currently never change thereafter.
1590  */
1591 static void img_request_write_set(struct rbd_img_request *img_request)
1592 {
1593         set_bit(IMG_REQ_WRITE, &img_request->flags);
1594         smp_mb();
1595 }
1596
1597 static bool img_request_write_test(struct rbd_img_request *img_request)
1598 {
1599         smp_mb();
1600         return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1601 }
1602
1603 static void img_request_child_set(struct rbd_img_request *img_request)
1604 {
1605         set_bit(IMG_REQ_CHILD, &img_request->flags);
1606         smp_mb();
1607 }
1608
1609 static void img_request_child_clear(struct rbd_img_request *img_request)
1610 {
1611         clear_bit(IMG_REQ_CHILD, &img_request->flags);
1612         smp_mb();
1613 }
1614
1615 static bool img_request_child_test(struct rbd_img_request *img_request)
1616 {
1617         smp_mb();
1618         return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1619 }
1620
1621 static void img_request_layered_set(struct rbd_img_request *img_request)
1622 {
1623         set_bit(IMG_REQ_LAYERED, &img_request->flags);
1624         smp_mb();
1625 }
1626
1627 static void img_request_layered_clear(struct rbd_img_request *img_request)
1628 {
1629         clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1630         smp_mb();
1631 }
1632
1633 static bool img_request_layered_test(struct rbd_img_request *img_request)
1634 {
1635         smp_mb();
1636         return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1637 }
1638
1639 static void
1640 rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1641 {
1642         u64 xferred = obj_request->xferred;
1643         u64 length = obj_request->length;
1644
1645         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1646                 obj_request, obj_request->img_request, obj_request->result,
1647                 xferred, length);
1648         /*
1649          * ENOENT means a hole in the image.  We zero-fill the entire
1650          * length of the request.  A short read also implies zero-fill
1651          * to the end of the request.  An error requires the whole
1652          * length of the request to be reported finished with an error
1653          * to the block layer.  In each case we update the xferred
1654          * count to indicate the whole request was satisfied.
1655          */
1656         rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
1657         if (obj_request->result == -ENOENT) {
1658                 if (obj_request->type == OBJ_REQUEST_BIO)
1659                         zero_bio_chain(obj_request->bio_list, 0);
1660                 else
1661                         zero_pages(obj_request->pages, 0, length);
1662                 obj_request->result = 0;
1663         } else if (xferred < length && !obj_request->result) {
1664                 if (obj_request->type == OBJ_REQUEST_BIO)
1665                         zero_bio_chain(obj_request->bio_list, xferred);
1666                 else
1667                         zero_pages(obj_request->pages, xferred, length);
1668         }
1669         obj_request->xferred = length;
1670         obj_request_done_set(obj_request);
1671 }
1672
1673 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1674 {
1675         dout("%s: obj %p cb %p\n", __func__, obj_request,
1676                 obj_request->callback);
1677         if (obj_request->callback)
1678                 obj_request->callback(obj_request);
1679         else
1680                 complete_all(&obj_request->completion);
1681 }
1682
/* Nothing to post-process: log and mark the object request done. */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1688
1689 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1690 {
1691         struct rbd_img_request *img_request = NULL;
1692         struct rbd_device *rbd_dev = NULL;
1693         bool layered = false;
1694
1695         if (obj_request_img_data_test(obj_request)) {
1696                 img_request = obj_request->img_request;
1697                 layered = img_request && img_request_layered_test(img_request);
1698                 rbd_dev = img_request->rbd_dev;
1699         }
1700
1701         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1702                 obj_request, img_request, obj_request->result,
1703                 obj_request->xferred, obj_request->length);
1704         if (layered && obj_request->result == -ENOENT &&
1705                         obj_request->img_offset < rbd_dev->parent_overlap)
1706                 rbd_img_parent_read(obj_request);
1707         else if (img_request)
1708                 rbd_img_obj_request_read_callback(obj_request);
1709         else
1710                 obj_request_done_set(obj_request);
1711 }
1712
1713 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1714 {
1715         dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1716                 obj_request->result, obj_request->length);
1717         /*
1718          * There is no such thing as a successful short write.  Set
1719          * it to our originally-requested length.
1720          */
1721         obj_request->xferred = obj_request->length;
1722         obj_request_done_set(obj_request);
1723 }
1724
/*
 * For a simple stat call there's nothing to do beyond marking the
 * request done.  We'll do more if this is part of a write sequence
 * for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1734
1735 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1736                                 struct ceph_msg *msg)
1737 {
1738         struct rbd_obj_request *obj_request = osd_req->r_priv;
1739         u16 opcode;
1740
1741         dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1742         rbd_assert(osd_req == obj_request->osd_req);
1743         if (obj_request_img_data_test(obj_request)) {
1744                 rbd_assert(obj_request->img_request);
1745                 rbd_assert(obj_request->which != BAD_WHICH);
1746         } else {
1747                 rbd_assert(obj_request->which == BAD_WHICH);
1748         }
1749
1750         if (osd_req->r_result < 0)
1751                 obj_request->result = osd_req->r_result;
1752
1753         rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
1754
1755         /*
1756          * We support a 64-bit length, but ultimately it has to be
1757          * passed to blk_end_request(), which takes an unsigned int.
1758          */
1759         obj_request->xferred = osd_req->r_reply_op_len[0];
1760         rbd_assert(obj_request->xferred < (u64)UINT_MAX);
1761
1762         opcode = osd_req->r_ops[0].op;
1763         switch (opcode) {
1764         case CEPH_OSD_OP_READ:
1765                 rbd_osd_read_callback(obj_request);
1766                 break;
1767         case CEPH_OSD_OP_SETALLOCHINT:
1768                 rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
1769                 /* fall through */
1770         case CEPH_OSD_OP_WRITE:
1771                 rbd_osd_write_callback(obj_request);
1772                 break;
1773         case CEPH_OSD_OP_STAT:
1774                 rbd_osd_stat_callback(obj_request);
1775                 break;
1776         case CEPH_OSD_OP_CALL:
1777         case CEPH_OSD_OP_NOTIFY_ACK:
1778         case CEPH_OSD_OP_WATCH:
1779                 rbd_osd_trivial_callback(obj_request);
1780                 break;
1781         default:
1782                 rbd_warn(NULL, "%s: unsupported op %hu",
1783                         obj_request->object_name, (unsigned short) opcode);
1784                 break;
1785         }
1786
1787         if (obj_request_done_test(obj_request))
1788                 rbd_obj_request_complete(obj_request);
1789 }
1790
1791 static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1792 {
1793         struct rbd_img_request *img_request = obj_request->img_request;
1794         struct ceph_osd_request *osd_req = obj_request->osd_req;
1795         u64 snap_id;
1796
1797         rbd_assert(osd_req != NULL);
1798
1799         snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
1800         ceph_osdc_build_request(osd_req, obj_request->offset,
1801                         NULL, snap_id, NULL);
1802 }
1803
1804 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1805 {
1806         struct rbd_img_request *img_request = obj_request->img_request;
1807         struct ceph_osd_request *osd_req = obj_request->osd_req;
1808         struct ceph_snap_context *snapc;
1809         struct timespec mtime = CURRENT_TIME;
1810
1811         rbd_assert(osd_req != NULL);
1812
1813         snapc = img_request ? img_request->snapc : NULL;
1814         ceph_osdc_build_request(osd_req, obj_request->offset,
1815                         snapc, CEPH_NOSNAP, &mtime);
1816 }
1817
1818 /*
1819  * Create an osd request.  A read request has one osd op (read).
1820  * A write request has either one (watch) or two (hint+write) osd ops.
1821  * (All rbd data writes are prefixed with an allocation hint op, but
1822  * technically osd watch is a write request, hence this distinction.)
1823  */
1824 static struct ceph_osd_request *rbd_osd_req_create(
1825                                         struct rbd_device *rbd_dev,
1826                                         bool write_request,
1827                                         unsigned int num_ops,
1828                                         struct rbd_obj_request *obj_request)
1829 {
1830         struct ceph_snap_context *snapc = NULL;
1831         struct ceph_osd_client *osdc;
1832         struct ceph_osd_request *osd_req;
1833
1834         if (obj_request_img_data_test(obj_request)) {
1835                 struct rbd_img_request *img_request = obj_request->img_request;
1836
1837                 rbd_assert(write_request ==
1838                                 img_request_write_test(img_request));
1839                 if (write_request)
1840                         snapc = img_request->snapc;
1841         }
1842
1843         rbd_assert(num_ops == 1 || (write_request && num_ops == 2));
1844
1845         /* Allocate and initialize the request, for the num_ops ops */
1846
1847         osdc = &rbd_dev->rbd_client->client->osdc;
1848         osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
1849                                           GFP_ATOMIC);
1850         if (!osd_req)
1851                 return NULL;    /* ENOMEM */
1852
1853         if (write_request)
1854                 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1855         else
1856                 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1857
1858         osd_req->r_callback = rbd_osd_req_callback;
1859         osd_req->r_priv = obj_request;
1860
1861         osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
1862         ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1863
1864         return osd_req;
1865 }
1866
1867 /*
1868  * Create a copyup osd request based on the information in the
1869  * object request supplied.  A copyup request has three osd ops,
1870  * a copyup method call, a hint op, and a write op.
1871  */
1872 static struct ceph_osd_request *
1873 rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1874 {
1875         struct rbd_img_request *img_request;
1876         struct ceph_snap_context *snapc;
1877         struct rbd_device *rbd_dev;
1878         struct ceph_osd_client *osdc;
1879         struct ceph_osd_request *osd_req;
1880
1881         rbd_assert(obj_request_img_data_test(obj_request));
1882         img_request = obj_request->img_request;
1883         rbd_assert(img_request);
1884         rbd_assert(img_request_write_test(img_request));
1885
1886         /* Allocate and initialize the request, for the three ops */
1887
1888         snapc = img_request->snapc;
1889         rbd_dev = img_request->rbd_dev;
1890         osdc = &rbd_dev->rbd_client->client->osdc;
1891         osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
1892         if (!osd_req)
1893                 return NULL;    /* ENOMEM */
1894
1895         osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1896         osd_req->r_callback = rbd_osd_req_callback;
1897         osd_req->r_priv = obj_request;
1898
1899         osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
1900         ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1901
1902         return osd_req;
1903 }
1904
1905
/* Drop a reference to an osd request created for an object request */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1910
1911 /* object_name is assumed to be a non-null pointer and NUL-terminated */
1912
1913 static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1914                                                 u64 offset, u64 length,
1915                                                 enum obj_request_type type)
1916 {
1917         struct rbd_obj_request *obj_request;
1918         size_t size;
1919         char *name;
1920
1921         rbd_assert(obj_request_type_valid(type));
1922
1923         size = strlen(object_name) + 1;
1924         name = kmalloc(size, GFP_KERNEL);
1925         if (!name)
1926                 return NULL;
1927
1928         obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1929         if (!obj_request) {
1930                 kfree(name);
1931                 return NULL;
1932         }
1933
1934         obj_request->object_name = memcpy(name, object_name, size);
1935         obj_request->offset = offset;
1936         obj_request->length = length;
1937         obj_request->flags = 0;
1938         obj_request->which = BAD_WHICH;
1939         obj_request->type = type;
1940         INIT_LIST_HEAD(&obj_request->links);
1941         init_completion(&obj_request->completion);
1942         kref_init(&obj_request->kref);
1943
1944         dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1945                 offset, length, (int)type, obj_request);
1946
1947         return obj_request;
1948 }
1949
1950 static void rbd_obj_request_destroy(struct kref *kref)
1951 {
1952         struct rbd_obj_request *obj_request;
1953
1954         obj_request = container_of(kref, struct rbd_obj_request, kref);
1955
1956         dout("%s: obj %p\n", __func__, obj_request);
1957
1958         rbd_assert(obj_request->img_request == NULL);
1959         rbd_assert(obj_request->which == BAD_WHICH);
1960
1961         if (obj_request->osd_req)
1962                 rbd_osd_req_destroy(obj_request->osd_req);
1963
1964         rbd_assert(obj_request_type_valid(obj_request->type));
1965         switch (obj_request->type) {
1966         case OBJ_REQUEST_NODATA:
1967                 break;          /* Nothing to do */
1968         case OBJ_REQUEST_BIO:
1969                 if (obj_request->bio_list)
1970                         bio_chain_put(obj_request->bio_list);
1971                 break;
1972         case OBJ_REQUEST_PAGES:
1973                 if (obj_request->pages)
1974                         ceph_release_page_vector(obj_request->pages,
1975                                                 obj_request->page_count);
1976                 break;
1977         }
1978
1979         kfree(obj_request->object_name);
1980         obj_request->object_name = NULL;
1981         kmem_cache_free(rbd_obj_request_cache, obj_request);
1982 }
1983
1984 /* It's OK to call this for a device with no parent */
1985
1986 static void rbd_spec_put(struct rbd_spec *spec);
1987 static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1988 {
1989         rbd_dev_remove_parent(rbd_dev);
1990         rbd_spec_put(rbd_dev->parent_spec);
1991         rbd_dev->parent_spec = NULL;
1992         rbd_dev->parent_overlap = 0;
1993 }
1994
1995 /*
1996  * Parent image reference counting is used to determine when an
1997  * image's parent fields can be safely torn down--after there are no
1998  * more in-flight requests to the parent image.  When the last
1999  * reference is dropped, cleaning them up is safe.
2000  */
2001 static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2002 {
2003         int counter;
2004
2005         if (!rbd_dev->parent_spec)
2006                 return;
2007
2008         counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2009         if (counter > 0)
2010                 return;
2011
2012         /* Last reference; clean up parent data structures */
2013
2014         if (!counter)
2015                 rbd_dev_unparent(rbd_dev);
2016         else
2017                 rbd_warn(rbd_dev, "parent reference underflow");
2018 }
2019
2020 /*
2021  * If an image has a non-zero parent overlap, get a reference to its
2022  * parent.
2023  *
2024  * We must get the reference before checking for the overlap to
2025  * coordinate properly with zeroing the parent overlap in
2026  * rbd_dev_v2_parent_info() when an image gets flattened.  We
2027  * drop it again if there is no overlap.
2028  *
2029  * Returns true if the rbd device has a parent with a non-zero
2030  * overlap and a reference for it was successfully taken, or
2031  * false otherwise.
2032  */
2033 static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2034 {
2035         int counter;
2036
2037         if (!rbd_dev->parent_spec)
2038                 return false;
2039
2040         counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2041         if (counter > 0 && rbd_dev->parent_overlap)
2042                 return true;
2043
2044         /* Image was flattened, but parent is not yet torn down */
2045
2046         if (counter < 0)
2047                 rbd_warn(rbd_dev, "parent reference overflow");
2048
2049         return false;
2050 }
2051
2052 /*
2053  * Caller is responsible for filling in the list of object requests
2054  * that comprises the image request, and the Linux request pointer
2055  * (if there is one).
2056  */
2057 static struct rbd_img_request *rbd_img_request_create(
2058                                         struct rbd_device *rbd_dev,
2059                                         u64 offset, u64 length,
2060                                         bool write_request)
2061 {
2062         struct rbd_img_request *img_request;
2063
2064         img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2065         if (!img_request)
2066                 return NULL;
2067
2068         if (write_request) {
2069                 down_read(&rbd_dev->header_rwsem);
2070                 ceph_get_snap_context(rbd_dev->header.snapc);
2071                 up_read(&rbd_dev->header_rwsem);
2072         }
2073
2074         img_request->rq = NULL;
2075         img_request->rbd_dev = rbd_dev;
2076         img_request->offset = offset;
2077         img_request->length = length;
2078         img_request->flags = 0;
2079         if (write_request) {
2080                 img_request_write_set(img_request);
2081                 img_request->snapc = rbd_dev->header.snapc;
2082         } else {
2083                 img_request->snap_id = rbd_dev->spec->snap_id;
2084         }
2085         if (rbd_dev_parent_get(rbd_dev))
2086                 img_request_layered_set(img_request);
2087         spin_lock_init(&img_request->completion_lock);
2088         img_request->next_completion = 0;
2089         img_request->callback = NULL;
2090         img_request->result = 0;
2091         img_request->obj_request_count = 0;
2092         INIT_LIST_HEAD(&img_request->obj_requests);
2093         kref_init(&img_request->kref);
2094
2095         dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
2096                 write_request ? "write" : "read", offset, length,
2097                 img_request);
2098
2099         return img_request;
2100 }
2101
2102 static void rbd_img_request_destroy(struct kref *kref)
2103 {
2104         struct rbd_img_request *img_request;
2105         struct rbd_obj_request *obj_request;
2106         struct rbd_obj_request *next_obj_request;
2107
2108         img_request = container_of(kref, struct rbd_img_request, kref);
2109
2110         dout("%s: img %p\n", __func__, img_request);
2111
2112         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2113                 rbd_img_obj_request_del(img_request, obj_request);
2114         rbd_assert(img_request->obj_request_count == 0);
2115
2116         if (img_request_layered_test(img_request)) {
2117                 img_request_layered_clear(img_request);
2118                 rbd_dev_parent_put(img_request->rbd_dev);
2119         }
2120
2121         if (img_request_write_test(img_request))
2122                 ceph_put_snap_context(img_request->snapc);
2123
2124         kmem_cache_free(rbd_img_request_cache, img_request);
2125 }
2126
2127 static struct rbd_img_request *rbd_parent_request_create(
2128                                         struct rbd_obj_request *obj_request,
2129                                         u64 img_offset, u64 length)
2130 {
2131         struct rbd_img_request *parent_request;
2132         struct rbd_device *rbd_dev;
2133
2134         rbd_assert(obj_request->img_request);
2135         rbd_dev = obj_request->img_request->rbd_dev;
2136
2137         parent_request = rbd_img_request_create(rbd_dev->parent,
2138                                                 img_offset, length, false);
2139         if (!parent_request)
2140                 return NULL;
2141
2142         img_request_child_set(parent_request);
2143         rbd_obj_request_get(obj_request);
2144         parent_request->obj_request = obj_request;
2145
2146         return parent_request;
2147 }
2148
2149 static void rbd_parent_request_destroy(struct kref *kref)
2150 {
2151         struct rbd_img_request *parent_request;
2152         struct rbd_obj_request *orig_request;
2153
2154         parent_request = container_of(kref, struct rbd_img_request, kref);
2155         orig_request = parent_request->obj_request;
2156
2157         parent_request->obj_request = NULL;
2158         rbd_obj_request_put(orig_request);
2159         img_request_child_clear(parent_request);
2160
2161         rbd_img_request_destroy(kref);
2162 }
2163
2164 static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
2165 {
2166         struct rbd_img_request *img_request;
2167         unsigned int xferred;
2168         int result;
2169         bool more;
2170
2171         rbd_assert(obj_request_img_data_test(obj_request));
2172         img_request = obj_request->img_request;
2173
2174         rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
2175         xferred = (unsigned int)obj_request->xferred;
2176         result = obj_request->result;
2177         if (result) {
2178                 struct rbd_device *rbd_dev = img_request->rbd_dev;
2179
2180                 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
2181                         img_request_write_test(img_request) ? "write" : "read",
2182                         obj_request->length, obj_request->img_offset,
2183                         obj_request->offset);
2184                 rbd_warn(rbd_dev, "  result %d xferred %x",
2185                         result, xferred);
2186                 if (!img_request->result)
2187                         img_request->result = result;
2188         }
2189
2190         /* Image object requests don't own their page array */
2191
2192         if (obj_request->type == OBJ_REQUEST_PAGES) {
2193                 obj_request->pages = NULL;
2194                 obj_request->page_count = 0;
2195         }
2196
2197         if (img_request_child_test(img_request)) {
2198                 rbd_assert(img_request->obj_request != NULL);
2199                 more = obj_request->which < img_request->obj_request_count - 1;
2200         } else {
2201                 rbd_assert(img_request->rq != NULL);
2202                 more = blk_end_request(img_request->rq, result, xferred);
2203         }
2204
2205         return more;
2206 }
2207
2208 static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
2209 {
2210         struct rbd_img_request *img_request;
2211         u32 which = obj_request->which;
2212         bool more = true;
2213
2214         rbd_assert(obj_request_img_data_test(obj_request));
2215         img_request = obj_request->img_request;
2216
2217         dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
2218         rbd_assert(img_request != NULL);
2219         rbd_assert(img_request->obj_request_count > 0);
2220         rbd_assert(which != BAD_WHICH);
2221         rbd_assert(which < img_request->obj_request_count);
2222
2223         spin_lock_irq(&img_request->completion_lock);
2224         if (which != img_request->next_completion)
2225                 goto out;
2226
2227         for_each_obj_request_from(img_request, obj_request) {
2228                 rbd_assert(more);
2229                 rbd_assert(which < img_request->obj_request_count);
2230
2231                 if (!obj_request_done_test(obj_request))
2232                         break;
2233                 more = rbd_img_obj_end_request(obj_request);
2234                 which++;
2235         }
2236
2237         rbd_assert(more ^ (which == img_request->obj_request_count));
2238         img_request->next_completion = which;
2239 out:
2240         spin_unlock_irq(&img_request->completion_lock);
2241         rbd_img_request_put(img_request);
2242
2243         if (!more)
2244                 rbd_img_request_complete(img_request);
2245 }
2246
2247 /*
2248  * Split up an image request into one or more object requests, each
2249  * to a different object.  The "type" parameter indicates whether
2250  * "data_desc" is the pointer to the head of a list of bio
2251  * structures, or the base of a page array.  In either case this
2252  * function assumes data_desc describes memory sufficient to hold
2253  * all data described by the image request.
2254  */
2255 static int rbd_img_request_fill(struct rbd_img_request *img_request,
2256                                         enum obj_request_type type,
2257                                         void *data_desc)
2258 {
2259         struct rbd_device *rbd_dev = img_request->rbd_dev;
2260         struct rbd_obj_request *obj_request = NULL;
2261         struct rbd_obj_request *next_obj_request;
2262         bool write_request = img_request_write_test(img_request);
2263         struct bio *bio_list = NULL;
2264         unsigned int bio_offset = 0;
2265         struct page **pages = NULL;
2266         u64 img_offset;
2267         u64 resid;
2268         u16 opcode;
2269
2270         dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2271                 (int)type, data_desc);
2272
2273         opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
2274         img_offset = img_request->offset;
2275         resid = img_request->length;
2276         rbd_assert(resid > 0);
2277
2278         if (type == OBJ_REQUEST_BIO) {
2279                 bio_list = data_desc;
2280                 rbd_assert(img_offset ==
2281                            bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
2282         } else {
2283                 rbd_assert(type == OBJ_REQUEST_PAGES);
2284                 pages = data_desc;
2285         }
2286
2287         while (resid) {
2288                 struct ceph_osd_request *osd_req;
2289                 const char *object_name;
2290                 u64 offset;
2291                 u64 length;
2292                 unsigned int which = 0;
2293
2294                 object_name = rbd_segment_name(rbd_dev, img_offset);
2295                 if (!object_name)
2296                         goto out_unwind;
2297                 offset = rbd_segment_offset(rbd_dev, img_offset);
2298                 length = rbd_segment_length(rbd_dev, img_offset, resid);
2299                 obj_request = rbd_obj_request_create(object_name,
2300                                                 offset, length, type);
2301                 /* object request has its own copy of the object name */
2302                 rbd_segment_name_free(object_name);
2303                 if (!obj_request)
2304                         goto out_unwind;
2305
2306                 /*
2307                  * set obj_request->img_request before creating the
2308                  * osd_request so that it gets the right snapc
2309                  */
2310                 rbd_img_obj_request_add(img_request, obj_request);
2311
2312                 if (type == OBJ_REQUEST_BIO) {
2313                         unsigned int clone_size;
2314
2315                         rbd_assert(length <= (u64)UINT_MAX);
2316                         clone_size = (unsigned int)length;
2317                         obj_request->bio_list =
2318                                         bio_chain_clone_range(&bio_list,
2319                                                                 &bio_offset,
2320                                                                 clone_size,
2321                                                                 GFP_ATOMIC);
2322                         if (!obj_request->bio_list)
2323                                 goto out_unwind;
2324                 } else {
2325                         unsigned int page_count;
2326
2327                         obj_request->pages = pages;
2328                         page_count = (u32)calc_pages_for(offset, length);
2329                         obj_request->page_count = page_count;
2330                         if ((offset + length) & ~PAGE_MASK)
2331                                 page_count--;   /* more on last page */
2332                         pages += page_count;
2333                 }
2334
2335                 osd_req = rbd_osd_req_create(rbd_dev, write_request,
2336                                              (write_request ? 2 : 1),
2337                                              obj_request);
2338                 if (!osd_req)
2339                         goto out_unwind;
2340                 obj_request->osd_req = osd_req;
2341                 obj_request->callback = rbd_img_obj_callback;
2342                 rbd_img_request_get(img_request);
2343
2344                 if (write_request) {
2345                         osd_req_op_alloc_hint_init(osd_req, which,
2346                                              rbd_obj_bytes(&rbd_dev->header),
2347                                              rbd_obj_bytes(&rbd_dev->header));
2348                         which++;
2349                 }
2350
2351                 osd_req_op_extent_init(osd_req, which, opcode, offset, length,
2352                                        0, 0);
2353                 if (type == OBJ_REQUEST_BIO)
2354                         osd_req_op_extent_osd_data_bio(osd_req, which,
2355                                         obj_request->bio_list, length);
2356                 else
2357                         osd_req_op_extent_osd_data_pages(osd_req, which,
2358                                         obj_request->pages, length,
2359                                         offset & ~PAGE_MASK, false, false);
2360
2361                 if (write_request)
2362                         rbd_osd_req_format_write(obj_request);
2363                 else
2364                         rbd_osd_req_format_read(obj_request);
2365
2366                 obj_request->img_offset = img_offset;
2367
2368                 img_offset += length;
2369                 resid -= length;
2370         }
2371
2372         return 0;
2373
2374 out_unwind:
2375         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2376                 rbd_img_obj_request_del(img_request, obj_request);
2377
2378         return -ENOMEM;
2379 }
2380
2381 static void
2382 rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2383 {
2384         struct rbd_img_request *img_request;
2385         struct rbd_device *rbd_dev;
2386         struct page **pages;
2387         u32 page_count;
2388
2389         rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2390         rbd_assert(obj_request_img_data_test(obj_request));
2391         img_request = obj_request->img_request;
2392         rbd_assert(img_request);
2393
2394         rbd_dev = img_request->rbd_dev;
2395         rbd_assert(rbd_dev);
2396
2397         pages = obj_request->copyup_pages;
2398         rbd_assert(pages != NULL);
2399         obj_request->copyup_pages = NULL;
2400         page_count = obj_request->copyup_page_count;
2401         rbd_assert(page_count);
2402         obj_request->copyup_page_count = 0;
2403         ceph_release_page_vector(pages, page_count);
2404
2405         /*
2406          * We want the transfer count to reflect the size of the
2407          * original write request.  There is no such thing as a
2408          * successful short write, so if the request was successful
2409          * we can just set it to the originally-requested length.
2410          */
2411         if (!obj_request->result)
2412                 obj_request->xferred = obj_request->length;
2413
2414         /* Finish up with the normal image object callback */
2415
2416         rbd_img_obj_callback(obj_request);
2417 }
2418
2419 static void
2420 rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2421 {
2422         struct rbd_obj_request *orig_request;
2423         struct ceph_osd_request *osd_req;
2424         struct ceph_osd_client *osdc;
2425         struct rbd_device *rbd_dev;
2426         struct page **pages;
2427         u32 page_count;
2428         int img_result;
2429         u64 parent_length;
2430         u64 offset;
2431         u64 length;
2432
2433         rbd_assert(img_request_child_test(img_request));
2434
2435         /* First get what we need from the image request */
2436
2437         pages = img_request->copyup_pages;
2438         rbd_assert(pages != NULL);
2439         img_request->copyup_pages = NULL;
2440         page_count = img_request->copyup_page_count;
2441         rbd_assert(page_count);
2442         img_request->copyup_page_count = 0;
2443
2444         orig_request = img_request->obj_request;
2445         rbd_assert(orig_request != NULL);
2446         rbd_assert(obj_request_type_valid(orig_request->type));
2447         img_result = img_request->result;
2448         parent_length = img_request->length;
2449         rbd_assert(parent_length == img_request->xferred);
2450         rbd_img_request_put(img_request);
2451
2452         rbd_assert(orig_request->img_request);
2453         rbd_dev = orig_request->img_request->rbd_dev;
2454         rbd_assert(rbd_dev);
2455
2456         /*
2457          * If the overlap has become 0 (most likely because the
2458          * image has been flattened) we need to free the pages
2459          * and re-submit the original write request.
2460          */
2461         if (!rbd_dev->parent_overlap) {
2462                 struct ceph_osd_client *osdc;
2463
2464                 ceph_release_page_vector(pages, page_count);
2465                 osdc = &rbd_dev->rbd_client->client->osdc;
2466                 img_result = rbd_obj_request_submit(osdc, orig_request);
2467                 if (!img_result)
2468                         return;
2469         }
2470
2471         if (img_result)
2472                 goto out_err;
2473
2474         /*
2475          * The original osd request is of no use to use any more.
2476          * We need a new one that can hold the three ops in a copyup
2477          * request.  Allocate the new copyup osd request for the
2478          * original request, and release the old one.
2479          */
2480         img_result = -ENOMEM;
2481         osd_req = rbd_osd_req_create_copyup(orig_request);
2482         if (!osd_req)
2483                 goto out_err;
2484         rbd_osd_req_destroy(orig_request->osd_req);
2485         orig_request->osd_req = osd_req;
2486         orig_request->copyup_pages = pages;
2487         orig_request->copyup_page_count = page_count;
2488
2489         /* Initialize the copyup op */
2490
2491         osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2492         osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
2493                                                 false, false);
2494
2495         /* Then the hint op */
2496
2497         osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header),
2498                                    rbd_obj_bytes(&rbd_dev->header));
2499
2500         /* And the original write request op */
2501
2502         offset = orig_request->offset;
2503         length = orig_request->length;
2504         osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
2505                                         offset, length, 0, 0);
2506         if (orig_request->type == OBJ_REQUEST_BIO)
2507                 osd_req_op_extent_osd_data_bio(osd_req, 2,
2508                                         orig_request->bio_list, length);
2509         else
2510                 osd_req_op_extent_osd_data_pages(osd_req, 2,
2511                                         orig_request->pages, length,
2512                                         offset & ~PAGE_MASK, false, false);
2513
2514         rbd_osd_req_format_write(orig_request);
2515
2516         /* All set, send it off. */
2517
2518         orig_request->callback = rbd_img_obj_copyup_callback;
2519         osdc = &rbd_dev->rbd_client->client->osdc;
2520         img_result = rbd_obj_request_submit(osdc, orig_request);
2521         if (!img_result)
2522                 return;
2523 out_err:
2524         /* Record the error code and complete the request */
2525
2526         orig_request->result = img_result;
2527         orig_request->xferred = 0;
2528         obj_request_done_set(orig_request);
2529         rbd_obj_request_complete(orig_request);
2530 }
2531
2532 /*
2533  * Read from the parent image the range of data that covers the
2534  * entire target of the given object request.  This is used for
2535  * satisfying a layered image write request when the target of an
2536  * object request from the image request does not exist.
2537  *
2538  * A page array big enough to hold the returned data is allocated
2539  * and supplied to rbd_img_request_fill() as the "data descriptor."
2540  * When the read completes, this page array will be transferred to
2541  * the original object request for the copyup operation.
2542  *
2543  * If an error occurs, record it as the result of the original
2544  * object request and mark it done so it gets completed.
2545  */
2546 static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2547 {
2548         struct rbd_img_request *img_request = NULL;
2549         struct rbd_img_request *parent_request = NULL;
2550         struct rbd_device *rbd_dev;
2551         u64 img_offset;
2552         u64 length;
2553         struct page **pages = NULL;
2554         u32 page_count;
2555         int result;
2556
2557         rbd_assert(obj_request_img_data_test(obj_request));
2558         rbd_assert(obj_request_type_valid(obj_request->type));
2559
2560         img_request = obj_request->img_request;
2561         rbd_assert(img_request != NULL);
2562         rbd_dev = img_request->rbd_dev;
2563         rbd_assert(rbd_dev->parent != NULL);
2564
2565         /*
2566          * Determine the byte range covered by the object in the
2567          * child image to which the original request was to be sent.
2568          */
2569         img_offset = obj_request->img_offset - obj_request->offset;
2570         length = (u64)1 << rbd_dev->header.obj_order;
2571
2572         /*
2573          * There is no defined parent data beyond the parent
2574          * overlap, so limit what we read at that boundary if
2575          * necessary.
2576          */
2577         if (img_offset + length > rbd_dev->parent_overlap) {
2578                 rbd_assert(img_offset < rbd_dev->parent_overlap);
2579                 length = rbd_dev->parent_overlap - img_offset;
2580         }
2581
2582         /*
2583          * Allocate a page array big enough to receive the data read
2584          * from the parent.
2585          */
2586         page_count = (u32)calc_pages_for(0, length);
2587         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2588         if (IS_ERR(pages)) {
2589                 result = PTR_ERR(pages);
2590                 pages = NULL;
2591                 goto out_err;
2592         }
2593
2594         result = -ENOMEM;
2595         parent_request = rbd_parent_request_create(obj_request,
2596                                                 img_offset, length);
2597         if (!parent_request)
2598                 goto out_err;
2599
2600         result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2601         if (result)
2602                 goto out_err;
2603         parent_request->copyup_pages = pages;
2604         parent_request->copyup_page_count = page_count;
2605
2606         parent_request->callback = rbd_img_obj_parent_read_full_callback;
2607         result = rbd_img_request_submit(parent_request);
2608         if (!result)
2609                 return 0;
2610
2611         parent_request->copyup_pages = NULL;
2612         parent_request->copyup_page_count = 0;
2613         parent_request->obj_request = NULL;
2614         rbd_obj_request_put(obj_request);
2615 out_err:
2616         if (pages)
2617                 ceph_release_page_vector(pages, page_count);
2618         if (parent_request)
2619                 rbd_img_request_put(parent_request);
2620         obj_request->result = result;
2621         obj_request->xferred = 0;
2622         obj_request_done_set(obj_request);
2623
2624         return result;
2625 }
2626
2627 static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2628 {
2629         struct rbd_obj_request *orig_request;
2630         struct rbd_device *rbd_dev;
2631         int result;
2632
2633         rbd_assert(!obj_request_img_data_test(obj_request));
2634
2635         /*
2636          * All we need from the object request is the original
2637          * request and the result of the STAT op.  Grab those, then
2638          * we're done with the request.
2639          */
2640         orig_request = obj_request->obj_request;
2641         obj_request->obj_request = NULL;
2642         rbd_obj_request_put(orig_request);
2643         rbd_assert(orig_request);
2644         rbd_assert(orig_request->img_request);
2645
2646         result = obj_request->result;
2647         obj_request->result = 0;
2648
2649         dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2650                 obj_request, orig_request, result,
2651                 obj_request->xferred, obj_request->length);
2652         rbd_obj_request_put(obj_request);
2653
2654         /*
2655          * If the overlap has become 0 (most likely because the
2656          * image has been flattened) we need to free the pages
2657          * and re-submit the original write request.
2658          */
2659         rbd_dev = orig_request->img_request->rbd_dev;
2660         if (!rbd_dev->parent_overlap) {
2661                 struct ceph_osd_client *osdc;
2662
2663                 osdc = &rbd_dev->rbd_client->client->osdc;
2664                 result = rbd_obj_request_submit(osdc, orig_request);
2665                 if (!result)
2666                         return;
2667         }
2668
2669         /*
2670          * Our only purpose here is to determine whether the object
2671          * exists, and we don't want to treat the non-existence as
2672          * an error.  If something else comes back, transfer the
2673          * error to the original request and complete it now.
2674          */
2675         if (!result) {
2676                 obj_request_existence_set(orig_request, true);
2677         } else if (result == -ENOENT) {
2678                 obj_request_existence_set(orig_request, false);
2679         } else if (result) {
2680                 orig_request->result = result;
2681                 goto out;
2682         }
2683
2684         /*
2685          * Resubmit the original request now that we have recorded
2686          * whether the target object exists.
2687          */
2688         orig_request->result = rbd_img_obj_request_submit(orig_request);
2689 out:
2690         if (orig_request->result)
2691                 rbd_obj_request_complete(orig_request);
2692 }
2693
2694 static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2695 {
2696         struct rbd_obj_request *stat_request;
2697         struct rbd_device *rbd_dev;
2698         struct ceph_osd_client *osdc;
2699         struct page **pages = NULL;
2700         u32 page_count;
2701         size_t size;
2702         int ret;
2703
2704         /*
2705          * The response data for a STAT call consists of:
2706          *     le64 length;
2707          *     struct {
2708          *         le32 tv_sec;
2709          *         le32 tv_nsec;
2710          *     } mtime;
2711          */
2712         size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2713         page_count = (u32)calc_pages_for(0, size);
2714         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2715         if (IS_ERR(pages))
2716                 return PTR_ERR(pages);
2717
2718         ret = -ENOMEM;
2719         stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2720                                                         OBJ_REQUEST_PAGES);
2721         if (!stat_request)
2722                 goto out;
2723
2724         rbd_obj_request_get(obj_request);
2725         stat_request->obj_request = obj_request;
2726         stat_request->pages = pages;
2727         stat_request->page_count = page_count;
2728
2729         rbd_assert(obj_request->img_request);
2730         rbd_dev = obj_request->img_request->rbd_dev;
2731         stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2732                                                    stat_request);
2733         if (!stat_request->osd_req)
2734                 goto out;
2735         stat_request->callback = rbd_img_obj_exists_callback;
2736
2737         osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2738         osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2739                                         false, false);
2740         rbd_osd_req_format_read(stat_request);
2741
2742         osdc = &rbd_dev->rbd_client->client->osdc;
2743         ret = rbd_obj_request_submit(osdc, stat_request);
2744 out:
2745         if (ret)
2746                 rbd_obj_request_put(obj_request);
2747
2748         return ret;
2749 }
2750
2751 static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2752 {
2753         struct rbd_img_request *img_request;
2754         struct rbd_device *rbd_dev;
2755         bool known;
2756
2757         rbd_assert(obj_request_img_data_test(obj_request));
2758
2759         img_request = obj_request->img_request;
2760         rbd_assert(img_request);
2761         rbd_dev = img_request->rbd_dev;
2762
2763         /*
2764          * Only writes to layered images need special handling.
2765          * Reads and non-layered writes are simple object requests.
2766          * Layered writes that start beyond the end of the overlap
2767          * with the parent have no parent data, so they too are
2768          * simple object requests.  Finally, if the target object is
2769          * known to already exist, its parent data has already been
2770          * copied, so a write to the object can also be handled as a
2771          * simple object request.
2772          */
2773         if (!img_request_write_test(img_request) ||
2774                 !img_request_layered_test(img_request) ||
2775                 !obj_request_overlaps_parent(obj_request) ||
2776                 ((known = obj_request_known_test(obj_request)) &&
2777                         obj_request_exists_test(obj_request))) {
2778
2779                 struct rbd_device *rbd_dev;
2780                 struct ceph_osd_client *osdc;
2781
2782                 rbd_dev = obj_request->img_request->rbd_dev;
2783                 osdc = &rbd_dev->rbd_client->client->osdc;
2784
2785                 return rbd_obj_request_submit(osdc, obj_request);
2786         }
2787
2788         /*
2789          * It's a layered write.  The target object might exist but
2790          * we may not know that yet.  If we know it doesn't exist,
2791          * start by reading the data for the full target object from
2792          * the parent so we can use it for a copyup to the target.
2793          */
2794         if (known)
2795                 return rbd_img_obj_parent_read_full(obj_request);
2796
2797         /* We don't know whether the target exists.  Go find out. */
2798
2799         return rbd_img_obj_exists_submit(obj_request);
2800 }
2801
2802 static int rbd_img_request_submit(struct rbd_img_request *img_request)
2803 {
2804         struct rbd_obj_request *obj_request;
2805         struct rbd_obj_request *next_obj_request;
2806
2807         dout("%s: img %p\n", __func__, img_request);
2808         for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2809                 int ret;
2810
2811                 ret = rbd_img_obj_request_submit(obj_request);
2812                 if (ret)
2813                         return ret;
2814         }
2815
2816         return 0;
2817 }
2818
2819 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2820 {
2821         struct rbd_obj_request *obj_request;
2822         struct rbd_device *rbd_dev;
2823         u64 obj_end;
2824         u64 img_xferred;
2825         int img_result;
2826
2827         rbd_assert(img_request_child_test(img_request));
2828
2829         /* First get what we need from the image request and release it */
2830
2831         obj_request = img_request->obj_request;
2832         img_xferred = img_request->xferred;
2833         img_result = img_request->result;
2834         rbd_img_request_put(img_request);
2835
2836         /*
2837          * If the overlap has become 0 (most likely because the
2838          * image has been flattened) we need to re-submit the
2839          * original request.
2840          */
2841         rbd_assert(obj_request);
2842         rbd_assert(obj_request->img_request);
2843         rbd_dev = obj_request->img_request->rbd_dev;
2844         if (!rbd_dev->parent_overlap) {
2845                 struct ceph_osd_client *osdc;
2846
2847                 osdc = &rbd_dev->rbd_client->client->osdc;
2848                 img_result = rbd_obj_request_submit(osdc, obj_request);
2849                 if (!img_result)
2850                         return;
2851         }
2852
2853         obj_request->result = img_result;
2854         if (obj_request->result)
2855                 goto out;
2856
2857         /*
2858          * We need to zero anything beyond the parent overlap
2859          * boundary.  Since rbd_img_obj_request_read_callback()
2860          * will zero anything beyond the end of a short read, an
2861          * easy way to do this is to pretend the data from the
2862          * parent came up short--ending at the overlap boundary.
2863          */
2864         rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2865         obj_end = obj_request->img_offset + obj_request->length;
2866         if (obj_end > rbd_dev->parent_overlap) {
2867                 u64 xferred = 0;
2868
2869                 if (obj_request->img_offset < rbd_dev->parent_overlap)
2870                         xferred = rbd_dev->parent_overlap -
2871                                         obj_request->img_offset;
2872
2873                 obj_request->xferred = min(img_xferred, xferred);
2874         } else {
2875                 obj_request->xferred = img_xferred;
2876         }
2877 out:
2878         rbd_img_obj_request_read_callback(obj_request);
2879         rbd_obj_request_complete(obj_request);
2880 }
2881
2882 static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2883 {
2884         struct rbd_img_request *img_request;
2885         int result;
2886
2887         rbd_assert(obj_request_img_data_test(obj_request));
2888         rbd_assert(obj_request->img_request != NULL);
2889         rbd_assert(obj_request->result == (s32) -ENOENT);
2890         rbd_assert(obj_request_type_valid(obj_request->type));
2891
2892         /* rbd_read_finish(obj_request, obj_request->length); */
2893         img_request = rbd_parent_request_create(obj_request,
2894                                                 obj_request->img_offset,
2895                                                 obj_request->length);
2896         result = -ENOMEM;
2897         if (!img_request)
2898                 goto out_err;
2899
2900         if (obj_request->type == OBJ_REQUEST_BIO)
2901                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2902                                                 obj_request->bio_list);
2903         else
2904                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
2905                                                 obj_request->pages);
2906         if (result)
2907                 goto out_err;
2908
2909         img_request->callback = rbd_img_parent_read_callback;
2910         result = rbd_img_request_submit(img_request);
2911         if (result)
2912                 goto out_err;
2913
2914         return;
2915 out_err:
2916         if (img_request)
2917                 rbd_img_request_put(img_request);
2918         obj_request->result = result;
2919         obj_request->xferred = 0;
2920         obj_request_done_set(obj_request);
2921 }
2922
2923 static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
2924 {
2925         struct rbd_obj_request *obj_request;
2926         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2927         int ret;
2928
2929         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2930                                                         OBJ_REQUEST_NODATA);
2931         if (!obj_request)
2932                 return -ENOMEM;
2933
2934         ret = -ENOMEM;
2935         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2936                                                   obj_request);
2937         if (!obj_request->osd_req)
2938                 goto out;
2939
2940         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2941                                         notify_id, 0, 0);
2942         rbd_osd_req_format_read(obj_request);
2943
2944         ret = rbd_obj_request_submit(osdc, obj_request);
2945         if (ret)
2946                 goto out;
2947         ret = rbd_obj_request_wait(obj_request);
2948 out:
2949         rbd_obj_request_put(obj_request);
2950
2951         return ret;
2952 }
2953
2954 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2955 {
2956         struct rbd_device *rbd_dev = (struct rbd_device *)data;
2957         int ret;
2958
2959         if (!rbd_dev)
2960                 return;
2961
2962         dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2963                 rbd_dev->header_name, (unsigned long long)notify_id,
2964                 (unsigned int)opcode);
2965
2966         /*
2967          * Until adequate refresh error handling is in place, there is
2968          * not much we can do here, except warn.
2969          *
2970          * See http://tracker.ceph.com/issues/5040
2971          */
2972         ret = rbd_dev_refresh(rbd_dev);
2973         if (ret)
2974                 rbd_warn(rbd_dev, "refresh failed: %d", ret);
2975
2976         ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
2977         if (ret)
2978                 rbd_warn(rbd_dev, "notify_ack ret %d", ret);
2979 }
2980
2981 /*
2982  * Send a (un)watch request and wait for the ack.  Return a request
2983  * with a ref held on success or error.
2984  */
2985 static struct rbd_obj_request *rbd_obj_watch_request_helper(
2986                                                 struct rbd_device *rbd_dev,
2987                                                 bool watch)
2988 {
2989         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2990         struct rbd_obj_request *obj_request;
2991         int ret;
2992
2993         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2994                                              OBJ_REQUEST_NODATA);
2995         if (!obj_request)
2996                 return ERR_PTR(-ENOMEM);
2997
2998         obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
2999                                                   obj_request);
3000         if (!obj_request->osd_req) {
3001                 ret = -ENOMEM;
3002                 goto out;
3003         }
3004
3005         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
3006                               rbd_dev->watch_event->cookie, 0, watch);
3007         rbd_osd_req_format_write(obj_request);
3008
3009         if (watch)
3010                 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
3011
3012         ret = rbd_obj_request_submit(osdc, obj_request);
3013         if (ret)
3014                 goto out;
3015
3016         ret = rbd_obj_request_wait(obj_request);
3017         if (ret)
3018                 goto out;
3019
3020         ret = obj_request->result;
3021         if (ret) {
3022                 if (watch)
3023                         rbd_obj_request_end(obj_request);
3024                 goto out;
3025         }
3026
3027         return obj_request;
3028
3029 out:
3030         rbd_obj_request_put(obj_request);
3031         return ERR_PTR(ret);
3032 }
3033
3034 /*
3035  * Initiate a watch request, synchronously.
3036  */
3037 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
3038 {
3039         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3040         struct rbd_obj_request *obj_request;
3041         int ret;
3042
3043         rbd_assert(!rbd_dev->watch_event);
3044         rbd_assert(!rbd_dev->watch_request);
3045
3046         ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
3047                                      &rbd_dev->watch_event);
3048         if (ret < 0)
3049                 return ret;
3050
3051         obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
3052         if (IS_ERR(obj_request)) {
3053                 ceph_osdc_cancel_event(rbd_dev->watch_event);
3054                 rbd_dev->watch_event = NULL;
3055                 return PTR_ERR(obj_request);
3056         }
3057
3058         /*
3059          * A watch request is set to linger, so the underlying osd
3060          * request won't go away until we unregister it.  We retain
3061          * a pointer to the object request during that time (in
3062          * rbd_dev->watch_request), so we'll keep a reference to it.
3063          * We'll drop that reference after we've unregistered it in
3064          * rbd_dev_header_unwatch_sync().
3065          */
3066         rbd_dev->watch_request = obj_request;
3067
3068         return 0;
3069 }
3070
3071 /*
3072  * Tear down a watch request, synchronously.
3073  */
3074 static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
3075 {
3076         struct rbd_obj_request *obj_request;
3077
3078         rbd_assert(rbd_dev->watch_event);
3079         rbd_assert(rbd_dev->watch_request);
3080
3081         rbd_obj_request_end(rbd_dev->watch_request);
3082         rbd_obj_request_put(rbd_dev->watch_request);
3083         rbd_dev->watch_request = NULL;
3084
3085         obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
3086         if (!IS_ERR(obj_request))
3087                 rbd_obj_request_put(obj_request);
3088         else
3089                 rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
3090                          PTR_ERR(obj_request));
3091
3092         ceph_osdc_cancel_event(rbd_dev->watch_event);
3093         rbd_dev->watch_event = NULL;
3094 }
3095
3096 /*
3097  * Synchronous osd object method call.  Returns the number of bytes
3098  * returned in the outbound buffer, or a negative error code.
3099  */
3100 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
3101                              const char *object_name,
3102                              const char *class_name,
3103                              const char *method_name,
3104                              const void *outbound,
3105                              size_t outbound_size,
3106                              void *inbound,
3107                              size_t inbound_size)
3108 {
3109         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3110         struct rbd_obj_request *obj_request;
3111         struct page **pages;
3112         u32 page_count;
3113         int ret;
3114
3115         /*
3116          * Method calls are ultimately read operations.  The result
3117          * should placed into the inbound buffer provided.  They
3118          * also supply outbound data--parameters for the object
3119          * method.  Currently if this is present it will be a
3120          * snapshot id.
3121          */
3122         page_count = (u32)calc_pages_for(0, inbound_size);
3123         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3124         if (IS_ERR(pages))
3125                 return PTR_ERR(pages);
3126
3127         ret = -ENOMEM;
3128         obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
3129                                                         OBJ_REQUEST_PAGES);
3130         if (!obj_request)
3131                 goto out;
3132
3133         obj_request->pages = pages;
3134         obj_request->page_count = page_count;
3135
3136         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
3137                                                   obj_request);
3138         if (!obj_request->osd_req)
3139                 goto out;
3140
3141         osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
3142                                         class_name, method_name);
3143         if (outbound_size) {
3144                 struct ceph_pagelist *pagelist;
3145
3146                 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
3147                 if (!pagelist)
3148                         goto out;
3149
3150                 ceph_pagelist_init(pagelist);
3151                 ceph_pagelist_append(pagelist, outbound, outbound_size);
3152                 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
3153                                                 pagelist);
3154         }
3155         osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3156                                         obj_request->pages, inbound_size,
3157                                         0, false, false);
3158         rbd_osd_req_format_read(obj_request);
3159
3160         ret = rbd_obj_request_submit(osdc, obj_request);
3161         if (ret)
3162                 goto out;
3163         ret = rbd_obj_request_wait(obj_request);
3164         if (ret)
3165                 goto out;
3166
3167         ret = obj_request->result;
3168         if (ret < 0)
3169                 goto out;
3170
3171         rbd_assert(obj_request->xferred < (u64)INT_MAX);
3172         ret = (int)obj_request->xferred;
3173         ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
3174 out:
3175         if (obj_request)
3176                 rbd_obj_request_put(obj_request);
3177         else
3178                 ceph_release_page_vector(pages, page_count);
3179
3180         return ret;
3181 }
3182
3183 static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
3184 {
3185         struct rbd_img_request *img_request;
3186         u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3187         u64 length = blk_rq_bytes(rq);
3188         bool wr = rq_data_dir(rq) == WRITE;
3189         int result;
3190
3191         /* Ignore/skip any zero-length requests */
3192
3193         if (!length) {
3194                 dout("%s: zero-length request\n", __func__);
3195                 result = 0;
3196                 goto err_rq;
3197         }
3198
3199         /* Disallow writes to a read-only device */
3200
3201         if (wr) {
3202                 if (rbd_dev->mapping.read_only) {
3203                         result = -EROFS;
3204                         goto err_rq;
3205                 }
3206                 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3207         }
3208
3209         /*
3210          * Quit early if the mapped snapshot no longer exists.  It's
3211          * still possible the snapshot will have disappeared by the
3212          * time our request arrives at the osd, but there's no sense in
3213          * sending it if we already know.
3214          */
3215         if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3216                 dout("request for non-existent snapshot");
3217                 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3218                 result = -ENXIO;
3219                 goto err_rq;
3220         }
3221
3222         if (offset && length > U64_MAX - offset + 1) {
3223                 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3224                          length);
3225                 result = -EINVAL;
3226                 goto err_rq;    /* Shouldn't happen */
3227         }
3228
3229         if (offset + length > rbd_dev->mapping.size) {
3230                 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
3231                          length, rbd_dev->mapping.size);
3232                 result = -EIO;
3233                 goto err_rq;
3234         }
3235
3236         img_request = rbd_img_request_create(rbd_dev, offset, length, wr);
3237         if (!img_request) {
3238                 result = -ENOMEM;
3239                 goto err_rq;
3240         }
3241         img_request->rq = rq;
3242
3243         result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio);
3244         if (result)
3245                 goto err_img_request;
3246
3247         result = rbd_img_request_submit(img_request);
3248         if (result)
3249                 goto err_img_request;
3250
3251         return;
3252
3253 err_img_request:
3254         rbd_img_request_put(img_request);
3255 err_rq:
3256         if (result)
3257                 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
3258                          wr ? "write" : "read", length, offset, result);
3259         blk_end_request_all(rq, result);
3260 }
3261
3262 static void rbd_request_workfn(struct work_struct *work)
3263 {
3264         struct rbd_device *rbd_dev =
3265             container_of(work, struct rbd_device, rq_work);
3266         struct request *rq, *next;
3267         LIST_HEAD(requests);
3268
3269         spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
3270         list_splice_init(&rbd_dev->rq_queue, &requests);
3271         spin_unlock_irq(&rbd_dev->lock);
3272
3273         list_for_each_entry_safe(rq, next, &requests, queuelist) {
3274                 list_del_init(&rq->queuelist);
3275                 rbd_handle_request(rbd_dev, rq);
3276         }
3277 }
3278
3279 /*
3280  * Called with q->queue_lock held and interrupts disabled, possibly on
3281  * the way to schedule().  Do not sleep here!
3282  */
3283 static void rbd_request_fn(struct request_queue *q)
3284 {
3285         struct rbd_device *rbd_dev = q->queuedata;
3286         struct request *rq;
3287         int queued = 0;
3288
3289         rbd_assert(rbd_dev);
3290
3291         while ((rq = blk_fetch_request(q))) {
3292                 /* Ignore any non-FS requests that filter through. */
3293                 if (rq->cmd_type != REQ_TYPE_FS) {
3294                         dout("%s: non-fs request type %d\n", __func__,
3295                                 (int) rq->cmd_type);
3296                         __blk_end_request_all(rq, 0);
3297                         continue;
3298                 }
3299
3300                 list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
3301                 queued++;
3302         }
3303
3304         if (queued)
3305                 queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work);
3306 }
3307
3308 /*
3309  * a queue callback. Makes sure that we don't create a bio that spans across
3310  * multiple osd objects. One exception would be with a single page bios,
3311  * which we handle later at bio_chain_clone_range()
3312  */
3313 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3314                           struct bio_vec *bvec)
3315 {
3316         struct rbd_device *rbd_dev = q->queuedata;
3317         sector_t sector_offset;
3318         sector_t sectors_per_obj;
3319         sector_t obj_sector_offset;
3320         int ret;
3321
3322         /*
3323          * Find how far into its rbd object the partition-relative
3324          * bio start sector is to offset relative to the enclosing
3325          * device.
3326          */
3327         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3328         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3329         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3330
3331         /*
3332          * Compute the number of bytes from that offset to the end
3333          * of the object.  Account for what's already used by the bio.
3334          */
3335         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3336         if (ret > bmd->bi_size)
3337                 ret -= bmd->bi_size;
3338         else
3339                 ret = 0;
3340
3341         /*
3342          * Don't send back more than was asked for.  And if the bio
3343          * was empty, let the whole thing through because:  "Note
3344          * that a block device *must* allow a single page to be
3345          * added to an empty bio."
3346          */
3347         rbd_assert(bvec->bv_len <= PAGE_SIZE);
3348         if (ret > (int) bvec->bv_len || !bmd->bi_size)
3349                 ret = (int) bvec->bv_len;
3350
3351         return ret;
3352 }
3353
3354 static void rbd_free_disk(struct rbd_device *rbd_dev)
3355 {
3356         struct gendisk *disk = rbd_dev->disk;
3357
3358         if (!disk)
3359                 return;
3360
3361         rbd_dev->disk = NULL;
3362         if (disk->flags & GENHD_FL_UP) {
3363                 del_gendisk(disk);
3364             &n