drivers/block/zram/zram_drv.c
1 /*
2  * Compressed RAM block device
3  *
4  * Copyright (C) 2008, 2009, 2010  Nitin Gupta
5  *               2012, 2013 Minchan Kim
6  *
7  * This code is released using a dual license strategy: BSD/GPL
8  * You can choose the licence that better fits your requirements.
9  *
10  * Released under the terms of 3-clause BSD License
11  * Released under the terms of GNU General Public License Version 2.0
12  *
13  */
14
15 #define KMSG_COMPONENT "zram"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
17
18 #include <linux/module.h>
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/bitops.h>
22 #include <linux/blkdev.h>
23 #include <linux/buffer_head.h>
24 #include <linux/device.h>
25 #include <linux/genhd.h>
26 #include <linux/highmem.h>
27 #include <linux/slab.h>
28 #include <linux/backing-dev.h>
29 #include <linux/string.h>
30 #include <linux/vmalloc.h>
31 #include <linux/err.h>
32 #include <linux/idr.h>
33 #include <linux/sysfs.h>
34 #include <linux/cpuhotplug.h>
35
36 #include "zram_drv.h"
37
38 static DEFINE_IDR(zram_index_idr);
39 /* the idr index must be protected by zram_index_mutex */
40 static DEFINE_MUTEX(zram_index_mutex);
41
42 static int zram_major;
43 static const char *default_compressor = "lzo";
44
45 /* Module params (documentation at end) */
46 static unsigned int num_devices = 1;
47
48 static void zram_free_page(struct zram *zram, size_t index);
49
50 static inline bool init_done(struct zram *zram)
51 {
52         return zram->disksize;
53 }
54
55 static inline struct zram *dev_to_zram(struct device *dev)
56 {
57         return (struct zram *)dev_to_disk(dev)->private_data;
58 }
59
60 static unsigned long zram_get_handle(struct zram *zram, u32 index)
61 {
62         return zram->table[index].handle;
63 }
64
65 static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
66 {
67         zram->table[index].handle = handle;
68 }
69
70 /* flag operations require the table entry's bit_spin_lock() to be held */
71 static int zram_test_flag(struct zram *zram, u32 index,
72                         enum zram_pageflags flag)
73 {
74         return zram->table[index].value & BIT(flag);
75 }
76
77 static void zram_set_flag(struct zram *zram, u32 index,
78                         enum zram_pageflags flag)
79 {
80         zram->table[index].value |= BIT(flag);
81 }
82
83 static void zram_clear_flag(struct zram *zram, u32 index,
84                         enum zram_pageflags flag)
85 {
86         zram->table[index].value &= ~BIT(flag);
87 }
88
89 static inline void zram_set_element(struct zram *zram, u32 index,
90                         unsigned long element)
91 {
92         zram->table[index].element = element;
93 }
94
95 static unsigned long zram_get_element(struct zram *zram, u32 index)
96 {
97         return zram->table[index].element;
98 }
99
100 static size_t zram_get_obj_size(struct zram *zram, u32 index)
101 {
102         return zram->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
103 }
104
105 static void zram_set_obj_size(struct zram *zram,
106                                         u32 index, size_t size)
107 {
108         unsigned long flags = zram->table[index].value >> ZRAM_FLAG_SHIFT;
109
110         zram->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
111 }
112
113 #if PAGE_SIZE != 4096
114 static inline bool is_partial_io(struct bio_vec *bvec)
115 {
116         return bvec->bv_len != PAGE_SIZE;
117 }
118 #else
119 static inline bool is_partial_io(struct bio_vec *bvec)
120 {
121         return false;
122 }
123 #endif
124
125 /*
126  * Check if request is within bounds and aligned on zram logical blocks.
127  */
128 static inline bool valid_io_request(struct zram *zram,
129                 sector_t start, unsigned int size)
130 {
131         u64 end, bound;
132
133         /* unaligned request */
134         if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
135                 return false;
136         if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
137                 return false;
138
139         end = start + (size >> SECTOR_SHIFT);
140         bound = zram->disksize >> SECTOR_SHIFT;
141         /* out of range */
142         if (unlikely(start >= bound || end > bound || start > end))
143                 return false;
144
145         /* I/O request is valid */
146         return true;
147 }
148
149 static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
150 {
151         *index  += (*offset + bvec->bv_len) / PAGE_SIZE;
152         *offset = (*offset + bvec->bv_len) % PAGE_SIZE;
153 }
154
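/*
 * Racelessly raise stats.max_used_pages to @pages: retry the cmpxchg
 * loop until either the recorded maximum is already at least @pages or
 * the new, larger value has been stored.
 */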
155 static inline void update_used_max(struct zram *zram,
156                                         const unsigned long pages)
157 {
158         unsigned long old_max, cur_max;
159
160         old_max = atomic_long_read(&zram->stats.max_used_pages);
161
162         do {
163                 cur_max = old_max;
164                 if (pages > cur_max)
165                         old_max = atomic_long_cmpxchg(
166                                 &zram->stats.max_used_pages, cur_max, pages);
167         } while (old_max != cur_max);
168 }
169
170 static inline void zram_fill_page(void *ptr, unsigned long len,
171                                         unsigned long value)
172 {
173         WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
174         memset_l(ptr, value, len / sizeof(unsigned long));
175 }
176
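/*
 * Return true if the page is filled with a single repeated word-sized
 * value (e.g. an all-zero page) and report that value via @element, so
 * the page can be stored as a flag plus element instead of compressed
 * data.
 */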
177 static bool page_same_filled(void *ptr, unsigned long *element)
178 {
179         unsigned int pos;
180         unsigned long *page;
181         unsigned long val;
182
183         page = (unsigned long *)ptr;
184         val = page[0];
185
186         for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
187                 if (val != page[pos])
188                         return false;
189         }
190
191         *element = val;
192
193         return true;
194 }
195
196 static ssize_t initstate_show(struct device *dev,
197                 struct device_attribute *attr, char *buf)
198 {
199         u32 val;
200         struct zram *zram = dev_to_zram(dev);
201
202         down_read(&zram->init_lock);
203         val = init_done(zram);
204         up_read(&zram->init_lock);
205
206         return scnprintf(buf, PAGE_SIZE, "%u\n", val);
207 }
208
209 static ssize_t disksize_show(struct device *dev,
210                 struct device_attribute *attr, char *buf)
211 {
212         struct zram *zram = dev_to_zram(dev);
213
214         return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
215 }
216
217 static ssize_t mem_limit_store(struct device *dev,
218                 struct device_attribute *attr, const char *buf, size_t len)
219 {
220         u64 limit;
221         char *tmp;
222         struct zram *zram = dev_to_zram(dev);
223
224         limit = memparse(buf, &tmp);
225         if (buf == tmp) /* no chars parsed, invalid input */
226                 return -EINVAL;
227
228         down_write(&zram->init_lock);
229         zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
230         up_write(&zram->init_lock);
231
232         return len;
233 }
234
235 static ssize_t mem_used_max_store(struct device *dev,
236                 struct device_attribute *attr, const char *buf, size_t len)
237 {
238         int err;
239         unsigned long val;
240         struct zram *zram = dev_to_zram(dev);
241
242         err = kstrtoul(buf, 10, &val);
243         if (err || val != 0)
244                 return -EINVAL;
245
246         down_read(&zram->init_lock);
247         if (init_done(zram)) {
248                 atomic_long_set(&zram->stats.max_used_pages,
249                                 zs_get_total_pages(zram->mem_pool));
250         }
251         up_read(&zram->init_lock);
252
253         return len;
254 }
255
256 #ifdef CONFIG_ZRAM_WRITEBACK
257 static bool zram_wb_enabled(struct zram *zram)
258 {
259         return zram->backing_dev;
260 }
261
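/*
 * Undo the backing device setup: restore the original block size,
 * release the exclusively-held block device, close the backing file and
 * free the allocation bitmap.
 */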
262 static void reset_bdev(struct zram *zram)
263 {
264         struct block_device *bdev;
265
266         if (!zram_wb_enabled(zram))
267                 return;
268
269         bdev = zram->bdev;
270         if (zram->old_block_size)
271                 set_blocksize(bdev, zram->old_block_size);
272         blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
273         /* filp_close() is expected to flush all outstanding I/O */
274         filp_close(zram->backing_dev, NULL);
275         zram->backing_dev = NULL;
276         zram->old_block_size = 0;
277         zram->bdev = NULL;
278
279         kvfree(zram->bitmap);
280         zram->bitmap = NULL;
281 }
282
283 static ssize_t backing_dev_show(struct device *dev,
284                 struct device_attribute *attr, char *buf)
285 {
286         struct zram *zram = dev_to_zram(dev);
287         struct file *file = zram->backing_dev;
288         char *p;
289         ssize_t ret;
290
291         down_read(&zram->init_lock);
292         if (!zram_wb_enabled(zram)) {
293                 memcpy(buf, "none\n", 5);
294                 up_read(&zram->init_lock);
295                 return 5;
296         }
297
298         p = file_path(file, buf, PAGE_SIZE - 1);
299         if (IS_ERR(p)) {
300                 ret = PTR_ERR(p);
301                 goto out;
302         }
303
304         ret = strlen(p);
305         memmove(buf, p, ret);
306         buf[ret++] = '\n';
307 out:
308         up_read(&zram->init_lock);
309         return ret;
310 }
311
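/*
 * Set up the writeback backing device from the user-supplied path. The
 * path must name a block device; it is claimed exclusively, its block
 * size is switched to PAGE_SIZE and a bitmap tracking used backing pages
 * is allocated. Only allowed while the zram device is uninitialized.
 */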
312 static ssize_t backing_dev_store(struct device *dev,
313                 struct device_attribute *attr, const char *buf, size_t len)
314 {
315         char *file_name;
316         struct file *backing_dev = NULL;
317         struct inode *inode;
318         struct address_space *mapping;
319         unsigned int bitmap_sz, old_block_size = 0;
320         unsigned long nr_pages, *bitmap = NULL;
321         struct block_device *bdev = NULL;
322         int err;
323         struct zram *zram = dev_to_zram(dev);
324
325         file_name = kmalloc(PATH_MAX, GFP_KERNEL);
326         if (!file_name)
327                 return -ENOMEM;
328
329         down_write(&zram->init_lock);
330         if (init_done(zram)) {
331                 pr_info("Can't setup backing device for initialized device\n");
332                 err = -EBUSY;
333                 goto out;
334         }
335
336         strlcpy(file_name, buf, len);
337
338         backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
339         if (IS_ERR(backing_dev)) {
340                 err = PTR_ERR(backing_dev);
341                 backing_dev = NULL;
342                 goto out;
343         }
344
345         mapping = backing_dev->f_mapping;
346         inode = mapping->host;
347
348         /* Only block devices are supported at the moment */
349         if (!S_ISBLK(inode->i_mode)) {
350                 err = -ENOTBLK;
351                 goto out;
352         }
353
354         bdev = bdgrab(I_BDEV(inode));
355         err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
356         if (err < 0)
357                 goto out;
358
359         nr_pages = i_size_read(inode) >> PAGE_SHIFT;
360         bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
361         bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
362         if (!bitmap) {
363                 err = -ENOMEM;
364                 goto out;
365         }
366
367         old_block_size = block_size(bdev);
368         err = set_blocksize(bdev, PAGE_SIZE);
369         if (err)
370                 goto out;
371
372         reset_bdev(zram);
373         spin_lock_init(&zram->bitmap_lock);
374
375         zram->old_block_size = old_block_size;
376         zram->bdev = bdev;
377         zram->backing_dev = backing_dev;
378         zram->bitmap = bitmap;
379         zram->nr_pages = nr_pages;
380         up_write(&zram->init_lock);
381
382         pr_info("setup backing device %s\n", file_name);
383         kfree(file_name);
384
385         return len;
386 out:
387         if (bitmap)
388                 kvfree(bitmap);
389
390         if (bdev)
391                 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
392
393         if (backing_dev)
394                 filp_close(backing_dev, NULL);
395
396         up_write(&zram->init_lock);
397
398         kfree(file_name);
399
400         return err;
401 }
402
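/*
 * Allocate a free page slot on the backing device by setting the first
 * clear bit in the bitmap. Returns the slot number, or 0 if the backing
 * device is full (slot 0 is never handed out).
 */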
403 static unsigned long get_entry_bdev(struct zram *zram)
404 {
405         unsigned long entry;
406
407         spin_lock(&zram->bitmap_lock);
408         /* skip bit 0 so a valid entry is never confused with zram.handle == 0 */
409         entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1);
410         if (entry == zram->nr_pages) {
411                 spin_unlock(&zram->bitmap_lock);
412                 return 0;
413         }
414
415         set_bit(entry, zram->bitmap);
416         spin_unlock(&zram->bitmap_lock);
417
418         return entry;
419 }
420
421 static void put_entry_bdev(struct zram *zram, unsigned long entry)
422 {
423         int was_set;
424
425         spin_lock(&zram->bitmap_lock);
426         was_set = test_and_clear_bit(entry, zram->bitmap);
427         spin_unlock(&zram->bitmap_lock);
428         WARN_ON_ONCE(!was_set);
429 }
430
431 static void zram_page_end_io(struct bio *bio)
432 {
433         struct page *page = bio_first_page_all(bio);
434
435         page_endio(page, op_is_write(bio_op(bio)),
436                         blk_status_to_errno(bio->bi_status));
437         bio_put(bio);
438 }
439
440 /*
441  * Returns 1 if the submission is successful.
442  */
443 static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
444                         unsigned long entry, struct bio *parent)
445 {
446         struct bio *bio;
447
448         bio = bio_alloc(GFP_ATOMIC, 1);
449         if (!bio)
450                 return -ENOMEM;
451
452         bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
453         bio_set_dev(bio, zram->bdev);
454         if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
455                 bio_put(bio);
456                 return -EIO;
457         }
458
459         if (!parent) {
460                 bio->bi_opf = REQ_OP_READ;
461                 bio->bi_end_io = zram_page_end_io;
462         } else {
463                 bio->bi_opf = parent->bi_opf;
464                 bio_chain(bio, parent);
465         }
466
467         submit_bio(bio);
468         return 1;
469 }
470
471 struct zram_work {
472         struct work_struct work;
473         struct zram *zram;
474         unsigned long entry;
475         struct bio *bio;
476 };
477
478 #if PAGE_SIZE != 4096
479 static void zram_sync_read(struct work_struct *work)
480 {
481         struct bio_vec bvec;
482         struct zram_work *zw = container_of(work, struct zram_work, work);
483         struct zram *zram = zw->zram;
484         unsigned long entry = zw->entry;
485         struct bio *bio = zw->bio;
486
487         read_from_bdev_async(zram, &bvec, entry, bio);
488 }
489
490 /*
491  * The block layer wants one ->make_request_fn to be active at a time,
492  * so chaining this IO to the parent IO from the same context would
493  * deadlock. To avoid that, the read is issued from a worker thread.
494  */
495 static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
496                                 unsigned long entry, struct bio *bio)
497 {
498         struct zram_work work;
499
500         work.zram = zram;
501         work.entry = entry;
502         work.bio = bio;
503
504         INIT_WORK_ONSTACK(&work.work, zram_sync_read);
505         queue_work(system_unbound_wq, &work.work);
506         flush_work(&work.work);
507         destroy_work_on_stack(&work.work);
508
509         return 1;
510 }
511 #else
512 static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
513                                 unsigned long entry, struct bio *bio)
514 {
515         WARN_ON(1);
516         return -EIO;
517 }
518 #endif
519
520 static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
521                         unsigned long entry, struct bio *parent, bool sync)
522 {
523         if (sync)
524                 return read_from_bdev_sync(zram, bvec, entry, parent);
525         else
526                 return read_from_bdev_async(zram, bvec, entry, parent);
527 }
528
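/*
 * Write one page to the backing device: allocate a backing slot, build a
 * one-page bio for it and submit, either standalone (completed by
 * zram_page_end_io) or chained to @parent. On success the slot number is
 * returned via @pentry.
 */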
529 static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
530                                         u32 index, struct bio *parent,
531                                         unsigned long *pentry)
532 {
533         struct bio *bio;
534         unsigned long entry;
535
536         bio = bio_alloc(GFP_ATOMIC, 1);
537         if (!bio)
538                 return -ENOMEM;
539
540         entry = get_entry_bdev(zram);
541         if (!entry) {
542                 bio_put(bio);
543                 return -ENOSPC;
544         }
545
546         bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
547         bio_set_dev(bio, zram->bdev);
548         if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len,
549                                         bvec->bv_offset)) {
550                 bio_put(bio);
551                 put_entry_bdev(zram, entry);
552                 return -EIO;
553         }
554
555         if (!parent) {
556                 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
557                 bio->bi_end_io = zram_page_end_io;
558         } else {
559                 bio->bi_opf = parent->bi_opf;
560                 bio_chain(bio, parent);
561         }
562
563         submit_bio(bio);
564         *pentry = entry;
565
566         return 0;
567 }
568
569 static void zram_wb_clear(struct zram *zram, u32 index)
570 {
571         unsigned long entry;
572
573         zram_clear_flag(zram, index, ZRAM_WB);
574         entry = zram_get_element(zram, index);
575         zram_set_element(zram, index, 0);
576         put_entry_bdev(zram, entry);
577 }
578
579 #else
580 static bool zram_wb_enabled(struct zram *zram) { return false; }
581 static inline void reset_bdev(struct zram *zram) {};
582 static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
583                                         u32 index, struct bio *parent,
584                                         unsigned long *pentry)
585
586 {
587         return -EIO;
588 }
589
590 static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
591                         unsigned long entry, struct bio *parent, bool sync)
592 {
593         return -EIO;
594 }
595 static void zram_wb_clear(struct zram *zram, u32 index) {}
596 #endif
597
598
599 /*
600  * We switched to per-cpu streams and this attr is not needed anymore.
601  * However, we will keep it around for some time, because:
602  * a) we may revert per-cpu streams in the future
603  * b) it's visible to user space and we need to follow our 2-year
604  *    retirement rule; but we already have a number of 'soon to be
605  *    altered' attrs, so max_comp_streams needs to wait for the next
606  *    layoff cycle.
607  */
608 static ssize_t max_comp_streams_show(struct device *dev,
609                 struct device_attribute *attr, char *buf)
610 {
611         return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
612 }
613
614 static ssize_t max_comp_streams_store(struct device *dev,
615                 struct device_attribute *attr, const char *buf, size_t len)
616 {
617         return len;
618 }
619
620 static ssize_t comp_algorithm_show(struct device *dev,
621                 struct device_attribute *attr, char *buf)
622 {
623         size_t sz;
624         struct zram *zram = dev_to_zram(dev);
625
626         down_read(&zram->init_lock);
627         sz = zcomp_available_show(zram->compressor, buf);
628         up_read(&zram->init_lock);
629
630         return sz;
631 }
632
633 static ssize_t comp_algorithm_store(struct device *dev,
634                 struct device_attribute *attr, const char *buf, size_t len)
635 {
636         struct zram *zram = dev_to_zram(dev);
637         char compressor[ARRAY_SIZE(zram->compressor)];
638         size_t sz;
639
640         strlcpy(compressor, buf, sizeof(compressor));
641         /* ignore trailing newline */
642         sz = strlen(compressor);
643         if (sz > 0 && compressor[sz - 1] == '\n')
644                 compressor[sz - 1] = 0x00;
645
646         if (!zcomp_available_algorithm(compressor))
647                 return -EINVAL;
648
649         down_write(&zram->init_lock);
650         if (init_done(zram)) {
651                 up_write(&zram->init_lock);
652                 pr_info("Can't change algorithm for initialized device\n");
653                 return -EBUSY;
654         }
655
656         strcpy(zram->compressor, compressor);
657         up_write(&zram->init_lock);
658         return len;
659 }
660
661 static ssize_t compact_store(struct device *dev,
662                 struct device_attribute *attr, const char *buf, size_t len)
663 {
664         struct zram *zram = dev_to_zram(dev);
665
666         down_read(&zram->init_lock);
667         if (!init_done(zram)) {
668                 up_read(&zram->init_lock);
669                 return -EINVAL;
670         }
671
672         zs_compact(zram->mem_pool);
673         up_read(&zram->init_lock);
674
675         return len;
676 }
677
678 static ssize_t io_stat_show(struct device *dev,
679                 struct device_attribute *attr, char *buf)
680 {
681         struct zram *zram = dev_to_zram(dev);
682         ssize_t ret;
683
684         down_read(&zram->init_lock);
685         ret = scnprintf(buf, PAGE_SIZE,
686                         "%8llu %8llu %8llu %8llu\n",
687                         (u64)atomic64_read(&zram->stats.failed_reads),
688                         (u64)atomic64_read(&zram->stats.failed_writes),
689                         (u64)atomic64_read(&zram->stats.invalid_io),
690                         (u64)atomic64_read(&zram->stats.notify_free));
691         up_read(&zram->init_lock);
692
693         return ret;
694 }
695
696 static ssize_t mm_stat_show(struct device *dev,
697                 struct device_attribute *attr, char *buf)
698 {
699         struct zram *zram = dev_to_zram(dev);
700         struct zs_pool_stats pool_stats;
701         u64 orig_size, mem_used = 0;
702         long max_used;
703         ssize_t ret;
704
705         memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
706
707         down_read(&zram->init_lock);
708         if (init_done(zram)) {
709                 mem_used = zs_get_total_pages(zram->mem_pool);
710                 zs_pool_stats(zram->mem_pool, &pool_stats);
711         }
712
713         orig_size = atomic64_read(&zram->stats.pages_stored);
714         max_used = atomic_long_read(&zram->stats.max_used_pages);
715
716         ret = scnprintf(buf, PAGE_SIZE,
717                         "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
718                         orig_size << PAGE_SHIFT,
719                         (u64)atomic64_read(&zram->stats.compr_data_size),
720                         mem_used << PAGE_SHIFT,
721                         zram->limit_pages << PAGE_SHIFT,
722                         max_used << PAGE_SHIFT,
723                         (u64)atomic64_read(&zram->stats.same_pages),
724                         pool_stats.pages_compacted);
725         up_read(&zram->init_lock);
726
727         return ret;
728 }
729
730 static ssize_t debug_stat_show(struct device *dev,
731                 struct device_attribute *attr, char *buf)
732 {
733         int version = 1;
734         struct zram *zram = dev_to_zram(dev);
735         ssize_t ret;
736
737         down_read(&zram->init_lock);
738         ret = scnprintf(buf, PAGE_SIZE,
739                         "version: %d\n%8llu\n",
740                         version,
741                         (u64)atomic64_read(&zram->stats.writestall));
742         up_read(&zram->init_lock);
743
744         return ret;
745 }
746
747 static DEVICE_ATTR_RO(io_stat);
748 static DEVICE_ATTR_RO(mm_stat);
749 static DEVICE_ATTR_RO(debug_stat);
750
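/*
 * Per-slot locking: the ZRAM_ACCESS bit in table[index].value is used as
 * a bit spinlock serializing all access to that table entry.
 */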
751 static void zram_slot_lock(struct zram *zram, u32 index)
752 {
753         bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
754 }
755
756 static void zram_slot_unlock(struct zram *zram, u32 index)
757 {
758         bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
759 }
760
761 static void zram_meta_free(struct zram *zram, u64 disksize)
762 {
763         size_t num_pages = disksize >> PAGE_SHIFT;
764         size_t index;
765
766         /* Free all pages that are still in this zram device */
767         for (index = 0; index < num_pages; index++)
768                 zram_free_page(zram, index);
769
770         zs_destroy_pool(zram->mem_pool);
771         vfree(zram->table);
772 }
773
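/*
 * Allocate the per-page metadata table and the zsmalloc pool that backs
 * the device. Returns false (after freeing the table) on failure.
 */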
774 static bool zram_meta_alloc(struct zram *zram, u64 disksize)
775 {
776         size_t num_pages;
777
778         num_pages = disksize >> PAGE_SHIFT;
779         zram->table = vzalloc(num_pages * sizeof(*zram->table));
780         if (!zram->table)
781                 return false;
782
783         zram->mem_pool = zs_create_pool(zram->disk->disk_name);
784         if (!zram->mem_pool) {
785                 vfree(zram->table);
786                 return false;
787         }
788
789         return true;
790 }
791
792 /*
793  * To protect concurrent access to the same index entry, the caller
794  * should hold the table entry's bit_spinlock to indicate that the
795  * entry is being accessed.
796  */
797 static void zram_free_page(struct zram *zram, size_t index)
798 {
799         unsigned long handle;
800
801         if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) {
802                 zram_wb_clear(zram, index);
803                 atomic64_dec(&zram->stats.pages_stored);
804                 return;
805         }
806
807         /*
808          * No memory is allocated for same-element-filled pages.
809          * Simply clear the ZRAM_SAME flag.
810          */
811         if (zram_test_flag(zram, index, ZRAM_SAME)) {
812                 zram_clear_flag(zram, index, ZRAM_SAME);
813                 zram_set_element(zram, index, 0);
814                 atomic64_dec(&zram->stats.same_pages);
815                 atomic64_dec(&zram->stats.pages_stored);
816                 return;
817         }
818
819         handle = zram_get_handle(zram, index);
820         if (!handle)
821                 return;
822
823         zs_free(zram->mem_pool, handle);
824
825         atomic64_sub(zram_get_obj_size(zram, index),
826                         &zram->stats.compr_data_size);
827         atomic64_dec(&zram->stats.pages_stored);
828
829         zram_set_handle(zram, index, 0);
830         zram_set_obj_size(zram, index, 0);
831 }
832
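/*
 * Read the page at @index into @page. Slots written back to the backing
 * device are re-read from there; same-filled slots are reconstructed by
 * filling the page with the stored element; everything else is copied or
 * decompressed out of the zsmalloc pool.
 */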
833 static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
834                                 struct bio *bio, bool partial_io)
835 {
836         int ret;
837         unsigned long handle;
838         unsigned int size;
839         void *src, *dst;
840
841         if (zram_wb_enabled(zram)) {
842                 zram_slot_lock(zram, index);
843                 if (zram_test_flag(zram, index, ZRAM_WB)) {
844                         struct bio_vec bvec;
845
846                         zram_slot_unlock(zram, index);
847
848                         bvec.bv_page = page;
849                         bvec.bv_len = PAGE_SIZE;
850                         bvec.bv_offset = 0;
851                         return read_from_bdev(zram, &bvec,
852                                         zram_get_element(zram, index),
853                                         bio, partial_io);
854                 }
855                 zram_slot_unlock(zram, index);
856         }
857
858         zram_slot_lock(zram, index);
859         handle = zram_get_handle(zram, index);
860         if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) {
861                 unsigned long value;
862                 void *mem;
863
864                 value = handle ? zram_get_element(zram, index) : 0;
865                 mem = kmap_atomic(page);
866                 zram_fill_page(mem, PAGE_SIZE, value);
867                 kunmap_atomic(mem);
868                 zram_slot_unlock(zram, index);
869                 return 0;
870         }
871
872         size = zram_get_obj_size(zram, index);
873
874         src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
875         if (size == PAGE_SIZE) {
876                 dst = kmap_atomic(page);
877                 memcpy(dst, src, PAGE_SIZE);
878                 kunmap_atomic(dst);
879                 ret = 0;
880         } else {
881                 struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
882
883                 dst = kmap_atomic(page);
884                 ret = zcomp_decompress(zstrm, src, size, dst);
885                 kunmap_atomic(dst);
886                 zcomp_stream_put(zram->comp);
887         }
888         zs_unmap_object(zram->mem_pool, handle);
889         zram_slot_unlock(zram, index);
890
891         /* Should NEVER happen. Return bio error if it does. */
892         if (unlikely(ret))
893                 pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
894
895         return ret;
896 }
897
898 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
899                                 u32 index, int offset, struct bio *bio)
900 {
901         int ret;
902         struct page *page;
903
904         page = bvec->bv_page;
905         if (is_partial_io(bvec)) {
906                 /* Use a temporary buffer to decompress the page */
907                 page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
908                 if (!page)
909                         return -ENOMEM;
910         }
911
912         ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
913         if (unlikely(ret))
914                 goto out;
915
916         if (is_partial_io(bvec)) {
917                 void *dst = kmap_atomic(bvec->bv_page);
918                 void *src = kmap_atomic(page);
919
920                 memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len);
921                 kunmap_atomic(src);
922                 kunmap_atomic(dst);
923         }
924 out:
925         if (is_partial_io(bvec))
926                 __free_page(page);
927
928         return ret;
929 }
930
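/*
 * Store a full page at @index. Same-filled pages are recorded as a flag
 * plus element without allocating memory; pages that do not compress
 * well may be written to the backing device (if one is configured);
 * everything else is stored (compressed, or as a whole page if it
 * compresses poorly) in the zsmalloc pool.
 */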
931 static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
932                                 u32 index, struct bio *bio)
933 {
934         int ret = 0;
935         unsigned long alloced_pages;
936         unsigned long handle = 0;
937         unsigned int comp_len = 0;
938         void *src, *dst, *mem;
939         struct zcomp_strm *zstrm;
940         struct page *page = bvec->bv_page;
941         unsigned long element = 0;
942         enum zram_pageflags flags = 0;
943         bool allow_wb = true;
944
945         mem = kmap_atomic(page);
946         if (page_same_filled(mem, &element)) {
947                 kunmap_atomic(mem);
948                 /* Free memory associated with this sector now. */
949                 flags = ZRAM_SAME;
950                 atomic64_inc(&zram->stats.same_pages);
951                 goto out;
952         }
953         kunmap_atomic(mem);
954
955 compress_again:
956         zstrm = zcomp_stream_get(zram->comp);
957         src = kmap_atomic(page);
958         ret = zcomp_compress(zstrm, src, &comp_len);
959         kunmap_atomic(src);
960
961         if (unlikely(ret)) {
962                 zcomp_stream_put(zram->comp);
963                 pr_err("Compression failed! err=%d\n", ret);
964                 zs_free(zram->mem_pool, handle);
965                 return ret;
966         }
967
968         if (unlikely(comp_len > max_zpage_size)) {
969                 if (zram_wb_enabled(zram) && allow_wb) {
970                         zcomp_stream_put(zram->comp);
971                         ret = write_to_bdev(zram, bvec, index, bio, &element);
972                         if (!ret) {
973                                 flags = ZRAM_WB;
974                                 ret = 1;
975                                 goto out;
976                         }
977                         allow_wb = false;
978                         goto compress_again;
979                 }
980                 comp_len = PAGE_SIZE;
981         }
982
983         /*
984          * handle allocation has 2 paths:
985          * a) fast path is executed with preemption disabled (for
986          *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
987          *  since we can't sleep;
988          * b) slow path enables preemption and attempts to allocate
989          *  the page with __GFP_DIRECT_RECLAIM bit set. we have to
990  *  the page with __GFP_DIRECT_RECLAIM bit set. We have to
991  *  put the per-cpu compression stream and, thus, re-do
992          *
993          * if we have a 'non-null' handle here then we are coming
994          * from the slow path and handle has already been allocated.
995          */
996         if (!handle)
997                 handle = zs_malloc(zram->mem_pool, comp_len,
998                                 __GFP_KSWAPD_RECLAIM |
999                                 __GFP_NOWARN |
1000                                 __GFP_HIGHMEM |
1001                                 __GFP_MOVABLE);
1002         if (!handle) {
1003                 zcomp_stream_put(zram->comp);
1004                 atomic64_inc(&zram->stats.writestall);
1005                 handle = zs_malloc(zram->mem_pool, comp_len,
1006                                 GFP_NOIO | __GFP_HIGHMEM |
1007                                 __GFP_MOVABLE);
1008                 if (handle)
1009                         goto compress_again;
1010                 return -ENOMEM;
1011         }
1012
1013         alloced_pages = zs_get_total_pages(zram->mem_pool);
1014         update_used_max(zram, alloced_pages);
1015
1016         if (zram->limit_pages && alloced_pages > zram->limit_pages) {
1017                 zcomp_stream_put(zram->comp);
1018                 zs_free(zram->mem_pool, handle);
1019                 return -ENOMEM;
1020         }
1021
1022         dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
1023
1024         src = zstrm->buffer;
1025         if (comp_len == PAGE_SIZE)
1026                 src = kmap_atomic(page);
1027         memcpy(dst, src, comp_len);
1028         if (comp_len == PAGE_SIZE)
1029                 kunmap_atomic(src);
1030
1031         zcomp_stream_put(zram->comp);
1032         zs_unmap_object(zram->mem_pool, handle);
1033         atomic64_add(comp_len, &zram->stats.compr_data_size);
1034 out:
1035         /*
1036          * Free memory associated with this sector
1037          * before overwriting unused sectors.
1038          */
1039         zram_slot_lock(zram, index);
1040         zram_free_page(zram, index);
1041
1042         if (flags) {
1043                 zram_set_flag(zram, index, flags);
1044                 zram_set_element(zram, index, element);
1045         }  else {
1046                 zram_set_handle(zram, index, handle);
1047                 zram_set_obj_size(zram, index, comp_len);
1048         }
1049         zram_slot_unlock(zram, index);
1050
1051         /* Update stats */
1052         atomic64_inc(&zram->stats.pages_stored);
1053         return ret;
1054 }
1055
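/*
 * Write handler for a single bvec. Partial writes are turned into a
 * read-modify-write cycle on a temporary page so that
 * __zram_bvec_write() always sees a full page.
 */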
1056 static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
1057                                 u32 index, int offset, struct bio *bio)
1058 {
1059         int ret;
1060         struct page *page = NULL;
1061         void *src;
1062         struct bio_vec vec;
1063
1064         vec = *bvec;
1065         if (is_partial_io(bvec)) {
1066                 void *dst;
1067                 /*
1068                  * This is a partial IO. We need to read the full page
1069                  * before to write the changes.
1070                  */
1071                 page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
1072                 if (!page)
1073                         return -ENOMEM;
1074
1075                 ret = __zram_bvec_read(zram, page, index, bio, true);
1076                 if (ret)
1077                         goto out;
1078
1079                 src = kmap_atomic(bvec->bv_page);
1080                 dst = kmap_atomic(page);
1081                 memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len);
1082                 kunmap_atomic(dst);
1083                 kunmap_atomic(src);
1084
1085                 vec.bv_page = page;
1086                 vec.bv_len = PAGE_SIZE;
1087                 vec.bv_offset = 0;
1088         }
1089
1090         ret = __zram_bvec_write(zram, &vec, index, bio);
1091 out:
1092         if (is_partial_io(bvec))
1093                 __free_page(page);
1094         return ret;
1095 }
1096
1097 /*
1098  * zram_bio_discard - handler on discard request
1099  * @index: physical block index in PAGE_SIZE units
1100  * @offset: byte offset within physical block
1101  */
1102 static void zram_bio_discard(struct zram *zram, u32 index,
1103                              int offset, struct bio *bio)
1104 {
1105         size_t n = bio->bi_iter.bi_size;
1106
1107         /*
1108          * zram manages data in physical block size units. Because logical block
1109          * size isn't identical to physical block size on some architectures, we
1110          * could get a discard request pointing to a specific offset within a
1111          * certain physical block.  Although we can handle this request by
1112          * reading that physical block and decompressing and partially zeroing
1113          * and re-compressing and then re-storing it, this isn't reasonable
1114          * because our intent with a discard request is to save memory.  So
1115          * skipping this logical block is appropriate here.
1116          */
1117         if (offset) {
1118                 if (n <= (PAGE_SIZE - offset))
1119                         return;
1120
1121                 n -= (PAGE_SIZE - offset);
1122                 index++;
1123         }
1124
1125         while (n >= PAGE_SIZE) {
1126                 zram_slot_lock(zram, index);
1127                 zram_free_page(zram, index);
1128                 zram_slot_unlock(zram, index);
1129                 atomic64_inc(&zram->stats.notify_free);
1130                 index++;
1131                 n -= PAGE_SIZE;
1132         }
1133 }
1134
1135 /*
1136  * Returns a negative errno if there is a problem. Otherwise returns 0 or 1.
1137  * Returns 0 if IO request was done synchronously
1138  * Returns 1 if IO request was successfully submitted.
1139  */
1140 static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
1141                         int offset, bool is_write, struct bio *bio)
1142 {
1143         unsigned long start_time = jiffies;
1144         int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
1145         struct request_queue *q = zram->disk->queue;
1146         int ret;
1147
1148         generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
1149                         &zram->disk->part0);
1150
1151         if (!is_write) {
1152                 atomic64_inc(&zram->stats.num_reads);
1153                 ret = zram_bvec_read(zram, bvec, index, offset, bio);
1154                 flush_dcache_page(bvec->bv_page);
1155         } else {
1156                 atomic64_inc(&zram->stats.num_writes);
1157                 ret = zram_bvec_write(zram, bvec, index, offset, bio);
1158         }
1159
1160         generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
1161
1162         if (unlikely(ret < 0)) {
1163                 if (!is_write)
1164                         atomic64_inc(&zram->stats.failed_reads);
1165                 else
1166                         atomic64_inc(&zram->stats.failed_writes);
1167         }
1168
1169         return ret;
1170 }
1171
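/*
 * Split the bio into chunks that never cross a page boundary and feed
 * them to zram_bvec_rw(). Discard and write-zeroes requests are handled
 * separately by zram_bio_discard().
 */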
1172 static void __zram_make_request(struct zram *zram, struct bio *bio)
1173 {
1174         int offset;
1175         u32 index;
1176         struct bio_vec bvec;
1177         struct bvec_iter iter;
1178
1179         index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1180         offset = (bio->bi_iter.bi_sector &
1181                   (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
1182
1183         switch (bio_op(bio)) {
1184         case REQ_OP_DISCARD:
1185         case REQ_OP_WRITE_ZEROES:
1186                 zram_bio_discard(zram, index, offset, bio);
1187                 bio_endio(bio);
1188                 return;
1189         default:
1190                 break;
1191         }
1192
1193         bio_for_each_segment(bvec, bio, iter) {
1194                 struct bio_vec bv = bvec;
1195                 unsigned int unwritten = bvec.bv_len;
1196
1197                 do {
1198                         bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
1199                                                         unwritten);
1200                         if (zram_bvec_rw(zram, &bv, index, offset,
1201                                         op_is_write(bio_op(bio)), bio) < 0)
1202                                 goto out;
1203
1204                         bv.bv_offset += bv.bv_len;
1205                         unwritten -= bv.bv_len;
1206
1207                         update_position(&index, &offset, &bv);
1208                 } while (unwritten);
1209         }
1210
1211         bio_endio(bio);
1212         return;
1213
1214 out:
1215         bio_io_error(bio);
1216 }
1217
1218 /*
1219  * Handler function for all zram I/O requests.
1220  */
1221 static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
1222 {
1223         struct zram *zram = queue->queuedata;
1224
1225         if (!valid_io_request(zram, bio->bi_iter.bi_sector,
1226                                         bio->bi_iter.bi_size)) {
1227                 atomic64_inc(&zram->stats.invalid_io);
1228                 goto error;
1229         }
1230
1231         __zram_make_request(zram, bio);
1232         return BLK_QC_T_NONE;
1233
1234 error:
1235         bio_io_error(bio);
1236         return BLK_QC_T_NONE;
1237 }
1238
1239 static void zram_slot_free_notify(struct block_device *bdev,
1240                                 unsigned long index)
1241 {
1242         struct zram *zram;
1243
1244         zram = bdev->bd_disk->private_data;
1245
1246         zram_slot_lock(zram, index);
1247         zram_free_page(zram, index);
1248         zram_slot_unlock(zram, index);
1249         atomic64_inc(&zram->stats.notify_free);
1250 }
1251
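/*
 * ->rw_page handler: read or write a single page without allocating a
 * bio, used for example by the swap code (swap_readpage/__swap_writepage).
 */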
1252 static int zram_rw_page(struct block_device *bdev, sector_t sector,
1253                        struct page *page, bool is_write)
1254 {
1255         int offset, ret;
1256         u32 index;
1257         struct zram *zram;
1258         struct bio_vec bv;
1259
1260         if (PageTransHuge(page))
1261                 return -ENOTSUPP;
1262         zram = bdev->bd_disk->private_data;
1263
1264         if (!valid_io_request(zram, sector, PAGE_SIZE)) {
1265                 atomic64_inc(&zram->stats.invalid_io);
1266                 ret = -EINVAL;
1267                 goto out;
1268         }
1269
1270         index = sector >> SECTORS_PER_PAGE_SHIFT;
1271         offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
1272
1273         bv.bv_page = page;
1274         bv.bv_len = PAGE_SIZE;
1275         bv.bv_offset = 0;
1276
1277         ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
1278 out:
1279         /*
1280          * If I/O fails, just return the error (i.e., non-zero) without
1281          * calling page_endio.
1282          * The callers of rw_page (e.g., swap_readpage, __swap_writepage)
1283          * will then resubmit the I/O as a bio request, and bio->bi_end_io
1284          * handles the error (e.g., SetPageError, set_page_dirty and other
1285          * cleanup).
1286          */
1287         if (unlikely(ret < 0))
1288                 return ret;
1289
1290         switch (ret) {
1291         case 0:
1292                 page_endio(page, is_write, 0);
1293                 break;
1294         case 1:
1295                 ret = 0;
1296                 break;
1297         default:
1298                 WARN_ON(1);
1299         }
1300         return ret;
1301 }
1302
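/*
 * Tear down an initialized device: drop the memory limit, zero the
 * capacity and statistics, free all metadata and compressed memory,
 * destroy the compression backend and detach any backing device.
 */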
1303 static void zram_reset_device(struct zram *zram)
1304 {
1305         struct zcomp *comp;
1306         u64 disksize;
1307
1308         down_write(&zram->init_lock);
1309
1310         zram->limit_pages = 0;
1311
1312         if (!init_done(zram)) {
1313                 up_write(&zram->init_lock);
1314                 return;
1315         }
1316
1317         comp = zram->comp;
1318         disksize = zram->disksize;
1319         zram->disksize = 0;
1320
1321         set_capacity(zram->disk, 0);
1322         part_stat_set_all(&zram->disk->part0, 0);
1323
1324         up_write(&zram->init_lock);
1325         /* I/O operations on all CPUs are done, so it is safe to free */
1326         zram_meta_free(zram, disksize);
1327         memset(&zram->stats, 0, sizeof(zram->stats));
1328         zcomp_destroy(comp);
1329         reset_bdev(zram);
1330 }
1331
1332 static ssize_t disksize_store(struct device *dev,
1333                 struct device_attribute *attr, const char *buf, size_t len)
1334 {
1335         u64 disksize;
1336         struct zcomp *comp;
1337         struct zram *zram = dev_to_zram(dev);
1338         int err;
1339
1340         disksize = memparse(buf, NULL);
1341         if (!disksize)
1342                 return -EINVAL;
1343
1344         down_write(&zram->init_lock);
1345         if (init_done(zram)) {
1346                 pr_info("Cannot change disksize for initialized device\n");
1347                 err = -EBUSY;
1348                 goto out_unlock;
1349         }
1350
1351         disksize = PAGE_ALIGN(disksize);
1352         if (!zram_meta_alloc(zram, disksize)) {
1353                 err = -ENOMEM;
1354                 goto out_unlock;
1355         }
1356
1357         comp = zcomp_create(zram->compressor);
1358         if (IS_ERR(comp)) {
1359                 pr_err("Cannot initialise %s compressing backend\n",
1360                                 zram->compressor);
1361                 err = PTR_ERR(comp);
1362                 goto out_free_meta;
1363         }
1364
1365         zram->comp = comp;
1366         zram->disksize = disksize;
1367         set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
1368
1369         revalidate_disk(zram->disk);
1370         up_write(&zram->init_lock);
1371
1372         return len;
1373
1374 out_free_meta:
1375         zram_meta_free(zram, disksize);
1376 out_unlock:
1377         up_write(&zram->init_lock);
1378         return err;
1379 }
1380
1381 static ssize_t reset_store(struct device *dev,
1382                 struct device_attribute *attr, const char *buf, size_t len)
1383 {
1384         int ret;
1385         unsigned short do_reset;
1386         struct zram *zram;
1387         struct block_device *bdev;
1388
1389         ret = kstrtou16(buf, 10, &do_reset);
1390         if (ret)
1391                 return ret;
1392
1393         if (!do_reset)
1394                 return -EINVAL;
1395
1396         zram = dev_to_zram(dev);
1397         bdev = bdget_disk(zram->disk, 0);
1398         if (!bdev)
1399                 return -ENOMEM;
1400
1401         mutex_lock(&bdev->bd_mutex);
1402         /* Do not reset an active device or claimed device */
1403         if (bdev->bd_openers || zram->claim) {
1404                 mutex_unlock(&bdev->bd_mutex);
1405                 bdput(bdev);
1406                 return -EBUSY;
1407         }
1408
1409         /* From now on, no one can open /dev/zram[0-9] */
1410         zram->claim = true;
1411         mutex_unlock(&bdev->bd_mutex);
1412
1413         /* Make sure all the pending I/O are finished */
1414         fsync_bdev(bdev);
1415         zram_reset_device(zram);
1416         revalidate_disk(zram->disk);
1417         bdput(bdev);
1418
1419         mutex_lock(&bdev->bd_mutex);
1420         zram->claim = false;
1421         mutex_unlock(&bdev->bd_mutex);
1422
1423         return len;
1424 }
1425
1426 static int zram_open(struct block_device *bdev, fmode_t mode)
1427 {
1428         int ret = 0;
1429         struct zram *zram;
1430
1431         WARN_ON(!mutex_is_locked(&bdev->bd_mutex));
1432
1433         zram = bdev->bd_disk->private_data;
1434         /* zram was claimed for reset, so fail the open request */
1435         if (zram->claim)
1436                 ret = -EBUSY;
1437
1438         return ret;
1439 }
1440
1441 static const struct block_device_operations zram_devops = {
1442         .open = zram_open,
1443         .swap_slot_free_notify = zram_slot_free_notify,
1444         .rw_page = zram_rw_page,
1445         .owner = THIS_MODULE
1446 };
1447
1448 static DEVICE_ATTR_WO(compact);
1449 static DEVICE_ATTR_RW(disksize);
1450 static DEVICE_ATTR_RO(initstate);
1451 static DEVICE_ATTR_WO(reset);
1452 static DEVICE_ATTR_WO(mem_limit);
1453 static DEVICE_ATTR_WO(mem_used_max);
1454 static DEVICE_ATTR_RW(max_comp_streams);
1455 static DEVICE_ATTR_RW(comp_algorithm);
1456 #ifdef CONFIG_ZRAM_WRITEBACK
1457 static DEVICE_ATTR_RW(backing_dev);
1458 #endif
1459
1460 static struct attribute *zram_disk_attrs[] = {
1461         &dev_attr_disksize.attr,
1462         &dev_attr_initstate.attr,
1463         &dev_attr_reset.attr,
1464         &dev_attr_compact.attr,
1465         &dev_attr_mem_limit.attr,
1466         &dev_attr_mem_used_max.attr,
1467         &dev_attr_max_comp_streams.attr,
1468         &dev_attr_comp_algorithm.attr,
1469 #ifdef CONFIG_ZRAM_WRITEBACK
1470         &dev_attr_backing_dev.attr,
1471 #endif
1472         &dev_attr_io_stat.attr,
1473         &dev_attr_mm_stat.attr,
1474         &dev_attr_debug_stat.attr,
1475         NULL,
1476 };
1477
1478 static const struct attribute_group zram_disk_attr_group = {
1479         .attrs = zram_disk_attrs,
1480 };
1481
1482 /*
1483  * Allocate and initialize a new zram device. The function returns
1484  * a '>= 0' device_id upon success, and a negative value otherwise.
1485  */
1486 static int zram_add(void)
1487 {
1488         struct zram *zram;
1489         struct request_queue *queue;
1490         int ret, device_id;
1491
1492         zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
1493         if (!zram)
1494                 return -ENOMEM;
1495
1496         ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
1497         if (ret < 0)
1498                 goto out_free_dev;
1499         device_id = ret;
1500
1501         init_rwsem(&zram->init_lock);
1502
1503         queue = blk_alloc_queue(GFP_KERNEL);
1504         if (!queue) {
1505                 pr_err("Error allocating disk queue for device %d\n",
1506                         device_id);
1507                 ret = -ENOMEM;
1508                 goto out_free_idr;
1509         }
1510
1511         blk_queue_make_request(queue, zram_make_request);
1512
1513         /* gendisk structure */
1514         zram->disk = alloc_disk(1);
1515         if (!zram->disk) {
1516                 pr_err("Error allocating disk structure for device %d\n",
1517                         device_id);
1518                 ret = -ENOMEM;
1519                 goto out_free_queue;
1520         }
1521
1522         zram->disk->major = zram_major;
1523         zram->disk->first_minor = device_id;
1524         zram->disk->fops = &zram_devops;
1525         zram->disk->queue = queue;
1526         zram->disk->queue->queuedata = zram;
1527         zram->disk->private_data = zram;
1528         snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
1529
1530         /* Actual capacity set using sysfs (/sys/block/zram<id>/disksize) */
1531         set_capacity(zram->disk, 0);
1532         /* zram devices sort of resemble non-rotational disks */
1533         queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
1534         queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
1535
1536         /*
1537          * To ensure that we always get PAGE_SIZE-aligned
1538          * and n*PAGE_SIZE-sized I/O requests.
1539          */
1540         blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
1541         blk_queue_logical_block_size(zram->disk->queue,
1542                                         ZRAM_LOGICAL_BLOCK_SIZE);
1543         blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
1544         blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
1545         zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
1546         blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
1547         queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
1548
1549         /*
1550          * zram_bio_discard() will clear all logical blocks if logical block
1551          * size is identical to physical block size (PAGE_SIZE). But if it is
1552          * different, we will skip discarding some parts of logical blocks in
1553          * the part of the request range which isn't aligned to physical block
1554          * size.  So we can't ensure that all discarded logical blocks are
1555          * zeroed.
1556          */
1557         if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
1558                 blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
1559
1560         zram->disk->queue->backing_dev_info->capabilities |=
1561                         (BDI_CAP_STABLE_WRITES | BDI_CAP_SYNCHRONOUS_IO);
1562         add_disk(zram->disk);
1563
1564         ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
1565                                 &zram_disk_attr_group);
1566         if (ret < 0) {
1567                 pr_err("Error creating sysfs group for device %d\n",
1568                                 device_id);
1569                 goto out_free_disk;
1570         }
1571         strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
1572
1573         pr_info("Added device: %s\n", zram->disk->disk_name);
1574         return device_id;
1575
1576 out_free_disk:
1577         del_gendisk(zram->disk);
1578         put_disk(zram->disk);
1579 out_free_queue:
1580         blk_cleanup_queue(queue);
1581 out_free_idr:
1582         idr_remove(&zram_index_idr, device_id);
1583 out_free_dev:
1584         kfree(zram);
1585         return ret;
1586 }
1587
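/*
 * Destroy a zram device: claim it so new opens fail, refuse if it is
 * still open, then remove the sysfs group, reset the device and free the
 * queue, gendisk and zram structure.
 */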
1588 static int zram_remove(struct zram *zram)
1589 {
1590         struct block_device *bdev;
1591
1592         bdev = bdget_disk(zram->disk, 0);
1593         if (!bdev)
1594                 return -ENOMEM;
1595
1596         mutex_lock(&bdev->bd_mutex);
1597         if (bdev->bd_openers || zram->claim) {
1598                 mutex_unlock(&bdev->bd_mutex);
1599                 bdput(bdev);
1600                 return -EBUSY;
1601         }
1602
1603         zram->claim = true;
1604         mutex_unlock(&bdev->bd_mutex);
1605
1606         /*
1607          * Remove sysfs first, so no one will perform a disksize
1608          * store while we destroy the devices. This also helps during
1609          * hot_remove -- zram_reset_device() is the last holder of
1610          * ->init_lock, no later/concurrent disksize_store() or any
1611          * other sysfs handlers are possible.
1612          */
1613         sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
1614                         &zram_disk_attr_group);
1615
1616         /* Make sure all the pending I/O are finished */
1617         fsync_bdev(bdev);
1618         zram_reset_device(zram);
1619         bdput(bdev);
1620
1621         pr_info("Removed device: %s\n", zram->disk->disk_name);
1622
1623         blk_cleanup_queue(zram->disk->queue);
1624         del_gendisk(zram->disk);
1625         put_disk(zram->disk);
1626         kfree(zram);
1627         return 0;
1628 }
1629
1630 /* zram-control sysfs attributes */
1631
1632 /*
1633  * NOTE: hot_add attribute is not the usual read-only sysfs attribute, in
1634  * the sense that reading from this file does alter the state of your
1635  * system -- it creates a new un-initialized zram device and returns that
1636  * device's device_id (or an error code if it fails to create a new device).
1637  */
1638 static ssize_t hot_add_show(struct class *class,
1639                         struct class_attribute *attr,
1640                         char *buf)
1641 {
1642         int ret;
1643
1644         mutex_lock(&zram_index_mutex);
1645         ret = zram_add();
1646         mutex_unlock(&zram_index_mutex);
1647
1648         if (ret < 0)
1649                 return ret;
1650         return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
1651 }
1652 static CLASS_ATTR_RO(hot_add);
1653
1654 static ssize_t hot_remove_store(struct class *class,
1655                         struct class_attribute *attr,
1656                         const char *buf,
1657                         size_t count)
1658 {
1659         struct zram *zram;
1660         int ret, dev_id;
1661
1662         /* dev_id is gendisk->first_minor, which is `int' */
1663         ret = kstrtoint(buf, 10, &dev_id);
1664         if (ret)
1665                 return ret;
1666         if (dev_id < 0)
1667                 return -EINVAL;
1668
1669         mutex_lock(&zram_index_mutex);
1670
1671         zram = idr_find(&zram_index_idr, dev_id);
1672         if (zram) {
1673                 ret = zram_remove(zram);
1674                 if (!ret)
1675                         idr_remove(&zram_index_idr, dev_id);
1676         } else {
1677                 ret = -ENODEV;
1678         }
1679
1680         mutex_unlock(&zram_index_mutex);
1681         return ret ? ret : count;
1682 }
1683 static CLASS_ATTR_WO(hot_remove);
1684
1685 static struct attribute *zram_control_class_attrs[] = {
1686         &class_attr_hot_add.attr,
1687         &class_attr_hot_remove.attr,
1688         NULL,
1689 };
1690 ATTRIBUTE_GROUPS(zram_control_class);
1691
1692 static struct class zram_control_class = {
1693         .name           = "zram-control",
1694         .owner          = THIS_MODULE,
1695         .class_groups   = zram_control_class_groups,
1696 };
1697
1698 static int zram_remove_cb(int id, void *ptr, void *data)
1699 {
1700         zram_remove(ptr);
1701         return 0;
1702 }
1703
1704 static void destroy_devices(void)
1705 {
1706         class_unregister(&zram_control_class);
1707         idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
1708         idr_destroy(&zram_index_idr);
1709         unregister_blkdev(zram_major, "zram");
1710         cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
1711 }
1712
1713 static int __init zram_init(void)
1714 {
1715         int ret;
1716
1717         ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
1718                                       zcomp_cpu_up_prepare, zcomp_cpu_dead);
1719         if (ret < 0)
1720                 return ret;
1721
1722         ret = class_register(&zram_control_class);
1723         if (ret) {
1724                 pr_err("Unable to register zram-control class\n");
1725                 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
1726                 return ret;
1727         }
1728
1729         zram_major = register_blkdev(0, "zram");
1730         if (zram_major <= 0) {
1731                 pr_err("Unable to get major number\n");
1732                 class_unregister(&zram_control_class);
1733                 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
1734                 return -EBUSY;
1735         }
1736
1737         while (num_devices != 0) {
1738                 mutex_lock(&zram_index_mutex);
1739                 ret = zram_add();
1740                 mutex_unlock(&zram_index_mutex);
1741                 if (ret < 0)
1742                         goto out_error;
1743                 num_devices--;
1744         }
1745
1746         return 0;
1747
1748 out_error:
1749         destroy_devices();
1750         return ret;
1751 }
1752
1753 static void __exit zram_exit(void)
1754 {
1755         destroy_devices();
1756 }
1757
1758 module_init(zram_init);
1759 module_exit(zram_exit);
1760
1761 module_param(num_devices, uint, 0);
1762 MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
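/* e.g. "modprobe zram num_devices=4" pre-creates /dev/zram0 ... /dev/zram3 */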
1763
1764 MODULE_LICENSE("Dual BSD/GPL");
1765 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
1766 MODULE_DESCRIPTION("Compressed RAM Block Device");