zram: add bd_stat statistics
drivers/block/zram/zram_drv.c
1 /*
2  * Compressed RAM block device
3  *
4  * Copyright (C) 2008, 2009, 2010  Nitin Gupta
5  *               2012, 2013 Minchan Kim
6  *
7  * This code is released using a dual license strategy: BSD/GPL
8  * You can choose the licence that better fits your requirements.
9  *
10  * Released under the terms of 3-clause BSD License
11  * Released under the terms of GNU General Public License Version 2.0
12  *
13  */
14
15 #define KMSG_COMPONENT "zram"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
17
18 #include <linux/module.h>
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/bitops.h>
22 #include <linux/blkdev.h>
23 #include <linux/buffer_head.h>
24 #include <linux/device.h>
25 #include <linux/genhd.h>
26 #include <linux/highmem.h>
27 #include <linux/slab.h>
28 #include <linux/backing-dev.h>
29 #include <linux/string.h>
30 #include <linux/vmalloc.h>
31 #include <linux/err.h>
32 #include <linux/idr.h>
33 #include <linux/sysfs.h>
34 #include <linux/debugfs.h>
35 #include <linux/cpuhotplug.h>
36
37 #include "zram_drv.h"
38
39 static DEFINE_IDR(zram_index_idr);
40 /* idr index must be protected */
41 static DEFINE_MUTEX(zram_index_mutex);
42
43 static int zram_major;
44 static const char *default_compressor = "lzo";
45
46 /* Module params (documentation at end) */
47 static unsigned int num_devices = 1;
48 /*
49  * Pages that compress to sizes equal to or greater than this are stored
50  * uncompressed in memory.
51  */
52 static size_t huge_class_size;
53
54 static void zram_free_page(struct zram *zram, size_t index);
55 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
56                                 u32 index, int offset, struct bio *bio);
57
58
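/*
 * Each zram->table[] entry embeds a bit spinlock (ZRAM_LOCK) in its flags
 * word. The helpers below take and release that lock to serialize access
 * to a single slot.
 */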
59 static int zram_slot_trylock(struct zram *zram, u32 index)
60 {
61         return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags);
62 }
63
64 static void zram_slot_lock(struct zram *zram, u32 index)
65 {
66         bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags);
67 }
68
69 static void zram_slot_unlock(struct zram *zram, u32 index)
70 {
71         bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
72 }
73
74 static inline bool init_done(struct zram *zram)
75 {
76         return zram->disksize;
77 }
78
79 static inline struct zram *dev_to_zram(struct device *dev)
80 {
81         return (struct zram *)dev_to_disk(dev)->private_data;
82 }
83
84 static unsigned long zram_get_handle(struct zram *zram, u32 index)
85 {
86         return zram->table[index].handle;
87 }
88
89 static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
90 {
91         zram->table[index].handle = handle;
92 }
93
94 /* flag operations require the table entry's bit_spin_lock() to be held */
95 static bool zram_test_flag(struct zram *zram, u32 index,
96                         enum zram_pageflags flag)
97 {
98         return zram->table[index].flags & BIT(flag);
99 }
100
101 static void zram_set_flag(struct zram *zram, u32 index,
102                         enum zram_pageflags flag)
103 {
104         zram->table[index].flags |= BIT(flag);
105 }
106
107 static void zram_clear_flag(struct zram *zram, u32 index,
108                         enum zram_pageflags flag)
109 {
110         zram->table[index].flags &= ~BIT(flag);
111 }
112
113 static inline void zram_set_element(struct zram *zram, u32 index,
114                         unsigned long element)
115 {
116         zram->table[index].element = element;
117 }
118
119 static unsigned long zram_get_element(struct zram *zram, u32 index)
120 {
121         return zram->table[index].element;
122 }
123
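/*
 * The per-slot flags word packs the compressed object size into its low
 * ZRAM_FLAG_SHIFT bits; the zram_pageflags bits live above that, which is
 * why zram_set_obj_size() shifts the flag bits out before merging the size
 * back in.
 */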
124 static size_t zram_get_obj_size(struct zram *zram, u32 index)
125 {
126         return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1);
127 }
128
129 static void zram_set_obj_size(struct zram *zram,
130                                         u32 index, size_t size)
131 {
132         unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT;
133
134         zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size;
135 }
136
137 static inline bool zram_allocated(struct zram *zram, u32 index)
138 {
139         return zram_get_obj_size(zram, index) ||
140                         zram_test_flag(zram, index, ZRAM_SAME) ||
141                         zram_test_flag(zram, index, ZRAM_WB);
142 }
143
144 #if PAGE_SIZE != 4096
145 static inline bool is_partial_io(struct bio_vec *bvec)
146 {
147         return bvec->bv_len != PAGE_SIZE;
148 }
149 #else
150 static inline bool is_partial_io(struct bio_vec *bvec)
151 {
152         return false;
153 }
154 #endif
155
156 /*
157  * Check if request is within bounds and aligned on zram logical blocks.
158  */
159 static inline bool valid_io_request(struct zram *zram,
160                 sector_t start, unsigned int size)
161 {
162         u64 end, bound;
163
164         /* unaligned request */
165         if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
166                 return false;
167         if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
168                 return false;
169
170         end = start + (size >> SECTOR_SHIFT);
171         bound = zram->disksize >> SECTOR_SHIFT;
172         /* out of range */
173         if (unlikely(start >= bound || end > bound || start > end))
174                 return false;
175
176         /* I/O request is valid */
177         return true;
178 }
179
180 static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
181 {
182         *index  += (*offset + bvec->bv_len) / PAGE_SIZE;
183         *offset = (*offset + bvec->bv_len) % PAGE_SIZE;
184 }
185
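/*
 * Lock-free update of stats.max_used_pages: raise the recorded maximum via
 * cmpxchg and retry if another CPU changed it underneath us.
 */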
186 static inline void update_used_max(struct zram *zram,
187                                         const unsigned long pages)
188 {
189         unsigned long old_max, cur_max;
190
191         old_max = atomic_long_read(&zram->stats.max_used_pages);
192
193         do {
194                 cur_max = old_max;
195                 if (pages > cur_max)
196                         old_max = atomic_long_cmpxchg(
197                                 &zram->stats.max_used_pages, cur_max, pages);
198         } while (old_max != cur_max);
199 }
200
201 static inline void zram_fill_page(void *ptr, unsigned long len,
202                                         unsigned long value)
203 {
204         WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
205         memset_l(ptr, value, len / sizeof(unsigned long));
206 }
207
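/*
 * Returns true if the page consists of a single repeated unsigned long
 * value. The value is reported through @element so it can be recorded in
 * the slot (ZRAM_SAME) instead of storing the page at all.
 */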
208 static bool page_same_filled(void *ptr, unsigned long *element)
209 {
210         unsigned int pos;
211         unsigned long *page;
212         unsigned long val;
213
214         page = (unsigned long *)ptr;
215         val = page[0];
216
217         for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
218                 if (val != page[pos])
219                         return false;
220         }
221
222         *element = val;
223
224         return true;
225 }
226
227 static ssize_t initstate_show(struct device *dev,
228                 struct device_attribute *attr, char *buf)
229 {
230         u32 val;
231         struct zram *zram = dev_to_zram(dev);
232
233         down_read(&zram->init_lock);
234         val = init_done(zram);
235         up_read(&zram->init_lock);
236
237         return scnprintf(buf, PAGE_SIZE, "%u\n", val);
238 }
239
240 static ssize_t disksize_show(struct device *dev,
241                 struct device_attribute *attr, char *buf)
242 {
243         struct zram *zram = dev_to_zram(dev);
244
245         return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
246 }
247
248 static ssize_t mem_limit_store(struct device *dev,
249                 struct device_attribute *attr, const char *buf, size_t len)
250 {
251         u64 limit;
252         char *tmp;
253         struct zram *zram = dev_to_zram(dev);
254
255         limit = memparse(buf, &tmp);
256         if (buf == tmp) /* no chars parsed, invalid input */
257                 return -EINVAL;
258
259         down_write(&zram->init_lock);
260         zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
261         up_write(&zram->init_lock);
262
263         return len;
264 }
265
266 static ssize_t mem_used_max_store(struct device *dev,
267                 struct device_attribute *attr, const char *buf, size_t len)
268 {
269         int err;
270         unsigned long val;
271         struct zram *zram = dev_to_zram(dev);
272
273         err = kstrtoul(buf, 10, &val);
274         if (err || val != 0)
275                 return -EINVAL;
276
277         down_read(&zram->init_lock);
278         if (init_done(zram)) {
279                 atomic_long_set(&zram->stats.max_used_pages,
280                                 zs_get_total_pages(zram->mem_pool));
281         }
282         up_read(&zram->init_lock);
283
284         return len;
285 }
286
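/*
 * Handler for the "idle" sysfs attribute. Writing "all" (e.g.
 * "echo all > /sys/block/zram0/idle") marks every allocated slot ZRAM_IDLE,
 * so that a later idle writeback only touches pages that have not been
 * accessed since.
 */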
287 static ssize_t idle_store(struct device *dev,
288                 struct device_attribute *attr, const char *buf, size_t len)
289 {
290         struct zram *zram = dev_to_zram(dev);
291         unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
292         int index;
293         char mode_buf[8];
294         ssize_t sz;
295
296         sz = strscpy(mode_buf, buf, sizeof(mode_buf));
297         if (sz <= 0)
298                 return -EINVAL;
299
300         /* ignore trailing newline */
301         if (mode_buf[sz - 1] == '\n')
302                 mode_buf[sz - 1] = 0x00;
303
304         if (strcmp(mode_buf, "all"))
305                 return -EINVAL;
306
307         down_read(&zram->init_lock);
308         if (!init_done(zram)) {
309                 up_read(&zram->init_lock);
310                 return -EINVAL;
311         }
312
313         for (index = 0; index < nr_pages; index++) {
314                 /*
315                  * Do not mark a ZRAM_UNDER_WB slot as ZRAM_IDLE, to close the race.
316                  * See the comment in writeback_store.
317                  */
318                 zram_slot_lock(zram, index);
319                 if (!zram_allocated(zram, index) ||
320                                 zram_test_flag(zram, index, ZRAM_UNDER_WB))
321                         goto next;
322                 zram_set_flag(zram, index, ZRAM_IDLE);
323 next:
324                 zram_slot_unlock(zram, index);
325         }
326
327         up_read(&zram->init_lock);
328
329         return len;
330 }
331
332 #ifdef CONFIG_ZRAM_WRITEBACK
333 static void reset_bdev(struct zram *zram)
334 {
335         struct block_device *bdev;
336
337         if (!zram->backing_dev)
338                 return;
339
340         bdev = zram->bdev;
341         if (zram->old_block_size)
342                 set_blocksize(bdev, zram->old_block_size);
343         blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
344         /* hope filp_close flushes all of the IO */
345         filp_close(zram->backing_dev, NULL);
346         zram->backing_dev = NULL;
347         zram->old_block_size = 0;
348         zram->bdev = NULL;
349         zram->disk->queue->backing_dev_info->capabilities |=
350                                 BDI_CAP_SYNCHRONOUS_IO;
351         kvfree(zram->bitmap);
352         zram->bitmap = NULL;
353 }
354
355 static ssize_t backing_dev_show(struct device *dev,
356                 struct device_attribute *attr, char *buf)
357 {
358         struct zram *zram = dev_to_zram(dev);
359         struct file *file = zram->backing_dev;
360         char *p;
361         ssize_t ret;
362
363         down_read(&zram->init_lock);
364         if (!zram->backing_dev) {
365                 memcpy(buf, "none\n", 5);
366                 up_read(&zram->init_lock);
367                 return 5;
368         }
369
370         p = file_path(file, buf, PAGE_SIZE - 1);
371         if (IS_ERR(p)) {
372                 ret = PTR_ERR(p);
373                 goto out;
374         }
375
376         ret = strlen(p);
377         memmove(buf, p, ret);
378         buf[ret++] = '\n';
379 out:
380         up_read(&zram->init_lock);
381         return ret;
382 }
383
384 static ssize_t backing_dev_store(struct device *dev,
385                 struct device_attribute *attr, const char *buf, size_t len)
386 {
387         char *file_name;
388         size_t sz;
389         struct file *backing_dev = NULL;
390         struct inode *inode;
391         struct address_space *mapping;
392         unsigned int bitmap_sz, old_block_size = 0;
393         unsigned long nr_pages, *bitmap = NULL;
394         struct block_device *bdev = NULL;
395         int err;
396         struct zram *zram = dev_to_zram(dev);
397
398         file_name = kmalloc(PATH_MAX, GFP_KERNEL);
399         if (!file_name)
400                 return -ENOMEM;
401
402         down_write(&zram->init_lock);
403         if (init_done(zram)) {
404                 pr_info("Can't setup backing device for initialized device\n");
405                 err = -EBUSY;
406                 goto out;
407         }
408
409         strlcpy(file_name, buf, PATH_MAX);
410         /* ignore trailing newline */
411         sz = strlen(file_name);
412         if (sz > 0 && file_name[sz - 1] == '\n')
413                 file_name[sz - 1] = 0x00;
414
415         backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
416         if (IS_ERR(backing_dev)) {
417                 err = PTR_ERR(backing_dev);
418                 backing_dev = NULL;
419                 goto out;
420         }
421
422         mapping = backing_dev->f_mapping;
423         inode = mapping->host;
424
425         /* Only block devices are supported at the moment */
426         if (!S_ISBLK(inode->i_mode)) {
427                 err = -ENOTBLK;
428                 goto out;
429         }
430
431         bdev = bdgrab(I_BDEV(inode));
432         err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
433         if (err < 0) {
434                 bdev = NULL;
435                 goto out;
436         }
437
438         nr_pages = i_size_read(inode) >> PAGE_SHIFT;
439         bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
440         bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
441         if (!bitmap) {
442                 err = -ENOMEM;
443                 goto out;
444         }
445
446         old_block_size = block_size(bdev);
447         err = set_blocksize(bdev, PAGE_SIZE);
448         if (err)
449                 goto out;
450
451         reset_bdev(zram);
452
453         zram->old_block_size = old_block_size;
454         zram->bdev = bdev;
455         zram->backing_dev = backing_dev;
456         zram->bitmap = bitmap;
457         zram->nr_pages = nr_pages;
458         /*
459          * With the writeback feature, zram does asynchronous IO, so it is no
460          * longer a synchronous device and the synchronous IO flag must be
461          * cleared. Otherwise, the upper layer (e.g., swap) could wait for IO
462          * completion instead of submit-and-return, making the system sluggish.
463          * Furthermore, when the IO function returns (e.g., swap_readpage), the
464          * upper layer assumes the IO is done and may free the page while the
465          * IO is actually still in flight, which can lead to a use-after-free
466          * once the IO really completes.
467          */
468         zram->disk->queue->backing_dev_info->capabilities &=
469                         ~BDI_CAP_SYNCHRONOUS_IO;
470         up_write(&zram->init_lock);
471
472         pr_info("setup backing device %s\n", file_name);
473         kfree(file_name);
474
475         return len;
476 out:
477         if (bitmap)
478                 kvfree(bitmap);
479
480         if (bdev)
481                 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
482
483         if (backing_dev)
484                 filp_close(backing_dev, NULL);
485
486         up_write(&zram->init_lock);
487
488         kfree(file_name);
489
490         return err;
491 }
492
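/*
 * Find and claim a free block on the backing device bitmap. Block 0 is
 * never handed out, so a return value of 0 can unambiguously mean "no
 * space". Every successful allocation is accounted in bd_count, which is
 * exported through bd_stat.
 */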
493 static unsigned long alloc_block_bdev(struct zram *zram)
494 {
495         unsigned long blk_idx = 1;
496 retry:
497         /* skip bit 0 to avoid confusion with zram.handle == 0 */
498         blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
499         if (blk_idx == zram->nr_pages)
500                 return 0;
501
502         if (test_and_set_bit(blk_idx, zram->bitmap))
503                 goto retry;
504
505         atomic64_inc(&zram->stats.bd_count);
506         return blk_idx;
507 }
508
509 static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
510 {
511         int was_set;
512
513         was_set = test_and_clear_bit(blk_idx, zram->bitmap);
514         WARN_ON_ONCE(!was_set);
515         atomic64_dec(&zram->stats.bd_count);
516 }
517
518 static void zram_page_end_io(struct bio *bio)
519 {
520         struct page *page = bio_first_page_all(bio);
521
522         page_endio(page, op_is_write(bio_op(bio)),
523                         blk_status_to_errno(bio->bi_status));
524         bio_put(bio);
525 }
526
527 /*
528  * Returns 1 if the submission is successful.
529  */
530 static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
531                         unsigned long entry, struct bio *parent)
532 {
533         struct bio *bio;
534
535         bio = bio_alloc(GFP_ATOMIC, 1);
536         if (!bio)
537                 return -ENOMEM;
538
539         bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
540         bio_set_dev(bio, zram->bdev);
541         if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
542                 bio_put(bio);
543                 return -EIO;
544         }
545
546         if (!parent) {
547                 bio->bi_opf = REQ_OP_READ;
548                 bio->bi_end_io = zram_page_end_io;
549         } else {
550                 bio->bi_opf = parent->bi_opf;
551                 bio_chain(bio, parent);
552         }
553
554         submit_bio(bio);
555         return 1;
556 }
557
558 #define HUGE_WRITEBACK 0x1
559 #define IDLE_WRITEBACK 0x2
560
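/*
 * Handler for the "writeback" sysfs attribute. Writing "idle" (e.g.
 * "echo idle > /sys/block/zram0/writeback") writes back pages previously
 * marked via the idle attribute; writing "huge" writes back incompressible
 * pages. Written-back blocks are accounted in bd_writes.
 */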
561 static ssize_t writeback_store(struct device *dev,
562                 struct device_attribute *attr, const char *buf, size_t len)
563 {
564         struct zram *zram = dev_to_zram(dev);
565         unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
566         unsigned long index;
567         struct bio bio;
568         struct bio_vec bio_vec;
569         struct page *page;
570         ssize_t ret, sz;
571         char mode_buf[8];
572         unsigned long mode = -1UL;
573         unsigned long blk_idx = 0;
574
575         sz = strscpy(mode_buf, buf, sizeof(mode_buf));
576         if (sz <= 0)
577                 return -EINVAL;
578
579         /* ignore trailing newline */
580         if (mode_buf[sz - 1] == '\n')
581                 mode_buf[sz - 1] = 0x00;
582
583         if (!strcmp(mode_buf, "idle"))
584                 mode = IDLE_WRITEBACK;
585         else if (!strcmp(mode_buf, "huge"))
586                 mode = HUGE_WRITEBACK;
587
588         if (mode == -1UL)
589                 return -EINVAL;
590
591         down_read(&zram->init_lock);
592         if (!init_done(zram)) {
593                 ret = -EINVAL;
594                 goto release_init_lock;
595         }
596
597         if (!zram->backing_dev) {
598                 ret = -ENODEV;
599                 goto release_init_lock;
600         }
601
602         page = alloc_page(GFP_KERNEL);
603         if (!page) {
604                 ret = -ENOMEM;
605                 goto release_init_lock;
606         }
607
608         for (index = 0; index < nr_pages; index++) {
609                 struct bio_vec bvec;
610
611                 bvec.bv_page = page;
612                 bvec.bv_len = PAGE_SIZE;
613                 bvec.bv_offset = 0;
614
615                 if (!blk_idx) {
616                         blk_idx = alloc_block_bdev(zram);
617                         if (!blk_idx) {
618                                 ret = -ENOSPC;
619                                 break;
620                         }
621                 }
622
623                 zram_slot_lock(zram, index);
624                 if (!zram_allocated(zram, index))
625                         goto next;
626
627                 if (zram_test_flag(zram, index, ZRAM_WB) ||
628                                 zram_test_flag(zram, index, ZRAM_SAME) ||
629                                 zram_test_flag(zram, index, ZRAM_UNDER_WB))
630                         goto next;
631
632                 if ((mode & IDLE_WRITEBACK &&
633                           !zram_test_flag(zram, index, ZRAM_IDLE)) &&
634                     (mode & HUGE_WRITEBACK &&
635                           !zram_test_flag(zram, index, ZRAM_HUGE)))
636                         goto next;
637                 /*
638                  * Clearing ZRAM_UNDER_WB is the caller's duty.
639                  * IOW, zram_free_page() never clears it.
640                  */
641                 zram_set_flag(zram, index, ZRAM_UNDER_WB);
642                 /* Needed to handle the hugepage writeback race */
643                 zram_set_flag(zram, index, ZRAM_IDLE);
644                 zram_slot_unlock(zram, index);
645                 if (zram_bvec_read(zram, &bvec, index, 0, NULL)) {
646                         zram_slot_lock(zram, index);
647                         zram_clear_flag(zram, index, ZRAM_UNDER_WB);
648                         zram_clear_flag(zram, index, ZRAM_IDLE);
649                         zram_slot_unlock(zram, index);
650                         continue;
651                 }
652
653                 bio_init(&bio, &bio_vec, 1);
654                 bio_set_dev(&bio, zram->bdev);
655                 bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
656                 bio.bi_opf = REQ_OP_WRITE | REQ_SYNC;
657
658                 bio_add_page(&bio, bvec.bv_page, bvec.bv_len,
659                                 bvec.bv_offset);
660                 /*
661                  * XXX: Single-page IO is inefficient for writes,
662                  * but it is not bad as a starting point.
663                  */
664                 ret = submit_bio_wait(&bio);
665                 if (ret) {
666                         zram_slot_lock(zram, index);
667                         zram_clear_flag(zram, index, ZRAM_UNDER_WB);
668                         zram_clear_flag(zram, index, ZRAM_IDLE);
669                         zram_slot_unlock(zram, index);
670                         continue;
671                 }
672
673                 atomic64_inc(&zram->stats.bd_writes);
674                 /*
675                  * We released zram_slot_lock, so we need to check whether the
676                  * slot has changed. If the slot was freed, we can catch that
677                  * easily via zram_allocated().
678                  * A subtle case is the slot being freed, reallocated and marked
679                  * ZRAM_IDLE again. To close that race, idle_store() does not
680                  * mark a slot ZRAM_IDLE once it finds it is ZRAM_UNDER_WB.
681                  * Thus, we can close the race by checking the ZRAM_IDLE bit.
682                  */
683                 zram_slot_lock(zram, index);
684                 if (!zram_allocated(zram, index) ||
685                           !zram_test_flag(zram, index, ZRAM_IDLE)) {
686                         zram_clear_flag(zram, index, ZRAM_UNDER_WB);
687                         zram_clear_flag(zram, index, ZRAM_IDLE);
688                         goto next;
689                 }
690
691                 zram_free_page(zram, index);
692                 zram_clear_flag(zram, index, ZRAM_UNDER_WB);
693                 zram_set_flag(zram, index, ZRAM_WB);
694                 zram_set_element(zram, index, blk_idx);
695                 blk_idx = 0;
696                 atomic64_inc(&zram->stats.pages_stored);
697 next:
698                 zram_slot_unlock(zram, index);
699         }
700
701         if (blk_idx)
702                 free_block_bdev(zram, blk_idx);
703         ret = len;
704         __free_page(page);
705 release_init_lock:
706         up_read(&zram->init_lock);
707
708         return ret;
709 }
710
711 struct zram_work {
712         struct work_struct work;
713         struct zram *zram;
714         unsigned long entry;
715         struct bio *bio;
716 };
717
718 #if PAGE_SIZE != 4096
719 static void zram_sync_read(struct work_struct *work)
720 {
721         struct bio_vec bvec;
722         struct zram_work *zw = container_of(work, struct zram_work, work);
723         struct zram *zram = zw->zram;
724         unsigned long entry = zw->entry;
725         struct bio *bio = zw->bio;
726
727         read_from_bdev_async(zram, &bvec, entry, bio);
728 }
729
730 /*
731  * The block layer wants one ->make_request_fn to be active at a time,
732  * so chaining an IO to its parent IO in the same context would deadlock.
733  * To avoid that, the read is issued from a worker thread context.
734  */
735 static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
736                                 unsigned long entry, struct bio *bio)
737 {
738         struct zram_work work;
739
740         work.zram = zram;
741         work.entry = entry;
742         work.bio = bio;
743
744         INIT_WORK_ONSTACK(&work.work, zram_sync_read);
745         queue_work(system_unbound_wq, &work.work);
746         flush_work(&work.work);
747         destroy_work_on_stack(&work.work);
748
749         return 1;
750 }
751 #else
752 static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
753                                 unsigned long entry, struct bio *bio)
754 {
755         WARN_ON(1);
756         return -EIO;
757 }
758 #endif
759
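/*
 * Read a previously written-back page from the backing device, either
 * synchronously or asynchronously. Every call is accounted in bd_reads.
 */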
760 static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
761                         unsigned long entry, struct bio *parent, bool sync)
762 {
763         atomic64_inc(&zram->stats.bd_reads);
764         if (sync)
765                 return read_from_bdev_sync(zram, bvec, entry, parent);
766         else
767                 return read_from_bdev_async(zram, bvec, entry, parent);
768 }
769 #else
770 static inline void reset_bdev(struct zram *zram) {};
771 static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
772                         unsigned long entry, struct bio *parent, bool sync)
773 {
774         return -EIO;
775 }
776
777 static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {};
778 #endif
779
780 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
781
782 static struct dentry *zram_debugfs_root;
783
784 static void zram_debugfs_create(void)
785 {
786         zram_debugfs_root = debugfs_create_dir("zram", NULL);
787 }
788
789 static void zram_debugfs_destroy(void)
790 {
791         debugfs_remove_recursive(zram_debugfs_root);
792 }
793
794 static void zram_accessed(struct zram *zram, u32 index)
795 {
796         zram_clear_flag(zram, index, ZRAM_IDLE);
797         zram->table[index].ac_time = ktime_get_boottime();
798 }
799
800 static ssize_t read_block_state(struct file *file, char __user *buf,
801                                 size_t count, loff_t *ppos)
802 {
803         char *kbuf;
804         ssize_t index, written = 0;
805         struct zram *zram = file->private_data;
806         unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
807         struct timespec64 ts;
808
809         kbuf = kvmalloc(count, GFP_KERNEL);
810         if (!kbuf)
811                 return -ENOMEM;
812
813         down_read(&zram->init_lock);
814         if (!init_done(zram)) {
815                 up_read(&zram->init_lock);
816                 kvfree(kbuf);
817                 return -EINVAL;
818         }
819
820         for (index = *ppos; index < nr_pages; index++) {
821                 int copied;
822
823                 zram_slot_lock(zram, index);
824                 if (!zram_allocated(zram, index))
825                         goto next;
826
827                 ts = ktime_to_timespec64(zram->table[index].ac_time);
828                 copied = snprintf(kbuf + written, count,
829                         "%12zd %12lld.%06lu %c%c%c%c\n",
830                         index, (s64)ts.tv_sec,
831                         ts.tv_nsec / NSEC_PER_USEC,
832                         zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.',
833                         zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.',
834                         zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
835                         zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.');
836
837                 if (count < copied) {
838                         zram_slot_unlock(zram, index);
839                         break;
840                 }
841                 written += copied;
842                 count -= copied;
843 next:
844                 zram_slot_unlock(zram, index);
845                 *ppos += 1;
846         }
847
848         up_read(&zram->init_lock);
849         if (copy_to_user(buf, kbuf, written))
850                 written = -EFAULT;
851         kvfree(kbuf);
852
853         return written;
854 }
855
856 static const struct file_operations proc_zram_block_state_op = {
857         .open = simple_open,
858         .read = read_block_state,
859         .llseek = default_llseek,
860 };
861
862 static void zram_debugfs_register(struct zram *zram)
863 {
864         if (!zram_debugfs_root)
865                 return;
866
867         zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name,
868                                                 zram_debugfs_root);
869         debugfs_create_file("block_state", 0400, zram->debugfs_dir,
870                                 zram, &proc_zram_block_state_op);
871 }
872
873 static void zram_debugfs_unregister(struct zram *zram)
874 {
875         debugfs_remove_recursive(zram->debugfs_dir);
876 }
877 #else
878 static void zram_debugfs_create(void) {};
879 static void zram_debugfs_destroy(void) {};
880 static void zram_accessed(struct zram *zram, u32 index)
881 {
882         zram_clear_flag(zram, index, ZRAM_IDLE);
883 };
884 static void zram_debugfs_register(struct zram *zram) {};
885 static void zram_debugfs_unregister(struct zram *zram) {};
886 #endif
887
888 /*
889  * We switched to per-cpu streams and this attr is not needed anymore.
890  * However, we will keep it around for some time, because:
891  * a) we may revert per-cpu streams in the future
892  * b) it's visible to user space and we need to follow our 2-year
893  *    retirement rule; but we already have a number of 'soon to be
894  *    altered' attrs, so max_comp_streams needs to wait for the next
895  *    layoff cycle.
896  */
897 static ssize_t max_comp_streams_show(struct device *dev,
898                 struct device_attribute *attr, char *buf)
899 {
900         return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
901 }
902
903 static ssize_t max_comp_streams_store(struct device *dev,
904                 struct device_attribute *attr, const char *buf, size_t len)
905 {
906         return len;
907 }
908
909 static ssize_t comp_algorithm_show(struct device *dev,
910                 struct device_attribute *attr, char *buf)
911 {
912         size_t sz;
913         struct zram *zram = dev_to_zram(dev);
914
915         down_read(&zram->init_lock);
916         sz = zcomp_available_show(zram->compressor, buf);
917         up_read(&zram->init_lock);
918
919         return sz;
920 }
921
922 static ssize_t comp_algorithm_store(struct device *dev,
923                 struct device_attribute *attr, const char *buf, size_t len)
924 {
925         struct zram *zram = dev_to_zram(dev);
926         char compressor[ARRAY_SIZE(zram->compressor)];
927         size_t sz;
928
929         strlcpy(compressor, buf, sizeof(compressor));
930         /* ignore trailing newline */
931         sz = strlen(compressor);
932         if (sz > 0 && compressor[sz - 1] == '\n')
933                 compressor[sz - 1] = 0x00;
934
935         if (!zcomp_available_algorithm(compressor))
936                 return -EINVAL;
937
938         down_write(&zram->init_lock);
939         if (init_done(zram)) {
940                 up_write(&zram->init_lock);
941                 pr_info("Can't change algorithm for initialized device\n");
942                 return -EBUSY;
943         }
944
945         strcpy(zram->compressor, compressor);
946         up_write(&zram->init_lock);
947         return len;
948 }
949
950 static ssize_t compact_store(struct device *dev,
951                 struct device_attribute *attr, const char *buf, size_t len)
952 {
953         struct zram *zram = dev_to_zram(dev);
954
955         down_read(&zram->init_lock);
956         if (!init_done(zram)) {
957                 up_read(&zram->init_lock);
958                 return -EINVAL;
959         }
960
961         zs_compact(zram->mem_pool);
962         up_read(&zram->init_lock);
963
964         return len;
965 }
966
967 static ssize_t io_stat_show(struct device *dev,
968                 struct device_attribute *attr, char *buf)
969 {
970         struct zram *zram = dev_to_zram(dev);
971         ssize_t ret;
972
973         down_read(&zram->init_lock);
974         ret = scnprintf(buf, PAGE_SIZE,
975                         "%8llu %8llu %8llu %8llu\n",
976                         (u64)atomic64_read(&zram->stats.failed_reads),
977                         (u64)atomic64_read(&zram->stats.failed_writes),
978                         (u64)atomic64_read(&zram->stats.invalid_io),
979                         (u64)atomic64_read(&zram->stats.notify_free));
980         up_read(&zram->init_lock);
981
982         return ret;
983 }
984
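/*
 * mm_stat columns, in order: orig_data_size, compr_data_size,
 * mem_used_total, mem_limit, mem_used_max, same_pages, pages_compacted
 * and huge_pages.
 */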
985 static ssize_t mm_stat_show(struct device *dev,
986                 struct device_attribute *attr, char *buf)
987 {
988         struct zram *zram = dev_to_zram(dev);
989         struct zs_pool_stats pool_stats;
990         u64 orig_size, mem_used = 0;
991         long max_used;
992         ssize_t ret;
993
994         memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
995
996         down_read(&zram->init_lock);
997         if (init_done(zram)) {
998                 mem_used = zs_get_total_pages(zram->mem_pool);
999                 zs_pool_stats(zram->mem_pool, &pool_stats);
1000         }
1001
1002         orig_size = atomic64_read(&zram->stats.pages_stored);
1003         max_used = atomic_long_read(&zram->stats.max_used_pages);
1004
1005         ret = scnprintf(buf, PAGE_SIZE,
1006                         "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n",
1007                         orig_size << PAGE_SHIFT,
1008                         (u64)atomic64_read(&zram->stats.compr_data_size),
1009                         mem_used << PAGE_SHIFT,
1010                         zram->limit_pages << PAGE_SHIFT,
1011                         max_used << PAGE_SHIFT,
1012                         (u64)atomic64_read(&zram->stats.same_pages),
1013                         pool_stats.pages_compacted,
1014                         (u64)atomic64_read(&zram->stats.huge_pages));
1015         up_read(&zram->init_lock);
1016
1017         return ret;
1018 }
1019
1020 #ifdef CONFIG_ZRAM_WRITEBACK
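/*
 * bd_stat reports backing device usage in units of 4K blocks regardless of
 * the system page size (hence the PAGE_SHIFT - 12 scaling): bd_count is the
 * number of blocks currently allocated on the backing device, bd_reads and
 * bd_writes are the cumulative blocks read from and written to it.
 */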
1021 static ssize_t bd_stat_show(struct device *dev,
1022                 struct device_attribute *attr, char *buf)
1023 {
1024         struct zram *zram = dev_to_zram(dev);
1025         ssize_t ret;
1026
1027         down_read(&zram->init_lock);
1028         ret = scnprintf(buf, PAGE_SIZE,
1029                 "%8llu %8llu %8llu\n",
1030                 (u64)atomic64_read(&zram->stats.bd_count) << (PAGE_SHIFT - 12),
1031                 (u64)atomic64_read(&zram->stats.bd_reads) << (PAGE_SHIFT - 12),
1032                 (u64)atomic64_read(&zram->stats.bd_writes) << (PAGE_SHIFT - 12));
1033         up_read(&zram->init_lock);
1034
1035         return ret;
1036 }
1037 #endif
1038
1039 static ssize_t debug_stat_show(struct device *dev,
1040                 struct device_attribute *attr, char *buf)
1041 {
1042         int version = 1;
1043         struct zram *zram = dev_to_zram(dev);
1044         ssize_t ret;
1045
1046         down_read(&zram->init_lock);
1047         ret = scnprintf(buf, PAGE_SIZE,
1048                         "version: %d\n%8llu %8llu\n",
1049                         version,
1050                         (u64)atomic64_read(&zram->stats.writestall),
1051                         (u64)atomic64_read(&zram->stats.miss_free));
1052         up_read(&zram->init_lock);
1053
1054         return ret;
1055 }
1056
1057 static DEVICE_ATTR_RO(io_stat);
1058 static DEVICE_ATTR_RO(mm_stat);
1059 #ifdef CONFIG_ZRAM_WRITEBACK
1060 static DEVICE_ATTR_RO(bd_stat);
1061 #endif
1062 static DEVICE_ATTR_RO(debug_stat);
1063
1064 static void zram_meta_free(struct zram *zram, u64 disksize)
1065 {
1066         size_t num_pages = disksize >> PAGE_SHIFT;
1067         size_t index;
1068
1069         /* Free all pages that are still in this zram device */
1070         for (index = 0; index < num_pages; index++)
1071                 zram_free_page(zram, index);
1072
1073         zs_destroy_pool(zram->mem_pool);
1074         vfree(zram->table);
1075 }
1076
1077 static bool zram_meta_alloc(struct zram *zram, u64 disksize)
1078 {
1079         size_t num_pages;
1080
1081         num_pages = disksize >> PAGE_SHIFT;
1082         zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table)));
1083         if (!zram->table)
1084                 return false;
1085
1086         zram->mem_pool = zs_create_pool(zram->disk->disk_name);
1087         if (!zram->mem_pool) {
1088                 vfree(zram->table);
1089                 return false;
1090         }
1091
1092         if (!huge_class_size)
1093                 huge_class_size = zs_huge_class_size(zram->mem_pool);
1094         return true;
1095 }
1096
1097 /*
1098  * To protect concurrent access to the same index entry,
1099  * the caller should hold this table index entry's bit_spinlock to
1100  * indicate that this index entry is being accessed.
1101  */
1102 static void zram_free_page(struct zram *zram, size_t index)
1103 {
1104         unsigned long handle;
1105
1106 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
1107         zram->table[index].ac_time = 0;
1108 #endif
1109         if (zram_test_flag(zram, index, ZRAM_IDLE))
1110                 zram_clear_flag(zram, index, ZRAM_IDLE);
1111
1112         if (zram_test_flag(zram, index, ZRAM_HUGE)) {
1113                 zram_clear_flag(zram, index, ZRAM_HUGE);
1114                 atomic64_dec(&zram->stats.huge_pages);
1115         }
1116
1117         if (zram_test_flag(zram, index, ZRAM_WB)) {
1118                 zram_clear_flag(zram, index, ZRAM_WB);
1119                 free_block_bdev(zram, zram_get_element(zram, index));
1120                 goto out;
1121         }
1122
1123         /*
1124          * No memory is allocated for pages filled with the same element.
1125          * Simply clear the ZRAM_SAME flag.
1126          */
1127         if (zram_test_flag(zram, index, ZRAM_SAME)) {
1128                 zram_clear_flag(zram, index, ZRAM_SAME);
1129                 atomic64_dec(&zram->stats.same_pages);
1130                 goto out;
1131         }
1132
1133         handle = zram_get_handle(zram, index);
1134         if (!handle)
1135                 return;
1136
1137         zs_free(zram->mem_pool, handle);
1138
1139         atomic64_sub(zram_get_obj_size(zram, index),
1140                         &zram->stats.compr_data_size);
1141 out:
1142         atomic64_dec(&zram->stats.pages_stored);
1143         zram_set_handle(zram, index, 0);
1144         zram_set_obj_size(zram, index, 0);
1145         WARN_ON_ONCE(zram->table[index].flags &
1146                 ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB));
1147 }
1148
1149 static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
1150                                 struct bio *bio, bool partial_io)
1151 {
1152         int ret;
1153         unsigned long handle;
1154         unsigned int size;
1155         void *src, *dst;
1156
1157         zram_slot_lock(zram, index);
1158         if (zram_test_flag(zram, index, ZRAM_WB)) {
1159                 struct bio_vec bvec;
1160
1161                 zram_slot_unlock(zram, index);
1162
1163                 bvec.bv_page = page;
1164                 bvec.bv_len = PAGE_SIZE;
1165                 bvec.bv_offset = 0;
1166                 return read_from_bdev(zram, &bvec,
1167                                 zram_get_element(zram, index),
1168                                 bio, partial_io);
1169         }
1170
1171         handle = zram_get_handle(zram, index);
1172         if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) {
1173                 unsigned long value;
1174                 void *mem;
1175
1176                 value = handle ? zram_get_element(zram, index) : 0;
1177                 mem = kmap_atomic(page);
1178                 zram_fill_page(mem, PAGE_SIZE, value);
1179                 kunmap_atomic(mem);
1180                 zram_slot_unlock(zram, index);
1181                 return 0;
1182         }
1183
1184         size = zram_get_obj_size(zram, index);
1185
1186         src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
1187         if (size == PAGE_SIZE) {
1188                 dst = kmap_atomic(page);
1189                 memcpy(dst, src, PAGE_SIZE);
1190                 kunmap_atomic(dst);
1191                 ret = 0;
1192         } else {
1193                 struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
1194
1195                 dst = kmap_atomic(page);
1196                 ret = zcomp_decompress(zstrm, src, size, dst);
1197                 kunmap_atomic(dst);
1198                 zcomp_stream_put(zram->comp);
1199         }
1200         zs_unmap_object(zram->mem_pool, handle);
1201         zram_slot_unlock(zram, index);
1202
1203         /* Should NEVER happen. Return bio error if it does. */
1204         if (unlikely(ret))
1205                 pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
1206
1207         return ret;
1208 }
1209
1210 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
1211                                 u32 index, int offset, struct bio *bio)
1212 {
1213         int ret;
1214         struct page *page;
1215
1216         page = bvec->bv_page;
1217         if (is_partial_io(bvec)) {
1218                 /* Use a temporary buffer to decompress the page */
1219                 page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
1220                 if (!page)
1221                         return -ENOMEM;
1222         }
1223
1224         ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
1225         if (unlikely(ret))
1226                 goto out;
1227
1228         if (is_partial_io(bvec)) {
1229                 void *dst = kmap_atomic(bvec->bv_page);
1230                 void *src = kmap_atomic(page);
1231
1232                 memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len);
1233                 kunmap_atomic(src);
1234                 kunmap_atomic(dst);
1235         }
1236 out:
1237         if (is_partial_io(bvec))
1238                 __free_page(page);
1239
1240         return ret;
1241 }
1242
1243 static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
1244                                 u32 index, struct bio *bio)
1245 {
1246         int ret = 0;
1247         unsigned long alloced_pages;
1248         unsigned long handle = 0;
1249         unsigned int comp_len = 0;
1250         void *src, *dst, *mem;
1251         struct zcomp_strm *zstrm;
1252         struct page *page = bvec->bv_page;
1253         unsigned long element = 0;
1254         enum zram_pageflags flags = 0;
1255
1256         mem = kmap_atomic(page);
1257         if (page_same_filled(mem, &element)) {
1258                 kunmap_atomic(mem);
1259                 /* Free memory associated with this sector now. */
1260                 flags = ZRAM_SAME;
1261                 atomic64_inc(&zram->stats.same_pages);
1262                 goto out;
1263         }
1264         kunmap_atomic(mem);
1265
1266 compress_again:
1267         zstrm = zcomp_stream_get(zram->comp);
1268         src = kmap_atomic(page);
1269         ret = zcomp_compress(zstrm, src, &comp_len);
1270         kunmap_atomic(src);
1271
1272         if (unlikely(ret)) {
1273                 zcomp_stream_put(zram->comp);
1274                 pr_err("Compression failed! err=%d\n", ret);
1275                 zs_free(zram->mem_pool, handle);
1276                 return ret;
1277         }
1278
1279         if (comp_len >= huge_class_size)
1280                 comp_len = PAGE_SIZE;
1281         /*
1282          * handle allocation has 2 paths:
1283          * a) fast path is executed with preemption disabled (for
1284          *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
1285          *  since we can't sleep;
1286          * b) slow path enables preemption and attempts to allocate
1287          *  the page with __GFP_DIRECT_RECLAIM bit set. We have to
1288          *  put the per-cpu compression stream and, thus, re-do
1289          *  the compression once handle is allocated.
1290          *
1291          * if we have a 'non-null' handle here then we are coming
1292          * from the slow path and handle has already been allocated.
1293          */
1294         if (!handle)
1295                 handle = zs_malloc(zram->mem_pool, comp_len,
1296                                 __GFP_KSWAPD_RECLAIM |
1297                                 __GFP_NOWARN |
1298                                 __GFP_HIGHMEM |
1299                                 __GFP_MOVABLE);
1300         if (!handle) {
1301                 zcomp_stream_put(zram->comp);
1302                 atomic64_inc(&zram->stats.writestall);
1303                 handle = zs_malloc(zram->mem_pool, comp_len,
1304                                 GFP_NOIO | __GFP_HIGHMEM |
1305                                 __GFP_MOVABLE);
1306                 if (handle)
1307                         goto compress_again;
1308                 return -ENOMEM;
1309         }
1310
1311         alloced_pages = zs_get_total_pages(zram->mem_pool);
1312         update_used_max(zram, alloced_pages);
1313
1314         if (zram->limit_pages && alloced_pages > zram->limit_pages) {
1315                 zcomp_stream_put(zram->comp);
1316                 zs_free(zram->mem_pool, handle);
1317                 return -ENOMEM;
1318         }
1319
1320         dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
1321
1322         src = zstrm->buffer;
1323         if (comp_len == PAGE_SIZE)
1324                 src = kmap_atomic(page);
1325         memcpy(dst, src, comp_len);
1326         if (comp_len == PAGE_SIZE)
1327                 kunmap_atomic(src);
1328
1329         zcomp_stream_put(zram->comp);
1330         zs_unmap_object(zram->mem_pool, handle);
1331         atomic64_add(comp_len, &zram->stats.compr_data_size);
1332 out:
1333         /*
1334          * Free memory associated with this sector
1335          * before overwriting unused sectors.
1336          */
1337         zram_slot_lock(zram, index);
1338         zram_free_page(zram, index);
1339
1340         if (comp_len == PAGE_SIZE) {
1341                 zram_set_flag(zram, index, ZRAM_HUGE);
1342                 atomic64_inc(&zram->stats.huge_pages);
1343         }
1344
1345         if (flags) {
1346                 zram_set_flag(zram, index, flags);
1347                 zram_set_element(zram, index, element);
1348         }  else {
1349                 zram_set_handle(zram, index, handle);
1350                 zram_set_obj_size(zram, index, comp_len);
1351         }
1352         zram_slot_unlock(zram, index);
1353
1354         /* Update stats */
1355         atomic64_inc(&zram->stats.pages_stored);
1356         return ret;
1357 }
1358
1359 static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
1360                                 u32 index, int offset, struct bio *bio)
1361 {
1362         int ret;
1363         struct page *page = NULL;
1364         void *src;
1365         struct bio_vec vec;
1366
1367         vec = *bvec;
1368         if (is_partial_io(bvec)) {
1369                 void *dst;
1370                 /*
1371                  * This is a partial IO. We need to read the full page
1372                  * before writing the changes.
1373                  */
1374                 page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
1375                 if (!page)
1376                         return -ENOMEM;
1377
1378                 ret = __zram_bvec_read(zram, page, index, bio, true);
1379                 if (ret)
1380                         goto out;
1381
1382                 src = kmap_atomic(bvec->bv_page);
1383                 dst = kmap_atomic(page);
1384                 memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len);
1385                 kunmap_atomic(dst);
1386                 kunmap_atomic(src);
1387
1388                 vec.bv_page = page;
1389                 vec.bv_len = PAGE_SIZE;
1390                 vec.bv_offset = 0;
1391         }
1392
1393         ret = __zram_bvec_write(zram, &vec, index, bio);
1394 out:
1395         if (is_partial_io(bvec))
1396                 __free_page(page);
1397         return ret;
1398 }
1399
1400 /*
1401  * zram_bio_discard - handler on discard request
1402  * @index: physical block index in PAGE_SIZE units
1403  * @offset: byte offset within physical block
1404  */
1405 static void zram_bio_discard(struct zram *zram, u32 index,
1406                              int offset, struct bio *bio)
1407 {
1408         size_t n = bio->bi_iter.bi_size;
1409
1410         /*
1411          * zram manages data in physical block size units. Because logical block
1412          * size isn't identical to the physical block size on some architectures,
1413          * we could get a discard request pointing to a specific offset within a
1414          * certain physical block.  Although we can handle this request by
1415          * reading that physical block and decompressing and partially zeroing
1416          * and re-compressing and then re-storing it, this isn't reasonable
1417          * because our intent with a discard request is to save memory.  So
1418          * skipping this logical block is appropriate here.
1419          */
1420         if (offset) {
1421                 if (n <= (PAGE_SIZE - offset))
1422                         return;
1423
1424                 n -= (PAGE_SIZE - offset);
1425                 index++;
1426         }
1427
1428         while (n >= PAGE_SIZE) {
1429                 zram_slot_lock(zram, index);
1430                 zram_free_page(zram, index);
1431                 zram_slot_unlock(zram, index);
1432                 atomic64_inc(&zram->stats.notify_free);
1433                 index++;
1434                 n -= PAGE_SIZE;
1435         }
1436 }
1437
1438 /*
1439  * Returns an errno if something goes wrong. Otherwise returns 0 or 1:
1440  * Returns 0 if the IO request was completed synchronously.
1441  * Returns 1 if the IO request was successfully submitted.
1442  */
1443 static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
1444                         int offset, unsigned int op, struct bio *bio)
1445 {
1446         unsigned long start_time = jiffies;
1447         struct request_queue *q = zram->disk->queue;
1448         int ret;
1449
1450         generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT,
1451                         &zram->disk->part0);
1452
1453         if (!op_is_write(op)) {
1454                 atomic64_inc(&zram->stats.num_reads);
1455                 ret = zram_bvec_read(zram, bvec, index, offset, bio);
1456                 flush_dcache_page(bvec->bv_page);
1457         } else {
1458                 atomic64_inc(&zram->stats.num_writes);
1459                 ret = zram_bvec_write(zram, bvec, index, offset, bio);
1460         }
1461
1462         generic_end_io_acct(q, op, &zram->disk->part0, start_time);
1463
1464         zram_slot_lock(zram, index);
1465         zram_accessed(zram, index);
1466         zram_slot_unlock(zram, index);
1467
1468         if (unlikely(ret < 0)) {
1469                 if (!op_is_write(op))
1470                         atomic64_inc(&zram->stats.failed_reads);
1471                 else
1472                         atomic64_inc(&zram->stats.failed_writes);
1473         }
1474
1475         return ret;
1476 }
1477
1478 static void __zram_make_request(struct zram *zram, struct bio *bio)
1479 {
1480         int offset;
1481         u32 index;
1482         struct bio_vec bvec;
1483         struct bvec_iter iter;
1484
1485         index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1486         offset = (bio->bi_iter.bi_sector &
1487                   (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
1488
1489         switch (bio_op(bio)) {
1490         case REQ_OP_DISCARD:
1491         case REQ_OP_WRITE_ZEROES:
1492                 zram_bio_discard(zram, index, offset, bio);
1493                 bio_endio(bio);
1494                 return;
1495         default:
1496                 break;
1497         }
1498
1499         bio_for_each_segment(bvec, bio, iter) {
1500                 struct bio_vec bv = bvec;
1501                 unsigned int unwritten = bvec.bv_len;
1502
1503                 do {
1504                         bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
1505                                                         unwritten);
1506                         if (zram_bvec_rw(zram, &bv, index, offset,
1507                                          bio_op(bio), bio) < 0)
1508                                 goto out;
1509
1510                         bv.bv_offset += bv.bv_len;
1511                         unwritten -= bv.bv_len;
1512
1513                         update_position(&index, &offset, &bv);
1514                 } while (unwritten);
1515         }
1516
1517         bio_endio(bio);
1518         return;
1519
1520 out:
1521         bio_io_error(bio);
1522 }
1523
1524 /*
1525  * Handler function for all zram I/O requests.
1526  */
1527 static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
1528 {
1529         struct zram *zram = queue->queuedata;
1530
1531         if (!valid_io_request(zram, bio->bi_iter.bi_sector,
1532                                         bio->bi_iter.bi_size)) {
1533                 atomic64_inc(&zram->stats.invalid_io);
1534                 goto error;
1535         }
1536
1537         __zram_make_request(zram, bio);
1538         return BLK_QC_T_NONE;
1539
1540 error:
1541         bio_io_error(bio);
1542         return BLK_QC_T_NONE;
1543 }
1544
1545 static void zram_slot_free_notify(struct block_device *bdev,
1546                                 unsigned long index)
1547 {
1548         struct zram *zram;
1549
1550         zram = bdev->bd_disk->private_data;
1551
1552         atomic64_inc(&zram->stats.notify_free);
1553         if (!zram_slot_trylock(zram, index)) {
1554                 atomic64_inc(&zram->stats.miss_free);
1555                 return;
1556         }
1557
1558         zram_free_page(zram, index);
1559         zram_slot_unlock(zram, index);
1560 }
1561
1562 static int zram_rw_page(struct block_device *bdev, sector_t sector,
1563                        struct page *page, unsigned int op)
1564 {
1565         int offset, ret;
1566         u32 index;
1567         struct zram *zram;
1568         struct bio_vec bv;
1569
1570         if (PageTransHuge(page))
1571                 return -ENOTSUPP;
1572         zram = bdev->bd_disk->private_data;
1573
1574         if (!valid_io_request(zram, sector, PAGE_SIZE)) {
1575                 atomic64_inc(&zram->stats.invalid_io);
1576                 ret = -EINVAL;
1577                 goto out;
1578         }
1579
1580         index = sector >> SECTORS_PER_PAGE_SHIFT;
1581         offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
1582
1583         bv.bv_page = page;
1584         bv.bv_len = PAGE_SIZE;
1585         bv.bv_offset = 0;
1586
1587         ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
1588 out:
1589         /*
1590          * If the I/O fails, just return an error (i.e., non-zero) without
1591          * calling page_endio.
1592          * This makes the callers of rw_page (e.g., swap_readpage,
1593          * __swap_writepage) resubmit the I/O as a bio request, and
1594          * bio->bi_end_io then handles the error (e.g., SetPageError,
1595          * set_page_dirty and other extra work).
1596          */
1597         if (unlikely(ret < 0))
1598                 return ret;
1599
1600         switch (ret) {
1601         case 0:
1602                 page_endio(page, op_is_write(op), 0);
1603                 break;
1604         case 1:
1605                 ret = 0;
1606                 break;
1607         default:
1608                 WARN_ON(1);
1609         }
1610         return ret;
1611 }
1612
1613 static void zram_reset_device(struct zram *zram)
1614 {
1615         struct zcomp *comp;
1616         u64 disksize;
1617
1618         down_write(&zram->init_lock);
1619
1620         zram->limit_pages = 0;
1621
1622         if (!init_done(zram)) {
1623                 up_write(&zram->init_lock);
1624                 return;
1625         }
1626
1627         comp = zram->comp;
1628         disksize = zram->disksize;
1629         zram->disksize = 0;
1630
1631         set_capacity(zram->disk, 0);
1632         part_stat_set_all(&zram->disk->part0, 0);
1633
1634         up_write(&zram->init_lock);
1635         /* I/O operations on all CPUs are done, so it is safe to free */
1636         zram_meta_free(zram, disksize);
1637         memset(&zram->stats, 0, sizeof(zram->stats));
1638         zcomp_destroy(comp);
1639         reset_bdev(zram);
1640 }
1641
1642 static ssize_t disksize_store(struct device *dev,
1643                 struct device_attribute *attr, const char *buf, size_t len)
1644 {
1645         u64 disksize;
1646         struct zcomp *comp;
1647         struct zram *zram = dev_to_zram(dev);
1648         int err;
1649
1650         disksize = memparse(buf, NULL);
1651         if (!disksize)
1652                 return -EINVAL;
1653
1654         down_write(&zram->init_lock);
1655         if (init_done(zram)) {
1656                 pr_info("Cannot change disksize for initialized device\n");
1657                 err = -EBUSY;
1658                 goto out_unlock;
1659         }
1660
1661         disksize = PAGE_ALIGN(disksize);
1662         if (!zram_meta_alloc(zram, disksize)) {
1663                 err = -ENOMEM;
1664                 goto out_unlock;
1665         }
1666
1667         comp = zcomp_create(zram->compressor);
1668         if (IS_ERR(comp)) {
1669                 pr_err("Cannot initialise %s compressing backend\n",
1670                                 zram->compressor);
1671                 err = PTR_ERR(comp);
1672                 goto out_free_meta;
1673         }
1674
1675         zram->comp = comp;
1676         zram->disksize = disksize;
1677         set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
1678
1679         revalidate_disk(zram->disk);
1680         up_write(&zram->init_lock);
1681
1682         return len;
1683
1684 out_free_meta:
1685         zram_meta_free(zram, disksize);
1686 out_unlock:
1687         up_write(&zram->init_lock);
1688         return err;
1689 }
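
/*
 * Minimal userspace sketch of driving the disksize attribute above; the
 * device name zram0 and the "1G" size are illustrative assumptions only.
 * memparse() accepts plain byte counts or K/M/G-style suffixes, and the
 * result is rounded up to a PAGE_SIZE multiple by PAGE_ALIGN() before it
 * is used.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/sys/block/zram0/disksize", O_WRONLY);
 *
 *		if (fd < 0) {
 *			perror("open");
 *			return 1;
 *		}
 *		if (write(fd, "1G", strlen("1G")) < 0)
 *			perror("write");
 *		close(fd);
 *		return 0;
 *	}
 */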
1690
1691 static ssize_t reset_store(struct device *dev,
1692                 struct device_attribute *attr, const char *buf, size_t len)
1693 {
1694         int ret;
1695         unsigned short do_reset;
1696         struct zram *zram;
1697         struct block_device *bdev;
1698
1699         ret = kstrtou16(buf, 10, &do_reset);
1700         if (ret)
1701                 return ret;
1702
1703         if (!do_reset)
1704                 return -EINVAL;
1705
1706         zram = dev_to_zram(dev);
1707         bdev = bdget_disk(zram->disk, 0);
1708         if (!bdev)
1709                 return -ENOMEM;
1710
1711         mutex_lock(&bdev->bd_mutex);
1712         /* Do not reset an active device or claimed device */
1713         if (bdev->bd_openers || zram->claim) {
1714                 mutex_unlock(&bdev->bd_mutex);
1715                 bdput(bdev);
1716                 return -EBUSY;
1717         }
1718
1719         /* From now on, no one can open /dev/zram[0-9] */
1720         zram->claim = true;
1721         mutex_unlock(&bdev->bd_mutex);
1722
1723         /* Make sure all the pending I/O is finished */
1724         fsync_bdev(bdev);
1725         zram_reset_device(zram);
1726         revalidate_disk(zram->disk);
1727         bdput(bdev);
1728
1729         mutex_lock(&bdev->bd_mutex);
1730         zram->claim = false;
1731         mutex_unlock(&bdev->bd_mutex);
1732
1733         return len;
1734 }
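
/*
 * Note on the reset attribute above: any non-zero value written to
 * /sys/block/zram<id>/reset triggers the reset, but the store fails with
 * -EBUSY while the device is still open (e.g. still mounted or active as
 * swap), so the device has to be released first. The same open()/write()
 * pattern as in the disksize sketch above applies.
 */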
1735
1736 static int zram_open(struct block_device *bdev, fmode_t mode)
1737 {
1738         int ret = 0;
1739         struct zram *zram;
1740
1741         WARN_ON(!mutex_is_locked(&bdev->bd_mutex));
1742
1743         zram = bdev->bd_disk->private_data;
1744         /* zram was claimed for reset, so the open request fails */
1745         if (zram->claim)
1746                 ret = -EBUSY;
1747
1748         return ret;
1749 }
1750
1751 static const struct block_device_operations zram_devops = {
1752         .open = zram_open,
1753         .swap_slot_free_notify = zram_slot_free_notify,
1754         .rw_page = zram_rw_page,
1755         .owner = THIS_MODULE
1756 };
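
/*
 * A short map of the operations above: open() is only gated by the reset
 * claim, swap_slot_free_notify() lets the swap code tell zram that a swap
 * slot was freed so the compressed copy can be dropped immediately, and
 * rw_page() is the synchronous single-page path used by callers such as
 * the swap code via bdev_read_page()/bdev_write_page(), avoiding bio
 * allocation for the common case.
 */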
1757
1758 static DEVICE_ATTR_WO(compact);
1759 static DEVICE_ATTR_RW(disksize);
1760 static DEVICE_ATTR_RO(initstate);
1761 static DEVICE_ATTR_WO(reset);
1762 static DEVICE_ATTR_WO(mem_limit);
1763 static DEVICE_ATTR_WO(mem_used_max);
1764 static DEVICE_ATTR_WO(idle);
1765 static DEVICE_ATTR_RW(max_comp_streams);
1766 static DEVICE_ATTR_RW(comp_algorithm);
1767 #ifdef CONFIG_ZRAM_WRITEBACK
1768 static DEVICE_ATTR_RW(backing_dev);
1769 static DEVICE_ATTR_WO(writeback);
1770 #endif
1771
1772 static struct attribute *zram_disk_attrs[] = {
1773         &dev_attr_disksize.attr,
1774         &dev_attr_initstate.attr,
1775         &dev_attr_reset.attr,
1776         &dev_attr_compact.attr,
1777         &dev_attr_mem_limit.attr,
1778         &dev_attr_mem_used_max.attr,
1779         &dev_attr_idle.attr,
1780         &dev_attr_max_comp_streams.attr,
1781         &dev_attr_comp_algorithm.attr,
1782 #ifdef CONFIG_ZRAM_WRITEBACK
1783         &dev_attr_backing_dev.attr,
1784         &dev_attr_writeback.attr,
1785 #endif
1786         &dev_attr_io_stat.attr,
1787         &dev_attr_mm_stat.attr,
1788 #ifdef CONFIG_ZRAM_WRITEBACK
1789         &dev_attr_bd_stat.attr,
1790 #endif
1791         &dev_attr_debug_stat.attr,
1792         NULL,
1793 };
1794
1795 static const struct attribute_group zram_disk_attr_group = {
1796         .attrs = zram_disk_attrs,
1797 };
1798
1799 static const struct attribute_group *zram_disk_attr_groups[] = {
1800         &zram_disk_attr_group,
1801         NULL,
1802 };
1803
1804 /*
1805  * Allocate and initialize a new zram device. The function returns
1806  * a device_id >= 0 upon success, and a negative value otherwise.
1807  */
1808 static int zram_add(void)
1809 {
1810         struct zram *zram;
1811         struct request_queue *queue;
1812         int ret, device_id;
1813
1814         zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
1815         if (!zram)
1816                 return -ENOMEM;
1817
1818         ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
1819         if (ret < 0)
1820                 goto out_free_dev;
1821         device_id = ret;
1822
1823         init_rwsem(&zram->init_lock);
1824
1825         queue = blk_alloc_queue(GFP_KERNEL);
1826         if (!queue) {
1827                 pr_err("Error allocating disk queue for device %d\n",
1828                         device_id);
1829                 ret = -ENOMEM;
1830                 goto out_free_idr;
1831         }
1832
1833         blk_queue_make_request(queue, zram_make_request);
1834
1835         /* gendisk structure */
1836         zram->disk = alloc_disk(1);
1837         if (!zram->disk) {
1838                 pr_err("Error allocating disk structure for device %d\n",
1839                         device_id);
1840                 ret = -ENOMEM;
1841                 goto out_free_queue;
1842         }
1843
1844         zram->disk->major = zram_major;
1845         zram->disk->first_minor = device_id;
1846         zram->disk->fops = &zram_devops;
1847         zram->disk->queue = queue;
1848         zram->disk->queue->queuedata = zram;
1849         zram->disk->private_data = zram;
1850         snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
1851
1852         /* Actual capacity set using sysfs (/sys/block/zram<id>/disksize) */
1853         set_capacity(zram->disk, 0);
1854         /* zram devices somewhat resemble non-rotational disks */
1855         blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
1856         blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
1857
1858         /*
1859          * Ensure that we always get PAGE_SIZE-aligned
1860          * and n*PAGE_SIZE-sized I/O requests.
1861          */
1862         blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
1863         blk_queue_logical_block_size(zram->disk->queue,
1864                                         ZRAM_LOGICAL_BLOCK_SIZE);
1865         blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
1866         blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
1867         zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
1868         blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
1869         blk_queue_flag_set(QUEUE_FLAG_DISCARD, zram->disk->queue);
1870
1871         /*
1872          * zram_bio_discard() will clear all logical blocks if the logical
1873          * block size is identical to the physical block size (PAGE_SIZE).
1874          * But if they differ, we skip discarding the logical blocks in the
1875          * parts of the request range that are not aligned to the physical
1876          * block size, so we cannot ensure that all discarded logical blocks
1877          * are zeroed (a worked example follows this function).
1878          */
1879         if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
1880                 blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
1881
1882         zram->disk->queue->backing_dev_info->capabilities |=
1883                         (BDI_CAP_STABLE_WRITES | BDI_CAP_SYNCHRONOUS_IO);
1884         device_add_disk(NULL, zram->disk, zram_disk_attr_groups);
1885
1886         strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
1887
1888         zram_debugfs_register(zram);
1889         pr_info("Added device: %s\n", zram->disk->disk_name);
1890         return device_id;
1891
1892 out_free_queue:
1893         blk_cleanup_queue(queue);
1894 out_free_idr:
1895         idr_remove(&zram_index_idr, device_id);
1896 out_free_dev:
1897         kfree(zram);
1898         return ret;
1899 }
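
/*
 * Worked example for the discard/write-zeroes setup in zram_add() (a
 * sketch, assuming ZRAM_LOGICAL_BLOCK_SIZE == 4096): with 4K pages every
 * logical block is exactly one zram slot, so a discard frees whole slots
 * and a later read of a freed slot returns zero-filled data, which is why
 * REQ_OP_WRITE_ZEROES can be honoured by the discard path. With 64K pages,
 * by contrast, one page holds sixteen logical blocks, and a discard that
 * covers only part of a page is skipped, so the skipped logical blocks
 * keep their old contents and write-zeroes cannot be advertised.
 */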
1900
1901 static int zram_remove(struct zram *zram)
1902 {
1903         struct block_device *bdev;
1904
1905         bdev = bdget_disk(zram->disk, 0);
1906         if (!bdev)
1907                 return -ENOMEM;
1908
1909         mutex_lock(&bdev->bd_mutex);
1910         if (bdev->bd_openers || zram->claim) {
1911                 mutex_unlock(&bdev->bd_mutex);
1912                 bdput(bdev);
1913                 return -EBUSY;
1914         }
1915
1916         zram->claim = true;
1917         mutex_unlock(&bdev->bd_mutex);
1918
1919         zram_debugfs_unregister(zram);
1920
1921         /* Make sure all the pending I/O is finished */
1922         fsync_bdev(bdev);
1923         zram_reset_device(zram);
1924         bdput(bdev);
1925
1926         pr_info("Removed device: %s\n", zram->disk->disk_name);
1927
1928         del_gendisk(zram->disk);
1929         blk_cleanup_queue(zram->disk->queue);
1930         put_disk(zram->disk);
1931         kfree(zram);
1932         return 0;
1933 }
1934
1935 /* zram-control sysfs attributes */
1936
1937 /*
1938  * NOTE: hot_add is not the usual read-only sysfs attribute, in the sense
1939  * that reading from this file does alter the state of your system: it
1940  * creates a new uninitialized zram device and returns that device's
1941  * device_id (or an error code if it fails to create a new device).
1942  */
1943 static ssize_t hot_add_show(struct class *class,
1944                         struct class_attribute *attr,
1945                         char *buf)
1946 {
1947         int ret;
1948
1949         mutex_lock(&zram_index_mutex);
1950         ret = zram_add();
1951         mutex_unlock(&zram_index_mutex);
1952
1953         if (ret < 0)
1954                 return ret;
1955         return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
1956 }
1957 static CLASS_ATTR_RO(hot_add);
1958
1959 static ssize_t hot_remove_store(struct class *class,
1960                         struct class_attribute *attr,
1961                         const char *buf,
1962                         size_t count)
1963 {
1964         struct zram *zram;
1965         int ret, dev_id;
1966
1967         /* dev_id is gendisk->first_minor, which is `int' */
1968         ret = kstrtoint(buf, 10, &dev_id);
1969         if (ret)
1970                 return ret;
1971         if (dev_id < 0)
1972                 return -EINVAL;
1973
1974         mutex_lock(&zram_index_mutex);
1975
1976         zram = idr_find(&zram_index_idr, dev_id);
1977         if (zram) {
1978                 ret = zram_remove(zram);
1979                 if (!ret)
1980                         idr_remove(&zram_index_idr, dev_id);
1981         } else {
1982                 ret = -ENODEV;
1983         }
1984
1985         mutex_unlock(&zram_index_mutex);
1986         return ret ? ret : count;
1987 }
1988 static CLASS_ATTR_WO(hot_remove);
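
/*
 * Minimal userspace sketch of the hot_add/hot_remove class attributes
 * above (illustration only, error handling abbreviated): reading hot_add
 * creates a new device and returns its id (the device then appears as
 * /dev/zram<id>), and writing that id to hot_remove destroys it again.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		char buf[16] = "";
 *		int fd, id;
 *
 *		fd = open("/sys/class/zram-control/hot_add", O_RDONLY);
 *		if (fd < 0 || read(fd, buf, sizeof(buf) - 1) <= 0)
 *			return 1;
 *		close(fd);
 *		id = atoi(buf);
 *		printf("created zram%d\n", id);
 *
 *		fd = open("/sys/class/zram-control/hot_remove", O_WRONLY);
 *		if (fd < 0)
 *			return 1;
 *		snprintf(buf, sizeof(buf), "%d", id);
 *		write(fd, buf, strlen(buf));
 *		close(fd);
 *		return 0;
 *	}
 */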
1989
1990 static struct attribute *zram_control_class_attrs[] = {
1991         &class_attr_hot_add.attr,
1992         &class_attr_hot_remove.attr,
1993         NULL,
1994 };
1995 ATTRIBUTE_GROUPS(zram_control_class);
1996
1997 static struct class zram_control_class = {
1998         .name           = "zram-control",
1999         .owner          = THIS_MODULE,
2000         .class_groups   = zram_control_class_groups,
2001 };
2002
2003 static int zram_remove_cb(int id, void *ptr, void *data)
2004 {
2005         zram_remove(ptr);
2006         return 0;
2007 }
2008
2009 static void destroy_devices(void)
2010 {
2011         class_unregister(&zram_control_class);
2012         idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
2013         zram_debugfs_destroy();
2014         idr_destroy(&zram_index_idr);
2015         unregister_blkdev(zram_major, "zram");
2016         cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2017 }
2018
2019 static int __init zram_init(void)
2020 {
2021         int ret;
2022
2023         ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
2024                                       zcomp_cpu_up_prepare, zcomp_cpu_dead);
2025         if (ret < 0)
2026                 return ret;
2027
2028         ret = class_register(&zram_control_class);
2029         if (ret) {
2030                 pr_err("Unable to register zram-control class\n");
2031                 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2032                 return ret;
2033         }
2034
2035         zram_debugfs_create();
2036         zram_major = register_blkdev(0, "zram");
2037         if (zram_major <= 0) {
2038                 pr_err("Unable to get major number\n");
2039                 class_unregister(&zram_control_class);
2040                 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2041                 return -EBUSY;
2042         }
2043
2044         while (num_devices != 0) {
2045                 mutex_lock(&zram_index_mutex);
2046                 ret = zram_add();
2047                 mutex_unlock(&zram_index_mutex);
2048                 if (ret < 0)
2049                         goto out_error;
2050                 num_devices--;
2051         }
2052
2053         return 0;
2054
2055 out_error:
2056         destroy_devices();
2057         return ret;
2058 }
2059
2060 static void __exit zram_exit(void)
2061 {
2062         destroy_devices();
2063 }
2064
2065 module_init(zram_init);
2066 module_exit(zram_exit);
2067
2068 module_param(num_devices, uint, 0);
2069 MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
2070
2071 MODULE_LICENSE("Dual BSD/GPL");
2072 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
2073 MODULE_DESCRIPTION("Compressed RAM Block Device");