Merge branch 'upstream' of git://git.linux-mips.org/pub/scm/ralf/upstream-linus
[sfrench/cifs-2.6.git] / mm / shmem.c
1 /*
2  * Resizable virtual memory filesystem for Linux.
3  *
4  * Copyright (C) 2000 Linus Torvalds.
5  *               2000 Transmeta Corp.
6  *               2000-2001 Christoph Rohland
7  *               2000-2001 SAP AG
8  *               2002 Red Hat Inc.
9  * Copyright (C) 2002-2011 Hugh Dickins.
10  * Copyright (C) 2011 Google Inc.
11  * Copyright (C) 2002-2005 VERITAS Software Corporation.
12  * Copyright (C) 2004 Andi Kleen, SuSE Labs
13  *
14  * Extended attribute support for tmpfs:
15  * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16  * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17  *
18  * tiny-shmem:
19  * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20  *
21  * This file is released under the GPL.
22  */
23
24 #include <linux/fs.h>
25 #include <linux/init.h>
26 #include <linux/vfs.h>
27 #include <linux/mount.h>
28 #include <linux/ramfs.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include <linux/mm.h>
32 #include <linux/export.h>
33 #include <linux/swap.h>
34 #include <linux/aio.h>
35
36 static struct vfsmount *shm_mnt;
37
38 #ifdef CONFIG_SHMEM
39 /*
40  * This virtual memory filesystem is heavily based on the ramfs. It
41  * extends ramfs by the ability to use swap and honor resource limits
42  * which makes it a completely usable filesystem.
43  */
44
45 #include <linux/xattr.h>
46 #include <linux/exportfs.h>
47 #include <linux/posix_acl.h>
48 #include <linux/posix_acl_xattr.h>
49 #include <linux/mman.h>
50 #include <linux/string.h>
51 #include <linux/slab.h>
52 #include <linux/backing-dev.h>
53 #include <linux/shmem_fs.h>
54 #include <linux/writeback.h>
55 #include <linux/blkdev.h>
56 #include <linux/pagevec.h>
57 #include <linux/percpu_counter.h>
58 #include <linux/falloc.h>
59 #include <linux/splice.h>
60 #include <linux/security.h>
61 #include <linux/swapops.h>
62 #include <linux/mempolicy.h>
63 #include <linux/namei.h>
64 #include <linux/ctype.h>
65 #include <linux/migrate.h>
66 #include <linux/highmem.h>
67 #include <linux/seq_file.h>
68 #include <linux/magic.h>
69
70 #include <asm/uaccess.h>
71 #include <asm/pgtable.h>
72
/* Number of 512-byte units per page-cache page; scales i_blocks adjustments */
#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
/*
 * Byte size -> page count handed to the vm accounting helpers; the size is
 * first aligned with PAGE_CACHE_ALIGN.
 */
#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
81
/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_mutex making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 *
 * shmem_writepage reads inode->i_private and bumps nr_unswapped while
 * holding inode->i_lock, and only when waitq is NULL and the page index
 * lies in [start, next).
 */
struct shmem_falloc {
        wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
        pgoff_t start;          /* start of range currently being fallocated */
        pgoff_t next;           /* the next page offset to be fallocated */
        pgoff_t nr_falloced;    /* how many new pages have been fallocated */
        pgoff_t nr_unswapped;   /* how often writepage refused to swap out */
};
94
/*
 * Flag allocation requirements to shmem_getpage: passed as the sgp argument
 * of shmem_getpage() and shmem_getpage_gfp().
 */
enum sgp_type {
        SGP_READ,       /* don't exceed i_size, don't allocate page */
        SGP_CACHE,      /* don't exceed i_size, may allocate page */
        SGP_DIRTY,      /* like SGP_CACHE, but set new page dirty */
        SGP_WRITE,      /* may exceed i_size, may allocate !Uptodate page */
        SGP_FALLOC,     /* like SGP_WRITE, but make existing page Uptodate */
};
103
104 #ifdef CONFIG_TMPFS
105 static unsigned long shmem_default_max_blocks(void)
106 {
107         return totalram_pages / 2;
108 }
109
110 static unsigned long shmem_default_max_inodes(void)
111 {
112         return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
113 }
114 #endif
115
/*
 * Forward declarations for helpers that are called before their
 * definitions: shmem_getpage_gfp() by shmem_getpage(), and the page
 * replacement pair by shmem_unuse_inode().
 */
static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
                                struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
        struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
121
122 static inline int shmem_getpage(struct inode *inode, pgoff_t index,
123         struct page **pagep, enum sgp_type sgp, int *fault_type)
124 {
125         return shmem_getpage_gfp(inode, index, pagep, sgp,
126                         mapping_gfp_mask(inode->i_mapping), fault_type);
127 }
128
129 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
130 {
131         return sb->s_fs_info;
132 }
133
134 /*
135  * shmem_file_setup pre-accounts the whole fixed size of a VM object,
136  * for shared memory and for shared anonymous (/dev/zero) mappings
137  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
138  * consistent with the pre-accounting of private mappings ...
139  */
140 static inline int shmem_acct_size(unsigned long flags, loff_t size)
141 {
142         return (flags & VM_NORESERVE) ?
143                 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
144 }
145
146 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
147 {
148         if (!(flags & VM_NORESERVE))
149                 vm_unacct_memory(VM_ACCT(size));
150 }
151
152 static inline int shmem_reacct_size(unsigned long flags,
153                 loff_t oldsize, loff_t newsize)
154 {
155         if (!(flags & VM_NORESERVE)) {
156                 if (VM_ACCT(newsize) > VM_ACCT(oldsize))
157                         return security_vm_enough_memory_mm(current->mm,
158                                         VM_ACCT(newsize) - VM_ACCT(oldsize));
159                 else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
160                         vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
161         }
162         return 0;
163 }
164
165 /*
166  * ... whereas tmpfs objects are accounted incrementally as
167  * pages are allocated, in order to allow huge sparse files.
168  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
169  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
170  */
171 static inline int shmem_acct_block(unsigned long flags)
172 {
173         return (flags & VM_NORESERVE) ?
174                 security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
175 }
176
177 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
178 {
179         if (flags & VM_NORESERVE)
180                 vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
181 }
182
/* Forward declarations of the operation tables referenced in this file */
static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;

/*
 * Backing device info: readahead is disabled, and (as the comment in
 * shmem_writepage puts it) the capabilities prevent regular writeback or
 * sync from ever calling shmem_writepage.
 */
static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
        .ra_pages       = 0,    /* No readahead */
        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};

/*
 * shmem_unuse()'s list of swapped-out inodes, linked through
 * shmem_inode_info.swaplist and protected by shmem_swaplist_mutex.
 */
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
198
199 static int shmem_reserve_inode(struct super_block *sb)
200 {
201         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
202         if (sbinfo->max_inodes) {
203                 spin_lock(&sbinfo->stat_lock);
204                 if (!sbinfo->free_inodes) {
205                         spin_unlock(&sbinfo->stat_lock);
206                         return -ENOSPC;
207                 }
208                 sbinfo->free_inodes--;
209                 spin_unlock(&sbinfo->stat_lock);
210         }
211         return 0;
212 }
213
214 static void shmem_free_inode(struct super_block *sb)
215 {
216         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
217         if (sbinfo->max_inodes) {
218                 spin_lock(&sbinfo->stat_lock);
219                 sbinfo->free_inodes++;
220                 spin_unlock(&sbinfo->stat_lock);
221         }
222 }
223
224 /**
225  * shmem_recalc_inode - recalculate the block usage of an inode
226  * @inode: inode to recalc
227  *
228  * We have to calculate the free blocks since the mm can drop
229  * undirtied hole pages behind our back.
230  *
231  * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
232  * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
233  *
234  * It has to be called with the spinlock held.
235  */
236 static void shmem_recalc_inode(struct inode *inode)
237 {
238         struct shmem_inode_info *info = SHMEM_I(inode);
239         long freed;
240
241         freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
242         if (freed > 0) {
243                 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
244                 if (sbinfo->max_blocks)
245                         percpu_counter_add(&sbinfo->used_blocks, -freed);
246                 info->alloced -= freed;
247                 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
248                 shmem_unacct_blocks(info->flags, freed);
249         }
250 }
251
252 /*
253  * Replace item expected in radix tree by a new item, while holding tree lock.
254  */
255 static int shmem_radix_tree_replace(struct address_space *mapping,
256                         pgoff_t index, void *expected, void *replacement)
257 {
258         void **pslot;
259         void *item;
260
261         VM_BUG_ON(!expected);
262         VM_BUG_ON(!replacement);
263         pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
264         if (!pslot)
265                 return -ENOENT;
266         item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
267         if (item != expected)
268                 return -ENOENT;
269         radix_tree_replace_slot(pslot, replacement);
270         return 0;
271 }
272
273 /*
274  * Sometimes, before we decide whether to proceed or to fail, we must check
275  * that an entry was not already brought back from swap by a racing thread.
276  *
277  * Checking page is not enough: by the time a SwapCache page is locked, it
278  * might be reused, and again be SwapCache, using the same swap as before.
279  */
280 static bool shmem_confirm_swap(struct address_space *mapping,
281                                pgoff_t index, swp_entry_t swap)
282 {
283         void *item;
284
285         rcu_read_lock();
286         item = radix_tree_lookup(&mapping->page_tree, index);
287         rcu_read_unlock();
288         return item == swp_to_radix_entry(swap);
289 }
290
291 /*
292  * Like add_to_page_cache_locked, but error if expected item has gone.
293  */
294 static int shmem_add_to_page_cache(struct page *page,
295                                    struct address_space *mapping,
296                                    pgoff_t index, void *expected)
297 {
298         int error;
299
300         VM_BUG_ON_PAGE(!PageLocked(page), page);
301         VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
302
303         page_cache_get(page);
304         page->mapping = mapping;
305         page->index = index;
306
307         spin_lock_irq(&mapping->tree_lock);
308         if (!expected)
309                 error = radix_tree_insert(&mapping->page_tree, index, page);
310         else
311                 error = shmem_radix_tree_replace(mapping, index, expected,
312                                                                  page);
313         if (!error) {
314                 mapping->nrpages++;
315                 __inc_zone_page_state(page, NR_FILE_PAGES);
316                 __inc_zone_page_state(page, NR_SHMEM);
317                 spin_unlock_irq(&mapping->tree_lock);
318         } else {
319                 page->mapping = NULL;
320                 spin_unlock_irq(&mapping->tree_lock);
321                 page_cache_release(page);
322         }
323         return error;
324 }
325
326 /*
327  * Like delete_from_page_cache, but substitutes swap for page.
328  */
329 static void shmem_delete_from_page_cache(struct page *page, void *radswap)
330 {
331         struct address_space *mapping = page->mapping;
332         int error;
333
334         spin_lock_irq(&mapping->tree_lock);
335         error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
336         page->mapping = NULL;
337         mapping->nrpages--;
338         __dec_zone_page_state(page, NR_FILE_PAGES);
339         __dec_zone_page_state(page, NR_SHMEM);
340         spin_unlock_irq(&mapping->tree_lock);
341         page_cache_release(page);
342         BUG_ON(error);
343 }
344
345 /*
346  * Remove swap entry from radix tree, free the swap and its page cache.
347  */
348 static int shmem_free_swap(struct address_space *mapping,
349                            pgoff_t index, void *radswap)
350 {
351         void *old;
352
353         spin_lock_irq(&mapping->tree_lock);
354         old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
355         spin_unlock_irq(&mapping->tree_lock);
356         if (old != radswap)
357                 return -ENOENT;
358         free_swap_and_cache(radix_to_swp_entry(radswap));
359         return 0;
360 }
361
362 /*
363  * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
364  */
365 void shmem_unlock_mapping(struct address_space *mapping)
366 {
367         struct pagevec pvec;
368         pgoff_t indices[PAGEVEC_SIZE];
369         pgoff_t index = 0;
370
371         pagevec_init(&pvec, 0);
372         /*
373          * Minor point, but we might as well stop if someone else SHM_LOCKs it.
374          */
375         while (!mapping_unevictable(mapping)) {
376                 /*
377                  * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
378                  * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
379                  */
380                 pvec.nr = find_get_entries(mapping, index,
381                                            PAGEVEC_SIZE, pvec.pages, indices);
382                 if (!pvec.nr)
383                         break;
384                 index = indices[pvec.nr - 1] + 1;
385                 pagevec_remove_exceptionals(&pvec);
386                 check_move_unevictable_pages(pvec.pages, pvec.nr);
387                 pagevec_release(&pvec);
388                 cond_resched();
389         }
390 }
391
392 /*
393  * Remove range of pages and swap entries from radix tree, and free them.
394  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
395  */
396 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
397                                                                  bool unfalloc)
398 {
399         struct address_space *mapping = inode->i_mapping;
400         struct shmem_inode_info *info = SHMEM_I(inode);
401         pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
402         pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
403         unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
404         unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
405         struct pagevec pvec;
406         pgoff_t indices[PAGEVEC_SIZE];
407         long nr_swaps_freed = 0;
408         pgoff_t index;
409         int i;
410
411         if (lend == -1)
412                 end = -1;       /* unsigned, so actually very big */
413
414         pagevec_init(&pvec, 0);
415         index = start;
416         while (index < end) {
417                 pvec.nr = find_get_entries(mapping, index,
418                         min(end - index, (pgoff_t)PAGEVEC_SIZE),
419                         pvec.pages, indices);
420                 if (!pvec.nr)
421                         break;
422                 mem_cgroup_uncharge_start();
423                 for (i = 0; i < pagevec_count(&pvec); i++) {
424                         struct page *page = pvec.pages[i];
425
426                         index = indices[i];
427                         if (index >= end)
428                                 break;
429
430                         if (radix_tree_exceptional_entry(page)) {
431                                 if (unfalloc)
432                                         continue;
433                                 nr_swaps_freed += !shmem_free_swap(mapping,
434                                                                 index, page);
435                                 continue;
436                         }
437
438                         if (!trylock_page(page))
439                                 continue;
440                         if (!unfalloc || !PageUptodate(page)) {
441                                 if (page->mapping == mapping) {
442                                         VM_BUG_ON_PAGE(PageWriteback(page), page);
443                                         truncate_inode_page(mapping, page);
444                                 }
445                         }
446                         unlock_page(page);
447                 }
448                 pagevec_remove_exceptionals(&pvec);
449                 pagevec_release(&pvec);
450                 mem_cgroup_uncharge_end();
451                 cond_resched();
452                 index++;
453         }
454
455         if (partial_start) {
456                 struct page *page = NULL;
457                 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
458                 if (page) {
459                         unsigned int top = PAGE_CACHE_SIZE;
460                         if (start > end) {
461                                 top = partial_end;
462                                 partial_end = 0;
463                         }
464                         zero_user_segment(page, partial_start, top);
465                         set_page_dirty(page);
466                         unlock_page(page);
467                         page_cache_release(page);
468                 }
469         }
470         if (partial_end) {
471                 struct page *page = NULL;
472                 shmem_getpage(inode, end, &page, SGP_READ, NULL);
473                 if (page) {
474                         zero_user_segment(page, 0, partial_end);
475                         set_page_dirty(page);
476                         unlock_page(page);
477                         page_cache_release(page);
478                 }
479         }
480         if (start >= end)
481                 return;
482
483         index = start;
484         while (index < end) {
485                 cond_resched();
486
487                 pvec.nr = find_get_entries(mapping, index,
488                                 min(end - index, (pgoff_t)PAGEVEC_SIZE),
489                                 pvec.pages, indices);
490                 if (!pvec.nr) {
491                         /* If all gone or hole-punch or unfalloc, we're done */
492                         if (index == start || end != -1)
493                                 break;
494                         /* But if truncating, restart to make sure all gone */
495                         index = start;
496                         continue;
497                 }
498                 mem_cgroup_uncharge_start();
499                 for (i = 0; i < pagevec_count(&pvec); i++) {
500                         struct page *page = pvec.pages[i];
501
502                         index = indices[i];
503                         if (index >= end)
504                                 break;
505
506                         if (radix_tree_exceptional_entry(page)) {
507                                 if (unfalloc)
508                                         continue;
509                                 if (shmem_free_swap(mapping, index, page)) {
510                                         /* Swap was replaced by page: retry */
511                                         index--;
512                                         break;
513                                 }
514                                 nr_swaps_freed++;
515                                 continue;
516                         }
517
518                         lock_page(page);
519                         if (!unfalloc || !PageUptodate(page)) {
520                                 if (page->mapping == mapping) {
521                                         VM_BUG_ON_PAGE(PageWriteback(page), page);
522                                         truncate_inode_page(mapping, page);
523                                 } else {
524                                         /* Page was replaced by swap: retry */
525                                         unlock_page(page);
526                                         index--;
527                                         break;
528                                 }
529                         }
530                         unlock_page(page);
531                 }
532                 pagevec_remove_exceptionals(&pvec);
533                 pagevec_release(&pvec);
534                 mem_cgroup_uncharge_end();
535                 index++;
536         }
537
538         spin_lock(&info->lock);
539         info->swapped -= nr_swaps_freed;
540         shmem_recalc_inode(inode);
541         spin_unlock(&info->lock);
542 }
543
544 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
545 {
546         shmem_undo_range(inode, lstart, lend, false);
547         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
548 }
549 EXPORT_SYMBOL_GPL(shmem_truncate_range);
550
551 static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
552 {
553         struct inode *inode = dentry->d_inode;
554         int error;
555
556         error = inode_change_ok(inode, attr);
557         if (error)
558                 return error;
559
560         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
561                 loff_t oldsize = inode->i_size;
562                 loff_t newsize = attr->ia_size;
563
564                 if (newsize != oldsize) {
565                         error = shmem_reacct_size(SHMEM_I(inode)->flags,
566                                         oldsize, newsize);
567                         if (error)
568                                 return error;
569                         i_size_write(inode, newsize);
570                         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
571                 }
572                 if (newsize < oldsize) {
573                         loff_t holebegin = round_up(newsize, PAGE_SIZE);
574                         unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
575                         shmem_truncate_range(inode, newsize, (loff_t)-1);
576                         /* unmap again to remove racily COWed private pages */
577                         unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
578                 }
579         }
580
581         setattr_copy(inode, attr);
582         if (attr->ia_valid & ATTR_MODE)
583                 error = posix_acl_chmod(inode, inode->i_mode);
584         return error;
585 }
586
587 static void shmem_evict_inode(struct inode *inode)
588 {
589         struct shmem_inode_info *info = SHMEM_I(inode);
590
591         if (inode->i_mapping->a_ops == &shmem_aops) {
592                 shmem_unacct_size(info->flags, inode->i_size);
593                 inode->i_size = 0;
594                 shmem_truncate_range(inode, 0, (loff_t)-1);
595                 if (!list_empty(&info->swaplist)) {
596                         mutex_lock(&shmem_swaplist_mutex);
597                         list_del_init(&info->swaplist);
598                         mutex_unlock(&shmem_swaplist_mutex);
599                 }
600         } else
601                 kfree(info->symlink);
602
603         simple_xattrs_free(&info->xattrs);
604         WARN_ON(inode->i_blocks);
605         shmem_free_inode(inode->i_sb);
606         clear_inode(inode);
607 }
608
609 /*
610  * If swap found in inode, free it and move page from swapcache to filecache.
611  */
612 static int shmem_unuse_inode(struct shmem_inode_info *info,
613                              swp_entry_t swap, struct page **pagep)
614 {
615         struct address_space *mapping = info->vfs_inode.i_mapping;
616         void *radswap;
617         pgoff_t index;
618         gfp_t gfp;
619         int error = 0;
620
621         radswap = swp_to_radix_entry(swap);
622         index = radix_tree_locate_item(&mapping->page_tree, radswap);
623         if (index == -1)
624                 return 0;
625
626         /*
627          * Move _head_ to start search for next from here.
628          * But be careful: shmem_evict_inode checks list_empty without taking
629          * mutex, and there's an instant in list_move_tail when info->swaplist
630          * would appear empty, if it were the only one on shmem_swaplist.
631          */
632         if (shmem_swaplist.next != &info->swaplist)
633                 list_move_tail(&shmem_swaplist, &info->swaplist);
634
635         gfp = mapping_gfp_mask(mapping);
636         if (shmem_should_replace_page(*pagep, gfp)) {
637                 mutex_unlock(&shmem_swaplist_mutex);
638                 error = shmem_replace_page(pagep, gfp, info, index);
639                 mutex_lock(&shmem_swaplist_mutex);
640                 /*
641                  * We needed to drop mutex to make that restrictive page
642                  * allocation, but the inode might have been freed while we
643                  * dropped it: although a racing shmem_evict_inode() cannot
644                  * complete without emptying the radix_tree, our page lock
645                  * on this swapcache page is not enough to prevent that -
646                  * free_swap_and_cache() of our swap entry will only
647                  * trylock_page(), removing swap from radix_tree whatever.
648                  *
649                  * We must not proceed to shmem_add_to_page_cache() if the
650                  * inode has been freed, but of course we cannot rely on
651                  * inode or mapping or info to check that.  However, we can
652                  * safely check if our swap entry is still in use (and here
653                  * it can't have got reused for another page): if it's still
654                  * in use, then the inode cannot have been freed yet, and we
655                  * can safely proceed (if it's no longer in use, that tells
656                  * nothing about the inode, but we don't need to unuse swap).
657                  */
658                 if (!page_swapcount(*pagep))
659                         error = -ENOENT;
660         }
661
662         /*
663          * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
664          * but also to hold up shmem_evict_inode(): so inode cannot be freed
665          * beneath us (pagelock doesn't help until the page is in pagecache).
666          */
667         if (!error)
668                 error = shmem_add_to_page_cache(*pagep, mapping, index,
669                                                 radswap);
670         if (error != -ENOMEM) {
671                 /*
672                  * Truncation and eviction use free_swap_and_cache(), which
673                  * only does trylock page: if we raced, best clean up here.
674                  */
675                 delete_from_swap_cache(*pagep);
676                 set_page_dirty(*pagep);
677                 if (!error) {
678                         spin_lock(&info->lock);
679                         info->swapped--;
680                         spin_unlock(&info->lock);
681                         swap_free(swap);
682                 }
683                 error = 1;      /* not an error, but entry was found */
684         }
685         return error;
686 }
687
688 /*
689  * Search through swapped inodes to find and replace swap by page.
690  */
691 int shmem_unuse(swp_entry_t swap, struct page *page)
692 {
693         struct list_head *this, *next;
694         struct shmem_inode_info *info;
695         int found = 0;
696         int error = 0;
697
698         /*
699          * There's a faint possibility that swap page was replaced before
700          * caller locked it: caller will come back later with the right page.
701          */
702         if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
703                 goto out;
704
705         /*
706          * Charge page using GFP_KERNEL while we can wait, before taking
707          * the shmem_swaplist_mutex which might hold up shmem_writepage().
708          * Charged back to the user (not to caller) when swap account is used.
709          */
710         error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
711         if (error)
712                 goto out;
713         /* No radix_tree_preload: swap entry keeps a place for page in tree */
714
715         mutex_lock(&shmem_swaplist_mutex);
716         list_for_each_safe(this, next, &shmem_swaplist) {
717                 info = list_entry(this, struct shmem_inode_info, swaplist);
718                 if (info->swapped)
719                         found = shmem_unuse_inode(info, swap, &page);
720                 else
721                         list_del_init(&info->swaplist);
722                 cond_resched();
723                 if (found)
724                         break;
725         }
726         mutex_unlock(&shmem_swaplist_mutex);
727
728         if (found < 0)
729                 error = found;
730 out:
731         unlock_page(page);
732         page_cache_release(page);
733         return error;
734 }
735
736 /*
737  * Move the page from the page cache to the swap cache.
738  */
/*
 * shmem_writepage - try to push one locked shmem page out to swap.
 * @page: the page to write; must be locked on entry (BUG_ON otherwise)
 * @wbc:  writeback control; for_reclaim is tested here and wbc is also
 *        passed on to swap_writepage()
 *
 * Returns 0 once the page has been handed to swap_writepage(), or after the
 * page has been redirtied and unlocked on a call that is not for reclaim.
 * Returns AOP_WRITEPAGE_ACTIVATE, with the page redirtied and still locked,
 * when a reclaim call does not move the page to swap.
 */
739 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
740 {
741         struct shmem_inode_info *info;
742         struct address_space *mapping;
743         struct inode *inode;
744         swp_entry_t swap;
745         pgoff_t index;
746
747         BUG_ON(!PageLocked(page));
748         mapping = page->mapping;
749         index = page->index;
750         inode = mapping->host;
751         info = SHMEM_I(inode);
            /*
             * Keep the page where it is when the inode is flagged VM_LOCKED
             * or when total_swap_pages is zero.
             */
752         if (info->flags & VM_LOCKED)
753                 goto redirty;
754         if (!total_swap_pages)
755                 goto redirty;
756
757         /*
758          * shmem_backing_dev_info's capabilities prevent regular writeback or
759          * sync from ever calling shmem_writepage; but a stacking filesystem
760          * might use ->writepage of its underlying filesystem, in which case
761          * tmpfs should write out to swap only in response to memory pressure,
762          * and not for the writeback threads or sync.
763          */
764         if (!wbc->for_reclaim) {
765                 WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
766                 goto redirty;
767         }
768
769         /*
770          * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
771          * value into swapfile.c, the only way we can correctly account for a
772          * fallocated page arriving here is now to initialize it and write it.
773          *
774          * That's okay for a page already fallocated earlier, but if we have
775          * not yet completed the fallocation, then (a) we want to keep track
776          * of this page in case we have to undo it, and (b) it may not be a
777          * good idea to continue anyway, once we're pushing into swap.  So
778          * reactivate the page, and let shmem_fallocate() quit when too many.
779          */
780         if (!PageUptodate(page)) {
781                 if (inode->i_private) {
782                         struct shmem_falloc *shmem_falloc;
                            /* i_private is re-read under i_lock before use. */
783                         spin_lock(&inode->i_lock);
784                         shmem_falloc = inode->i_private;
785                         if (shmem_falloc &&
786                             !shmem_falloc->waitq &&
787                             index >= shmem_falloc->start &&
788                             index < shmem_falloc->next)
789                                 shmem_falloc->nr_unswapped++;
790                         else
791                                 shmem_falloc = NULL;
792                         spin_unlock(&inode->i_lock);
793                         if (shmem_falloc)
794                                 goto redirty;
795                 }
                    /* Not uptodate and not being tracked above: zero-fill it. */
796                 clear_highpage(page);
797                 flush_dcache_page(page);
798                 SetPageUptodate(page);
799         }
800
            /* Get a swap entry; give up if swap.val comes back zero. */
801         swap = get_swap_page();
802         if (!swap.val)
803                 goto redirty;
804
805         /*
806          * Add inode to shmem_unuse()'s list of swapped-out inodes,
807          * if it's not already there.  Do it now before the page is
808          * moved to swap cache, when its pagelock no longer protects
809          * the inode from eviction.  But don't unlock the mutex until
810          * we've incremented swapped, because shmem_unuse_inode() will
811          * prune a !swapped inode from the swaplist under this mutex.
812          */
813         mutex_lock(&shmem_swaplist_mutex);
814         if (list_empty(&info->swaplist))
815                 list_add_tail(&info->swaplist, &shmem_swaplist);
816
817         if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
818                 swap_shmem_alloc(swap);
                    /* The page cache slot now holds the swap entry instead. */
819                 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
820
821                 spin_lock(&info->lock);
822                 info->swapped++;
823                 shmem_recalc_inode(inode);
824                 spin_unlock(&info->lock);
825
826                 mutex_unlock(&shmem_swaplist_mutex);
827                 BUG_ON(page_mapped(page));
828                 swap_writepage(page, wbc);
829                 return 0;
830         }
831
            /* add_to_swap_cache() failed: release the mutex and the swap entry. */
832         mutex_unlock(&shmem_swaplist_mutex);
833         swapcache_free(swap, NULL);
834 redirty:
            /* Common bail-out: leave the page dirty in the page cache. */
835         set_page_dirty(page);
836         if (wbc->for_reclaim)
837                 return AOP_WRITEPAGE_ACTIVATE;  /* Return with page locked */
838         unlock_page(page);
839         return 0;
840 }
841
842 #ifdef CONFIG_NUMA
843 #ifdef CONFIG_TMPFS
844 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
845 {
846         char buffer[64];
847
848         if (!mpol || mpol->mode == MPOL_DEFAULT)
849                 return;         /* show nothing */
850
851         mpol_to_str(buffer, sizeof(buffer), mpol);
852
853         seq_printf(seq, ",mpol=%s", buffer);
854 }
855
856 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
857 {
858         struct mempolicy *mpol = NULL;
859         if (sbinfo->mpol) {
860                 spin_lock(&sbinfo->stat_lock);  /* prevent replace/use races */
861                 mpol = sbinfo->mpol;
862                 mpol_get(mpol);
863                 spin_unlock(&sbinfo->stat_lock);
864         }
865         return mpol;
866 }
867 #endif /* CONFIG_TMPFS */
868
869 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
870                         struct shmem_inode_info *info, pgoff_t index)
871 {
872         struct vm_area_struct pvma;
873         struct page *page;
874
875         /* Create a pseudo vma that just contains the policy */
876         pvma.vm_start = 0;
877         /* Bias interleave by inode number to distribute better across nodes */
878         pvma.vm_pgoff = index + info->vfs_inode.i_ino;
879         pvma.vm_ops = NULL;
880         pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
881
882         page = swapin_readahead(swap, gfp, &pvma, 0);
883
884         /* Drop reference taken by mpol_shared_policy_lookup() */
885         mpol_cond_put(pvma.vm_policy);
886
887         return page;
888 }
889
890 static struct page *shmem_alloc_page(gfp_t gfp,
891                         struct shmem_inode_info *info, pgoff_t index)
892 {
893         struct vm_area_struct pvma;
894         struct page *page;
895
896         /* Create a pseudo vma that just contains the policy */
897         pvma.vm_start = 0;
898         /* Bias interleave by inode number to distribute better across nodes */
899         pvma.vm_pgoff = index + info->vfs_inode.i_ino;
900         pvma.vm_ops = NULL;
901         pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
902
903         page = alloc_page_vma(gfp, &pvma, 0);
904
905         /* Drop reference taken by mpol_shared_policy_lookup() */
906         mpol_cond_put(pvma.vm_policy);
907
908         return page;
909 }
910 #else /* !CONFIG_NUMA */
911 #ifdef CONFIG_TMPFS
/* !CONFIG_NUMA variant: there is nothing to print, so do nothing. */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
        return;
}
915 #endif /* CONFIG_TMPFS */
916
917 static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
918                         struct shmem_inode_info *info, pgoff_t index)
919 {
920         return swapin_readahead(swap, gfp, NULL, 0);
921 }
922
923 static inline struct page *shmem_alloc_page(gfp_t gfp,
924                         struct shmem_inode_info *info, pgoff_t index)
925 {
926         return alloc_page(gfp);
927 }
928 #endif /* CONFIG_NUMA */
929
930 #if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
931 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
932 {
933         return NULL;
934 }
935 #endif
936
937 /*
938  * When a page is moved from swapcache to shmem filecache (either by the
939  * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
940  * shmem_unuse_inode()), it may have been read in earlier from swap, in
941  * ignorance of the mapping it belongs to.  If that mapping has special
942  * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
943  * we may need to copy to a suitable page before moving to filecache.
944  *
945  * In a future release, this may well be extended to respect cpuset and
946  * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
947  * but for now it is a simple matter of zone.
948  */
949 static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
950 {
951         return page_zonenum(page) > gfp_zone(gfp);
952 }
953
/*
 * shmem_replace_page - substitute a freshly allocated copy for a swapcache page
 * @pagep: in: the swapcache page to replace (it is unlocked here once it has
 *         been replaced); out: on success, the new page, which has been
 *         locked, marked uptodate and put in the swap cache at the same index
 * @gfp:   allocation mask for the new page; the GFP_CONSTRAINT_MASK bits are
 *         cleared before allocating
 * @info:  shmem inode info, passed through to shmem_alloc_page()
 * @index: file index, passed through to shmem_alloc_page()
 *
 * Returns 0 on success, -ENOMEM if no page could be allocated, or the error
 * from shmem_radix_tree_replace().  In the -ENOMEM case nothing has been
 * changed.  In the radix-tree error case it is the new page that gets torn
 * down, and *pagep is left pointing at the original page.
 */
954 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
955                                 struct shmem_inode_info *info, pgoff_t index)
956 {
957         struct page *oldpage, *newpage;
958         struct address_space *swap_mapping;
959         pgoff_t swap_index;
960         int error;
961
962         oldpage = *pagep;
            /* page_private of the old page is used as its swap cache index. */
963         swap_index = page_private(oldpage);
964         swap_mapping = page_mapping(oldpage);
965
966         /*
967          * We have arrived here because our zones are constrained, so don't
968          * limit chance of success by further cpuset and node constraints.
969          */
970         gfp &= ~GFP_CONSTRAINT_MASK;
971         newpage = shmem_alloc_page(gfp, info, index);
972         if (!newpage)
973                 return -ENOMEM;
974
            /*
             * Take an extra reference on newpage.
             * NOTE(review): presumably so that it carries the same two
             * references that are dropped from the discarded page at the
             * end of this function - confirm.
             */
975         page_cache_get(newpage);
976         copy_highpage(newpage, oldpage);
977         flush_dcache_page(newpage);
978
            /* Dress newpage up as a locked, uptodate swapcache page. */
979         __set_page_locked(newpage);
980         SetPageUptodate(newpage);
981         SetPageSwapBacked(newpage);
982         set_page_private(newpage, swap_index);
983         SetPageSwapCache(newpage);
984
985         /*
986          * Our caller will very soon move newpage out of swapcache, but it's
987          * a nice clean interface for us to replace oldpage by newpage there.
988          */
989         spin_lock_irq(&swap_mapping->tree_lock);
990         error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
991                                                                    newpage);
992         if (!error) {
                    /* Move the NR_FILE_PAGES count from the old page's zone to the new. */
993                 __inc_zone_page_state(newpage, NR_FILE_PAGES);
994                 __dec_zone_page_state(oldpage, NR_FILE_PAGES);
995         }
996         spin_unlock_irq(&swap_mapping->tree_lock);
997
998         if (unlikely(error)) {
999                 /*
1000                  * Is this possible?  I think not, now that our callers check
1001                  * both PageSwapCache and page_private after getting page lock;
1002                  * but be defensive.  Reverse old to newpage for clear and free.
1003                  */
1004                 oldpage = newpage;
1005         } else {
1006                 mem_cgroup_replace_page_cache(oldpage, newpage);
1007                 lru_cache_add_anon(newpage);
1008                 *pagep = newpage;
1009         }
1010
             /* From here on, oldpage names whichever page is being discarded. */
1011         ClearPageSwapCache(oldpage);
1012         set_page_private(oldpage, 0);
1013
1014         unlock_page(oldpage);
             /* Two references are dropped on the discarded page. */
1015         page_cache_release(oldpage);
1016         page_cache_release(oldpage);
1017         return error;
1018 }
1019
1020 /*
1021  * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1022  *
1023  * If we allocate a new one we do not mark it dirty. That's up to the
1024  * vm. If we swap it in we mark it dirty since we also free the swap
1025  * entry since a page cannot live in both the swap and page cache
1026  */
/*
 * @inode:      shmem inode whose mapping is searched
 * @index:      page index within the file
 * @pagep:      on success receives the page, or NULL for an SGP_READ lookup
 *              that finds neither an uptodate page nor a swap entry
 * @sgp:        lookup mode; SGP_READ, SGP_WRITE, SGP_FALLOC and SGP_DIRTY
 *              are tested explicitly below
 * @gfp:        allocation mask used for swapin and for new pages
 * @fault_type: if non-NULL, VM_FAULT_MAJOR is OR-ed in when swapin has to
 *              go through shmem_swapin()
 *
 * Returns 0 on success; the success paths below return a non-NULL page
 * without unlocking it.  Errors set in this function are -EFBIG (index too
 * large), -EINVAL (index at or beyond i_size for modes other than SGP_WRITE
 * and SGP_FALLOC), -ENOMEM, -EIO and -ENOSPC; errors from
 * mem_cgroup_charge_file(), radix_tree_maybe_preload() and
 * shmem_add_to_page_cache() are passed back as well.  -EEXIST is never
 * returned: it restarts the lookup.  The first -ENOSPC also restarts it.
 */
1027 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1028         struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1029 {
1030         struct address_space *mapping = inode->i_mapping;
1031         struct shmem_inode_info *info;
1032         struct shmem_sb_info *sbinfo;
1033         struct page *page;
1034         swp_entry_t swap;
1035         int error;
1036         int once = 0;
1037         int alloced = 0;
1038
1039         if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
1040                 return -EFBIG;
1041 repeat:
1042         swap.val = 0;
1043         page = find_lock_entry(mapping, index);
             /* An exceptional radix-tree entry holds a swap entry, not a page. */
1044         if (radix_tree_exceptional_entry(page)) {
1045                 swap = radix_to_swp_entry(page);
1046                 page = NULL;
1047         }
1048
1049         if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1050             ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1051                 error = -EINVAL;
1052                 goto failed;
1053         }
1054
1055         if (page && sgp == SGP_WRITE)
1056                 mark_page_accessed(page);
1057
1058         /* fallocated page? */
1059         if (page && !PageUptodate(page)) {
                     /* SGP_READ treats it as absent; other modes initialize it at "clear". */
1060                 if (sgp != SGP_READ)
1061                         goto clear;
1062                 unlock_page(page);
1063                 page_cache_release(page);
1064                 page = NULL;
1065         }
             /* Done if a page was found, or if SGP_READ found nothing at all. */
1066         if (page || (sgp == SGP_READ && !swap.val)) {
1067                 *pagep = page;
1068                 return 0;
1069         }
1070
1071         /*
1072          * Fast cache lookup did not find it:
1073          * bring it back from swap or allocate.
1074          */
1075         info = SHMEM_I(inode);
1076         sbinfo = SHMEM_SB(inode->i_sb);
1077
1078         if (swap.val) {
1079                 /* Look it up and read it in.. */
1080                 page = lookup_swap_cache(swap);
1081                 if (!page) {
1082                         /* here we actually do the io */
1083                         if (fault_type)
1084                                 *fault_type |= VM_FAULT_MAJOR;
1085                         page = shmem_swapin(swap, gfp, info, index);
1086                         if (!page) {
1087                                 error = -ENOMEM;
1088                                 goto failed;
1089                         }
1090                 }
1091
1092                 /* We have to do this with page locked to prevent races */
1093                 lock_page(page);
1094                 if (!PageSwapCache(page) || page_private(page) != swap.val ||
1095                     !shmem_confirm_swap(mapping, index, swap)) {
1096                         error = -EEXIST;        /* try again */
1097                         goto unlock;
1098                 }
1099                 if (!PageUptodate(page)) {
1100                         error = -EIO;
1101                         goto failed;
1102                 }
1103                 wait_on_page_writeback(page);
1104
                     /* Copy to a page in an acceptable zone if this one is not. */
1105                 if (shmem_should_replace_page(page, gfp)) {
1106                         error = shmem_replace_page(&page, gfp, info, index);
1107                         if (error)
1108                                 goto failed;
1109                 }
1110
1111                 error = mem_cgroup_charge_file(page, current->mm,
1112                                                 gfp & GFP_RECLAIM_MASK);
1113                 if (!error) {
1114                         error = shmem_add_to_page_cache(page, mapping, index,
1115                                                 swp_to_radix_entry(swap));
1116                         /*
1117                          * We already confirmed swap under page lock, and make
1118                          * no memory allocation here, so usually no possibility
1119                          * of error; but free_swap_and_cache() only trylocks a
1120                          * page, so it is just possible that the entry has been
1121                          * truncated or holepunched since swap was confirmed.
1122                          * shmem_undo_range() will have done some of the
1123                          * unaccounting, now delete_from_swap_cache() will do
1124                          * the rest (including mem_cgroup_uncharge_swapcache).
1125                          * Reset swap.val? No, leave it so "failed" goes back to
1126                          * "repeat": reading a hole and writing should succeed.
1127                          */
1128                         if (error)
1129                                 delete_from_swap_cache(page);
1130                 }
1131                 if (error)
1132                         goto failed;
1133
                     /* The page is in the page cache now: one less swapped page. */
1134                 spin_lock(&info->lock);
1135                 info->swapped--;
1136                 shmem_recalc_inode(inode);
1137                 spin_unlock(&info->lock);
1138
1139                 if (sgp == SGP_WRITE)
1140                         mark_page_accessed(page);
1141
1142                 delete_from_swap_cache(page);
1143                 set_page_dirty(page);
1144                 swap_free(swap);
1145
1146         } else {
                     /* No swap entry: account for, allocate and insert a new page. */
1147                 if (shmem_acct_block(info->flags)) {
1148                         error = -ENOSPC;
1149                         goto failed;
1150                 }
                     /* A zero max_blocks skips the used_blocks limit check. */
1151                 if (sbinfo->max_blocks) {
1152                         if (percpu_counter_compare(&sbinfo->used_blocks,
1153                                                 sbinfo->max_blocks) >= 0) {
1154                                 error = -ENOSPC;
1155                                 goto unacct;
1156                         }
1157                         percpu_counter_inc(&sbinfo->used_blocks);
1158                 }
1159
1160                 page = shmem_alloc_page(gfp, info, index);
1161                 if (!page) {
1162                         error = -ENOMEM;
1163                         goto decused;
1164                 }
1165
1166                 __SetPageSwapBacked(page);
1167                 __set_page_locked(page);
1168                 if (sgp == SGP_WRITE)
1169                         __SetPageReferenced(page);
1170
1171                 error = mem_cgroup_charge_file(page, current->mm,
1172                                                 gfp & GFP_RECLAIM_MASK);
1173                 if (error)
1174                         goto decused;
1175                 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
1176                 if (!error) {
1177                         error = shmem_add_to_page_cache(page, mapping, index,
1178                                                         NULL);
1179                         radix_tree_preload_end();
1180                 }
1181                 if (error) {
1182                         mem_cgroup_uncharge_cache_page(page);
1183                         goto decused;
1184                 }
1185                 lru_cache_add_anon(page);
1186
1187                 spin_lock(&info->lock);
1188                 info->alloced++;
1189                 inode->i_blocks += BLOCKS_PER_PAGE;
1190                 shmem_recalc_inode(inode);
1191                 spin_unlock(&info->lock);
                     /* Remember the allocation so a late truncation check can undo it. */
1192                 alloced = true;
1193
1194                 /*
1195                  * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1196                  */
1197                 if (sgp == SGP_FALLOC)
1198                         sgp = SGP_WRITE;
1199 clear:
1200                 /*
1201                  * Let SGP_WRITE caller clear ends if write does not fill page;
1202                  * but SGP_FALLOC on a page fallocated earlier must initialize
1203                  * it now, lest undo on failure cancel our earlier guarantee.
1204                  */
1205                 if (sgp != SGP_WRITE) {
1206                         clear_highpage(page);
1207                         flush_dcache_page(page);
1208                         SetPageUptodate(page);
1209                 }
1210                 if (sgp == SGP_DIRTY)
1211                         set_page_dirty(page);
1212         }
1213
1214         /* Perhaps the file has been truncated since we checked */
1215         if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1216             ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1217                 error = -EINVAL;
1218                 if (alloced)
1219                         goto trunc;
1220                 else
1221                         goto failed;
1222         }
1223         *pagep = page;
1224         return 0;
1225
1226         /*
1227          * Error recovery.
1228          */
             /*
              * The labels below fall through one into the next, each undoing
              * one more step of the allocation path above.
              */
1229 trunc:
1230         info = SHMEM_I(inode);
1231         ClearPageDirty(page);
1232         delete_from_page_cache(page);
1233         spin_lock(&info->lock);
1234         info->alloced--;
1235         inode->i_blocks -= BLOCKS_PER_PAGE;
1236         spin_unlock(&info->lock);
1237 decused:
             /* Give back the used_blocks count taken when max_blocks is set. */
1238         sbinfo = SHMEM_SB(inode->i_sb);
1239         if (sbinfo->max_blocks)
1240                 percpu_counter_add(&sbinfo->used_blocks, -1);
1241 unacct:
             /* NOTE(review): presumably the inverse of shmem_acct_block() above. */
1242         shmem_unacct_blocks(info->flags, 1);
1243 failed:
             /*
              * If we were working from a swap entry that no longer matches the
              * mapping, turn the error (other than -EINVAL) into a retry.
              */
1244         if (swap.val && error != -EINVAL &&
1245             !shmem_confirm_swap(mapping, index, swap))
1246                 error = -EEXIST;
1247 unlock:
1248         if (page) {
1249                 unlock_page(page);
1250                 page_cache_release(page);
1251         }
             /* First -ENOSPC only: recalculate the inode and try once more. */
1252         if (error == -ENOSPC && !once++) {
1253                 info = SHMEM_I(inode);
1254                 spin_lock(&info->lock);
1255                 shmem_recalc_inode(inode);
1256                 spin_unlock(&info->lock);
1257                 goto repeat;
1258         }
1259         if (error == -EEXIST)   /* from above or from radix_tree_insert */
1260                 goto repeat;
1261         return error;
1262 }
1263
/*
 * shmem_fault - handle a page fault on a shmem mapping
 * @vma: the faulting vma; its vm_file identifies the inode
 * @vmf: fault descriptor; vmf->pgoff is looked up and vmf->page is filled in
 *
 * If inode->i_private points to a shmem_falloc that has a waitq and whose
 * [start, next) range covers vmf->pgoff, sleep on that waitq and then return
 * VM_FAULT_RETRY (after dropping mmap_sem, when the fault flags allow a
 * waiting retry) or VM_FAULT_NOPAGE, without looking up any page.
 *
 * Otherwise fetch the page with shmem_getpage(SGP_CACHE) and return
 * VM_FAULT_LOCKED, plus VM_FAULT_MAJOR if shmem_getpage() set it.  A lookup
 * error becomes VM_FAULT_OOM for -ENOMEM and VM_FAULT_SIGBUS for anything
 * else.
 */
1264 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1265 {
1266         struct inode *inode = file_inode(vma->vm_file);
1267         int error;
1268         int ret = VM_FAULT_LOCKED;
1269
1270         /*
1271          * Trinity finds that probing a hole which tmpfs is punching can
1272          * prevent the hole-punch from ever completing: which in turn
1273          * locks writers out with its hold on i_mutex.  So refrain from
1274          * faulting pages into the hole while it's being punched.  Although
1275          * shmem_undo_range() does remove the additions, it may be unable to
1276          * keep up, as each new page needs its own unmap_mapping_range() call,
1277          * and the i_mmap tree grows ever slower to scan if new vmas are added.
1278          *
1279          * It does not matter if we sometimes reach this check just before the
1280          * hole-punch begins, so that one fault then races with the punch:
1281          * we just need to make racing faults a rare case.
1282          *
1283          * The implementation below would be much simpler if we just used a
1284          * standard mutex or completion: but we cannot take i_mutex in fault,
1285          * and bloating every shmem inode for this unlikely case would be sad.
1286          */
1287         if (unlikely(inode->i_private)) {
1288                 struct shmem_falloc *shmem_falloc;
1289
                     /* The unlocked test above is only a hint: re-read under i_lock. */
1290                 spin_lock(&inode->i_lock);
1291                 shmem_falloc = inode->i_private;
1292                 if (shmem_falloc &&
1293                     shmem_falloc->waitq &&
1294                     vmf->pgoff >= shmem_falloc->start &&
1295                     vmf->pgoff < shmem_falloc->next) {
1296                         wait_queue_head_t *shmem_falloc_waitq;
1297                         DEFINE_WAIT(shmem_fault_wait);
1298
1299                         ret = VM_FAULT_NOPAGE;
1300                         if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
1301                            !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
1302                                 /* It's polite to up mmap_sem if we can */
1303                                 up_read(&vma->vm_mm->mmap_sem);
1304                                 ret = VM_FAULT_RETRY;
1305                         }
1306
                             /* Queue ourselves while still holding i_lock, then sleep. */
1307                         shmem_falloc_waitq = shmem_falloc->waitq;
1308                         prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
1309                                         TASK_UNINTERRUPTIBLE);
1310                         spin_unlock(&inode->i_lock);
1311                         schedule();
1312
1313                         /*
1314                          * shmem_falloc_waitq points into the shmem_fallocate()
1315                          * stack of the hole-punching task: shmem_falloc_waitq
1316                          * is usually invalid by the time we reach here, but
1317                          * finish_wait() does not dereference it in that case;
1318                          * though i_lock needed lest racing with wake_up_all().
1319                          */
1320                         spin_lock(&inode->i_lock);
1321                         finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
1322                         spin_unlock(&inode->i_lock);
1323                         return ret;
1324                 }
1325                 spin_unlock(&inode->i_lock);
1326         }
1327
             /* &ret doubles as fault_type, so VM_FAULT_MAJOR may be OR-ed into it. */
1328         error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1329         if (error)
1330                 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1331
1332         if (ret & VM_FAULT_MAJOR) {
1333                 count_vm_event(PGMAJFAULT);
1334                 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1335         }
1336         return ret;
1337 }
1338
1339 #ifdef CONFIG_NUMA
1340 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1341 {
1342         struct inode *inode = file_inode(vma->vm_file);
1343         return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1344 }
1345
1346 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1347                                           unsigned long addr)
1348 {
1349         struct inode *inode = file_inode(vma->vm_file);
1350         pgoff_t index;
1351
1352         index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1353         return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1354 }
1355 #endif
1356
1357 int shmem_lock(struct file *file, int lock, struct user_struct *user)
1358 {
1359         struct inode *inode = file_inode(file);
1360         struct shmem_inode_info *info = SHMEM_I(inode);
1361         int retval = -ENOMEM;
1362
1363         spin_lock(&info->lock);
1364         if (lock && !(info->flags & VM_LOCKED)) {
1365                 if (!user_shm_lock(inode->i_size, user))
1366                         goto out_nomem;
1367                 info->flags |= VM_LOCKED;
1368                 mapping_set_unevictable(file->f_mapping);
1369         }
1370         if (!lock && (info->flags & VM_LOCKED) && user) {
1371                 user_shm_unlock(inode->i_size, user);
1372                 info->flags &= ~VM_LOCKED;
1373                 mapping_clear_unevictable(file->f_mapping);
1374         }
1375         retval = 0;
1376
1377 out_nomem:
1378         spin_unlock(&info->lock);
1379         return retval;
1380 }
1381
1382 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1383 {
1384         file_accessed(file);
1385         vma->vm_ops = &shmem_vm_ops;
1386         return 0;
1387 }
1388
1389 static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
1390                                      umode_t mode, dev_t dev, unsigned long flags)
1391 {
1392         struct inode *inode;
1393         struct shmem_inode_info *info;
1394         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1395
1396         if (shmem_reserve_inode(sb))
1397                 return NULL;
1398
1399         inode = new_inode(sb);
1400         if (inode) {
1401                 inode->i_ino = get_next_ino();
1402                 inode_init_owner(inode, dir, mode);
1403                 inode->i_blocks = 0;
1404                 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1405                 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1406                 inode->i_generation = get_seconds();
1407                 info = SHMEM_I(inode);
1408                 memset(info, 0, (char *)inode - (char *)info);
1409                 spin_lock_init(&info->lock);
1410                 info->flags = flags & VM_NORESERVE;
1411                 INIT_LIST_HEAD(&info->swaplist);
1412                 simple_xattrs_init(&info->xattrs);
1413                 cache_no_acl(inode);
1414
1415                 switch (mode & S_IFMT) {
1416                 default:
1417                         inode->i_op = &shmem_special_inode_operations;
1418                         init_special_inode(inode, mode, dev);
1419                         break;
1420                 case S_IFREG:
1421                         inode->i_mapping->a_ops = &shmem_aops;
1422                         inode->i_op = &shmem_inode_operations;
1423                         inode->i_fop = &shmem_file_operations;
1424                         mpol_shared_policy_init(&info->policy,
1425                                                  shmem_get_sbmpol(sbinfo));
1426                         break;
1427                 case S_IFDIR:
1428                         inc_nlink(inode);
1429                         /* Some things misbehave if size == 0 on a directory */
1430                         inode->i_size = 2 * BOGO_DIRENT_SIZE;
1431                         inode->i_op = &shmem_dir_inode_operations;
1432                         inode->i_fop = &simple_dir_operations;
1433                         break;
1434                 case S_IFLNK:
1435                         /*
1436                          * Must not load anything in the rbtree,
1437                          * mpol_free_shared_policy will not be called.
1438                          */
1439                         mpol_shared_policy_init(&info->policy, NULL);
1440                         break;
1441                 }
1442         } else
1443                 shmem_free_inode(sb);
1444         return inode;
1445 }
1446
1447 bool shmem_mapping(struct address_space *mapping)
1448 {
1449         return mapping->backing_dev_info == &shmem_backing_dev_info;
1450 }
1451
1452 #ifdef CONFIG_TMPFS
/*
 * Forward declarations of the two symlink inode_operations tables.
 * NOTE(review): their initializers are presumably further down the file.
 */
1453 static const struct inode_operations shmem_symlink_inode_operations;
1454 static const struct inode_operations shmem_short_symlink_operations;
1455
/*
 * shmem_initxattrs is a real function only when CONFIG_TMPFS_XATTR is set;
 * otherwise the name expands to NULL.
 */
1456 #ifdef CONFIG_TMPFS_XATTR
1457 static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
1458 #else
1459 #define shmem_initxattrs NULL
1460 #endif
1461
1462 static int
1463 shmem_write_begin(struct file *file, struct address_space *mapping,
1464                         loff_t pos, unsigned len, unsigned flags,
1465                         struct page **pagep, void **fsdata)
1466 {
1467         struct inode *inode = mapping->host;
1468         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1469         return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1470 }
1471
1472 static int
1473 shmem_write_end(struct file *file, struct address_space *mapping,
1474                         loff_t pos, unsigned len, unsigned copied,
1475                         struct page *page, void *fsdata)
1476 {
1477         struct inode *inode = mapping->host;
1478
1479         if (pos + copied > inode->i_size)
1480                 i_size_write(inode, pos + copied);
1481
1482         if (!PageUptodate(page)) {
1483                 if (copied < PAGE_CACHE_SIZE) {
1484                         unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1485                         zero_user_segments(page, 0, from,
1486                                         from + copied, PAGE_CACHE_SIZE);
1487                 }
1488                 SetPageUptodate(page);
1489         }
1490         set_page_dirty(page);
1491         unlock_page(page);
1492         page_cache_release(page);
1493
1494         return copied;
1495 }
1496
1497 static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1498 {
1499         struct file *file = iocb->ki_filp;
1500         struct inode *inode = file_inode(file);
1501         struct address_space *mapping = inode->i_mapping;
1502         pgoff_t index;
1503         unsigned long offset;
1504         enum sgp_type sgp = SGP_READ;
1505         int error = 0;
1506         ssize_t retval = 0;
1507         loff_t *ppos = &iocb->ki_pos;
1508
1509         /*
1510          * Might this read be for a stacking filesystem?  Then when reading
1511          * holes of a sparse file, we actually need to allocate those pages,
1512          * and even mark them dirty, so it cannot exceed the max_blocks limit.
1513          */
1514         if (segment_eq(get_fs(), KERNEL_DS))
1515                 sgp = SGP_DIRTY;
1516
1517         index = *ppos >> PAGE_CACHE_SHIFT;
1518         offset = *ppos & ~PAGE_CACHE_MASK;
1519
1520         for (;;) {
1521                 struct page *page = NULL;
1522                 pgoff_t end_index;
1523                 unsigned long nr, ret;
1524                 loff_t i_size = i_size_read(inode);
1525
1526                 end_index = i_size >> PAGE_CACHE_SHIFT;
1527                 if (index > end_index)
1528                         break;
1529                 if (index == end_index) {
1530                         nr = i_size & ~PAGE_CACHE_MASK;
1531                         if (nr <= offset)
1532                                 break;
1533                 }
1534
1535                 error = shmem_getpage(inode, index, &page, sgp, NULL);
1536                 if (error) {
1537                         if (error == -EINVAL)
1538                                 error = 0;
1539                         break;
1540                 }
1541                 if (page)
1542                         unlock_page(page);
1543
1544                 /*
1545                  * We must evaluate after, since reads (unlike writes)
1546                  * are called without i_mutex protection against truncate
1547                  */
1548                 nr = PAGE_CACHE_SIZE;
1549                 i_size = i_size_read(inode);
1550                 end_index = i_size >> PAGE_CACHE_SHIFT;
1551                 if (index == end_index) {
1552                         nr = i_size & ~PAGE_CACHE_MASK;
1553                         if (nr <= offset) {
1554                                 if (page)
1555                                         page_cache_release(page);
1556                                 break;
1557                         }
1558                 }
1559                 nr -= offset;
1560
1561                 if (page) {
1562                         /*
1563                          * If users can be writing to this page using arbitrary
1564                          * virtual addresses, take care about potential aliasing
1565                          * before reading the page on the kernel side.
1566                          */
1567                         if (mapping_writably_mapped(mapping))
1568                                 flush_dcache_page(page);
1569                         /*
1570                          * Mark the page accessed if we read the beginning.
1571                          */
1572                         if (!offset)
1573                                 mark_page_accessed(page);
1574                 } else {
1575                         page = ZERO_PAGE(0);
1576                         page_cache_get(page);
1577                 }
1578
1579                 /*
1580                  * Ok, we have the page, and it's up-to-date, so
1581                  * now we can copy it to user space...
1582                  */
1583                 ret = copy_page_to_iter(page, offset, nr, to);
1584                 retval += ret;
1585                 offset += ret;
1586                 index += offset >> PAGE_CACHE_SHIFT;
1587                 offset &= ~PAGE_CACHE_MASK;
1588
1589                 page_cache_release(page);
1590                 if (!iov_iter_count(to))
1591                         break;
1592                 if (ret < nr) {
1593                         error = -EFAULT;
1594                         break;
1595                 }
1596                 cond_resched();
1597         }
1598
1599         *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1600         file_accessed(file);
1601         return retval ? retval : error;
1602 }
1603
1604 static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1605                                 struct pipe_inode_info *pipe, size_t len,
1606                                 unsigned int flags)
1607 {
1608         struct address_space *mapping = in->f_mapping;
1609         struct inode *inode = mapping->host;
1610         unsigned int loff, nr_pages, req_pages;
1611         struct page *pages[PIPE_DEF_BUFFERS];
1612         struct partial_page partial[PIPE_DEF_BUFFERS];
1613         struct page *page;
1614         pgoff_t index, end_index;
1615         loff_t isize, left;
1616         int error, page_nr;
1617         struct splice_pipe_desc spd = {
1618                 .pages = pages,
1619                 .partial = partial,
1620                 .nr_pages_max = PIPE_DEF_BUFFERS,
1621                 .flags = flags,
1622                 .ops = &page_cache_pipe_buf_ops,
1623                 .spd_release = spd_release_page,
1624         };
1625
1626         isize = i_size_read(inode);
1627         if (unlikely(*ppos >= isize))
1628                 return 0;
1629
1630         left = isize - *ppos;
1631         if (unlikely(left < len))
1632                 len = left;
1633
1634         if (splice_grow_spd(pipe, &spd))
1635                 return -ENOMEM;
1636
1637         index = *ppos >> PAGE_CACHE_SHIFT;
1638         loff = *ppos & ~PAGE_CACHE_MASK;
1639         req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1640         nr_pages = min(req_pages, spd.nr_pages_max);
1641
1642         spd.nr_pages = find_get_pages_contig(mapping, index,
1643                                                 nr_pages, spd.pages);
1644         index += spd.nr_pages;
1645         error = 0;
1646
1647         while (spd.nr_pages < nr_pages) {
1648                 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1649                 if (error)
1650                         break;
1651                 unlock_page(page);
1652                 spd.pages[spd.nr_pages++] = page;
1653                 index++;
1654         }
1655
1656         index = *ppos >> PAGE_CACHE_SHIFT;
1657         nr_pages = spd.nr_pages;
1658         spd.nr_pages = 0;
1659
1660         for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1661                 unsigned int this_len;
1662
1663                 if (!len)
1664                         break;
1665
1666                 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1667                 page = spd.pages[page_nr];
1668
1669                 if (!PageUptodate(page) || page->mapping != mapping) {
1670                         error = shmem_getpage(inode, index, &page,
1671                                                         SGP_CACHE, NULL);
1672                         if (error)
1673                                 break;
1674                         unlock_page(page);
1675                         page_cache_release(spd.pages[page_nr]);
1676                         spd.pages[page_nr] = page;
1677                 }
1678
1679                 isize = i_size_read(inode);
1680                 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1681                 if (unlikely(!isize || index > end_index))
1682                         break;
1683
1684                 if (end_index == index) {
1685                         unsigned int plen;
1686
1687                         plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1688                         if (plen <= loff)
1689                                 break;
1690
1691                         this_len = min(this_len, plen - loff);
1692                         len = this_len;
1693                 }
1694
1695                 spd.partial[page_nr].offset = loff;
1696                 spd.partial[page_nr].len = this_len;
1697                 len -= this_len;
1698                 loff = 0;
1699                 spd.nr_pages++;
1700                 index++;
1701         }
1702
1703         while (page_nr < nr_pages)
1704                 page_cache_release(spd.pages[page_nr++]);
1705
1706         if (spd.nr_pages)
1707                 error = splice_to_pipe(pipe, &spd);
1708
1709         splice_shrink_spd(&spd);
1710
1711         if (error > 0) {
1712                 *ppos += error;
1713                 file_accessed(in);
1714         }
1715         return error;
1716 }
1717
1718 /*
1719  * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1720  */
1721 static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1722                                     pgoff_t index, pgoff_t end, int whence)
1723 {
1724         struct page *page;
1725         struct pagevec pvec;
1726         pgoff_t indices[PAGEVEC_SIZE];
1727         bool done = false;
1728         int i;
1729
1730         pagevec_init(&pvec, 0);
1731         pvec.nr = 1;            /* start small: we may be there already */
1732         while (!done) {
1733                 pvec.nr = find_get_entries(mapping, index,
1734                                         pvec.nr, pvec.pages, indices);
1735                 if (!pvec.nr) {
1736                         if (whence == SEEK_DATA)
1737                                 index = end;
1738                         break;
1739                 }
1740                 for (i = 0; i < pvec.nr; i++, index++) {
1741                         if (index < indices[i]) {
1742                                 if (whence == SEEK_HOLE) {
1743                                         done = true;
1744                                         break;
1745                                 }
1746                                 index = indices[i];
1747                         }
1748                         page = pvec.pages[i];
1749                         if (page && !radix_tree_exceptional_entry(page)) {
1750                                 if (!PageUptodate(page))
1751                                         page = NULL;
1752                         }
1753                         if (index >= end ||
1754                             (page && whence == SEEK_DATA) ||
1755                             (!page && whence == SEEK_HOLE)) {
1756                                 done = true;
1757                                 break;
1758                         }
1759                 }
1760                 pagevec_remove_exceptionals(&pvec);
1761                 pagevec_release(&pvec);
1762                 pvec.nr = PAGEVEC_SIZE;
1763                 cond_resched();
1764         }
1765         return index;
1766 }
1767
1768 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1769 {
1770         struct address_space *mapping = file->f_mapping;
1771         struct inode *inode = mapping->host;
1772         pgoff_t start, end;
1773         loff_t new_offset;
1774
1775         if (whence != SEEK_DATA && whence != SEEK_HOLE)
1776                 return generic_file_llseek_size(file, offset, whence,
1777                                         MAX_LFS_FILESIZE, i_size_read(inode));
1778         mutex_lock(&inode->i_mutex);
1779         /* We're holding i_mutex so we can access i_size directly */
1780
1781         if (offset < 0)
1782                 offset = -EINVAL;
1783         else if (offset >= inode->i_size)
1784                 offset = -ENXIO;
1785         else {
1786                 start = offset >> PAGE_CACHE_SHIFT;
1787                 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1788                 new_offset = shmem_seek_hole_data(mapping, start, end, whence);
1789                 new_offset <<= PAGE_CACHE_SHIFT;
1790                 if (new_offset > offset) {
1791                         if (new_offset < inode->i_size)
1792                                 offset = new_offset;
1793                         else if (whence == SEEK_DATA)
1794                                 offset = -ENXIO;
1795                         else
1796                                 offset = inode->i_size;
1797                 }
1798         }
1799
1800         if (offset >= 0)
1801                 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
1802         mutex_unlock(&inode->i_mutex);
1803         return offset;
1804 }
1805
1806 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1807                                                          loff_t len)
1808 {
1809         struct inode *inode = file_inode(file);
1810         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1811         struct shmem_falloc shmem_falloc;
1812         pgoff_t start, index, end;
1813         int error;
1814
1815         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1816                 return -EOPNOTSUPP;
1817
1818         mutex_lock(&inode->i_mutex);
1819
1820         if (mode & FALLOC_FL_PUNCH_HOLE) {
1821                 struct address_space *mapping = file->f_mapping;
1822                 loff_t unmap_start = round_up(offset, PAGE_SIZE);
1823                 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
1824                 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
1825
1826                 shmem_falloc.waitq = &shmem_falloc_waitq;
1827                 shmem_falloc.start = unmap_start >> PAGE_SHIFT;
1828                 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
1829                 spin_lock(&inode->i_lock);
1830                 inode->i_private = &shmem_falloc;
1831                 spin_unlock(&inode->i_lock);
1832
1833                 if ((u64)unmap_end > (u64)unmap_start)
1834                         unmap_mapping_range(mapping, unmap_start,
1835                                             1 + unmap_end - unmap_start, 0);
1836                 shmem_truncate_range(inode, offset, offset + len - 1);
1837                 /* No need to unmap again: hole-punching leaves COWed pages */
1838
1839                 spin_lock(&inode->i_lock);
1840                 inode->i_private = NULL;
1841                 wake_up_all(&shmem_falloc_waitq);
1842                 spin_unlock(&inode->i_lock);
1843                 error = 0;
1844                 goto out;
1845         }
1846
1847         /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
1848         error = inode_newsize_ok(inode, offset + len);
1849         if (error)
1850                 goto out;
1851
1852         start = offset >> PAGE_CACHE_SHIFT;
1853         end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1854         /* Try to avoid a swapstorm if len is impossible to satisfy */
1855         if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
1856                 error = -ENOSPC;
1857                 goto out;
1858         }
1859
1860         shmem_falloc.waitq = NULL;
1861         shmem_falloc.start = start;
1862         shmem_falloc.next  = start;
1863         shmem_falloc.nr_falloced = 0;
1864         shmem_falloc.nr_unswapped = 0;
1865         spin_lock(&inode->i_lock);
1866         inode->i_private = &shmem_falloc;
1867         spin_unlock(&inode->i_lock);
1868
1869         for (index = start; index < end; index++) {
1870                 struct page *page;
1871
1872                 /*
1873                  * Good, the fallocate(2) manpage permits EINTR: we may have
1874                  * been interrupted because we are using up too much memory.
1875                  */
1876                 if (signal_pending(current))
1877                         error = -EINTR;
1878                 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
1879                         error = -ENOMEM;
1880                 else
1881                         error = shmem_getpage(inode, index, &page, SGP_FALLOC,
1882                                                                         NULL);
1883                 if (error) {
1884                         /* Remove the !PageUptodate pages we added */
1885                         shmem_undo_range(inode,
1886                                 (loff_t)start << PAGE_CACHE_SHIFT,
1887                                 (loff_t)index << PAGE_CACHE_SHIFT, true);
1888                         goto undone;
1889                 }
1890
1891                 /*
1892                  * Inform shmem_writepage() how far we have reached.
1893                  * No need for lock or barrier: we have the page lock.
1894                  */
1895                 shmem_falloc.next++;
1896                 if (!PageUptodate(page))
1897                         shmem_falloc.nr_falloced++;
1898
1899                 /*
1900                  * If !PageUptodate, leave it that way so that freeable pages
1901                  * can be recognized if we need to rollback on error later.
1902                  * But set_page_dirty so that memory pressure will swap rather
1903                  * than free the pages we are allocating (and SGP_CACHE pages
1904                  * might still be clean: we now need to mark those dirty too).
1905                  */
1906                 set_page_dirty(page);
1907                 unlock_page(page);
1908                 page_cache_release(page);
1909                 cond_resched();
1910         }
1911
1912         if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
1913                 i_size_write(inode, offset + len);
1914         inode->i_ctime = CURRENT_TIME;
1915 undone:
1916         spin_lock(&inode->i_lock);
1917         inode->i_private = NULL;
1918         spin_unlock(&inode->i_lock);
1919 out:
1920         mutex_unlock(&inode->i_mutex);
1921         return error;
1922 }
1923
1924 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1925 {
1926         struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1927
1928         buf->f_type = TMPFS_MAGIC;
1929         buf->f_bsize = PAGE_CACHE_SIZE;
1930         buf->f_namelen = NAME_MAX;
1931         if (sbinfo->max_blocks) {
1932                 buf->f_blocks = sbinfo->max_blocks;
1933                 buf->f_bavail =
1934                 buf->f_bfree  = sbinfo->max_blocks -
1935                                 percpu_counter_sum(&sbinfo->used_blocks);
1936         }
1937         if (sbinfo->max_inodes) {
1938                 buf->f_files = sbinfo->max_inodes;
1939                 buf->f_ffree = sbinfo->free_inodes;
1940         }
1941         /* else leave those fields 0 like simple_statfs */
1942         return 0;
1943 }
1944
1945 /*
1946  * File creation. Allocate an inode, and we're done..
1947  */
1948 static int
1949 shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1950 {
1951         struct inode *inode;
1952         int error = -ENOSPC;
1953
1954         inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1955         if (inode) {
1956                 error = simple_acl_create(dir, inode);
1957                 if (error)
1958                         goto out_iput;
1959                 error = security_inode_init_security(inode, dir,
1960                                                      &dentry->d_name,
1961                                                      shmem_initxattrs, NULL);
1962                 if (error && error != -EOPNOTSUPP)
1963                         goto out_iput;
1964
1965                 error = 0;
1966                 dir->i_size += BOGO_DIRENT_SIZE;
1967                 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1968                 d_instantiate(dentry, inode);
1969                 dget(dentry); /* Extra count - pin the dentry in core */
1970         }
1971         return error;
1972 out_iput:
1973         iput(inode);
1974         return error;
1975 }
1976
1977 static int
1978 shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
1979 {
1980         struct inode *inode;
1981         int error = -ENOSPC;
1982
1983         inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
1984         if (inode) {
1985                 error = security_inode_init_security(inode, dir,
1986                                                      NULL,
1987                                                      shmem_initxattrs, NULL);
1988                 if (error && error != -EOPNOTSUPP)
1989                         goto out_iput;
1990                 error = simple_acl_create(dir, inode);
1991                 if (error)
1992                         goto out_iput;
1993                 d_tmpfile(dentry, inode);
1994         }
1995         return error;
1996 out_iput:
1997         iput(inode);
1998         return error;
1999 }
2000
2001 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2002 {
2003         int error;
2004
2005         if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
2006                 return error;
2007         inc_nlink(dir);
2008         return 0;
2009 }
2010
2011 static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2012                 bool excl)
2013 {
2014         return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
2015 }
2016
2017 /*
2018  * Link a file..
2019  */
2020 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
2021 {
2022         struct inode *inode = old_dentry->d_inode;
2023         int ret;
2024
2025         /*
2026          * No ordinary (disk based) filesystem counts links as inodes;
2027          * but each new link needs a new dentry, pinning lowmem, and
2028          * tmpfs dentries cannot be pruned until they are unlinked.
2029          */
2030         ret = shmem_reserve_inode(inode->i_sb);
2031         if (ret)
2032                 goto out;
2033
2034         dir->i_size += BOGO_DIRENT_SIZE;
2035         inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2036         inc_nlink(inode);
2037         ihold(inode);   /* New dentry reference */
2038         dget(dentry);           /* Extra pinning count for the created dentry */
2039         d_instantiate(dentry, inode);
2040 out:
2041         return ret;
2042 }
2043
2044 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2045 {
2046         struct inode *inode = dentry->d_inode;
2047
2048         if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2049                 shmem_free_inode(inode->i_sb);
2050
2051         dir->i_size -= BOGO_DIRENT_SIZE;
2052         inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2053         drop_nlink(inode);
2054         dput(dentry);   /* Undo the count from "create" - this does all the work */
2055         return 0;
2056 }
2057
2058 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2059 {
2060         if (!simple_empty(dentry))
2061                 return -ENOTEMPTY;
2062
2063         drop_nlink(dentry->d_inode);
2064         drop_nlink(dir);
2065         return shmem_unlink(dir, dentry);
2066 }
2067
2068 /*
2069  * The VFS layer already does all the dentry stuff for rename,
2070  * we just have to decrement the usage count for the target if
2071  * it exists so that the VFS layer correctly free's it when it
2072  * gets overwritten.
2073  */
2074 static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
2075 {
2076         struct inode *inode = old_dentry->d_inode;
2077         int they_are_dirs = S_ISDIR(inode->i_mode);
2078
2079         if (!simple_empty(new_dentry))
2080                 return -ENOTEMPTY;
2081
2082         if (new_dentry->d_inode) {
2083                 (void) shmem_unlink(new_dir, new_dentry);
2084                 if (they_are_dirs)
2085                         drop_nlink(old_dir);
2086         } else if (they_are_dirs) {
2087                 drop_nlink(old_dir);
2088                 inc_nlink(new_dir);
2089         }
2090
2091         old_dir->i_size -= BOGO_DIRENT_SIZE;
2092         new_dir->i_size += BOGO_DIRENT_SIZE;
2093         old_dir->i_ctime = old_dir->i_mtime =
2094         new_dir->i_ctime = new_dir->i_mtime =
2095         inode->i_ctime = CURRENT_TIME;
2096         return 0;
2097 }
2098
2099 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
2100 {
2101         int error;
2102         int len;
2103         struct inode *inode;
2104         struct page *page;
2105         char *kaddr;
2106         struct shmem_inode_info *info;
2107
2108         len = strlen(symname) + 1;
2109         if (len > PAGE_CACHE_SIZE)
2110                 return -ENAMETOOLONG;
2111
2112         inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
2113         if (!inode)
2114                 return -ENOSPC;
2115
2116         error = security_inode_init_security(inode, dir, &dentry->d_name,
2117                                              shmem_initxattrs, NULL);
2118         if (error) {
2119                 if (error != -EOPNOTSUPP) {
2120                         iput(inode);
2121                         return error;
2122                 }
2123                 error = 0;
2124         }
2125
2126         info = SHMEM_I(inode);
2127         inode->i_size = len-1;
2128         if (len <= SHORT_SYMLINK_LEN) {
2129                 info->symlink = kmemdup(symname, len, GFP_KERNEL);
2130                 if (!info->symlink) {
2131                         iput(inode);
2132                         return -ENOMEM;
2133                 }
2134                 inode->i_op = &shmem_short_symlink_operations;
2135         } else {
2136                 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2137                 if (error) {
2138                         iput(inode);
2139                         return error;
2140                 }
2141                 inode->i_mapping->a_ops = &shmem_aops;
2142                 inode->i_op = &shmem_symlink_inode_operations;
2143                 kaddr = kmap_atomic(page);
2144                 memcpy(kaddr, symname, len);
2145                 kunmap_atomic(kaddr);
2146                 SetPageUptodate(page);
2147                 set_page_dirty(page);
2148                 unlock_page(page);
2149                 page_cache_release(page);
2150         }
2151         dir->i_size += BOGO_DIRENT_SIZE;
2152         dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2153         d_instantiate(dentry, inode);
2154         dget(dentry);
2155         return 0;
2156 }
2157
2158 static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2159 {
2160         nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
2161         return NULL;
2162 }
2163
2164 static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2165 {
2166         struct page *page = NULL;
2167         int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
2168         nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2169         if (page)
2170                 unlock_page(page);
2171         return page;
2172 }
2173
/*
 * put_link counterpart of shmem_follow_link(): unless the stored link
 * is an error pointer, @cookie is the page to kunmap(), mark accessed
 * and release.
 */
static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *page = cookie;

	/* Nothing was mapped when following the link failed */
	if (IS_ERR(nd_get_link(nd)))
		return;

	kunmap(page);
	mark_page_accessed(page);
	page_cache_release(page);
}
2183
2184 #ifdef CONFIG_TMPFS_XATTR
2185 /*
2186  * Superblocks without xattr inode operations may get some security.* xattr
2187  * support from the LSM "for free". As soon as we have any other xattrs
2188  * like ACLs, we also need to implement the security.* handlers at
2189  * filesystem level, though.
2190  */
2191
2192 /*
2193  * Callback for security_inode_init_security() for acquiring xattrs.
2194  */
2195 static int shmem_initxattrs(struct inode *inode,
2196                             const struct xattr *xattr_array,
2197                             void *fs_info)
2198 {
2199         struct shmem_inode_info *info = SHMEM_I(inode);
2200         const struct xattr *xattr;
2201         struct simple_xattr *new_xattr;
2202         size_t len;
2203
2204         for (xattr = xattr_array; xattr->name != NULL; xattr++) {
2205                 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
2206                 if (!new_xattr)
2207                         return -ENOMEM;
2208
2209                 len = strlen(xattr->name) + 1;
2210                 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
2211                                           GFP_KERNEL);
2212                 if (!new_xattr->name) {
2213                         kfree(new_xattr);
2214                         return -ENOMEM;
2215                 }
2216
2217                 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
2218                        XATTR_SECURITY_PREFIX_LEN);
2219                 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
2220                        xattr->name, len);
2221
2222                 simple_xattr_list_add(&info->xattrs, new_xattr);
2223         }
2224
2225         return 0;
2226 }
2227
/*
 * NULL-terminated xattr handler table: the POSIX ACL handlers when
 * CONFIG_TMPFS_POSIX_ACL is enabled, otherwise just the terminator.
 */
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
	&posix_acl_access_xattr_handler,
	&posix_acl_default_xattr_handler,
#endif
	NULL,	/* end of table */
};
2235
2236 static int shmem_xattr_validate(const char *name)
2237 {
2238         struct { const char *prefix; size_t len; } arr[] = {
2239                 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
2240                 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
2241         };
2242         int i;
2243
2244         for (i = 0; i < ARRAY_SIZE(arr); i++) {
2245                 size_t preflen = arr[i].len;
2246                 if (strncmp(name, arr[i].prefix, preflen) == 0) {
2247                         if (!name[preflen])
2248                                 return -EINVAL;
2249                         return 0;
2250                 }
2251         }
2252         return -EOPNOTSUPP;
2253 }
2254
2255 static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2256                               void *buffer, size_t size)
2257 {
2258         struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2259         int err;
2260
2261         /*
2262          * If this is a request for a synthetic attribute in the system.*
2263          * namespace use the generic infrastructure to resolve a handler
2264          * for it via sb->s_xattr.
2265          */
2266         if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2267                 return generic_getxattr(dentry, name, buffer, size);
2268
2269         err = shmem_xattr_validate(name);
2270         if (err)
2271                 return err;
2272
2273         return simple_xattr_get(&info->xattrs, name, buffer, size);
2274 }
2275
2276 static int shmem_setxattr(struct dentry *dentry, const char *name,
2277                           const void *value, size_t size, int flags)
2278 {
2279         struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2280         int err;
2281
2282         /*
2283          * If this is a request for a synthetic attribute in the system.*
2284          * namespace use the generic infrastructure to resolve a handler
2285          * for it via sb->s_xattr.
2286          */
2287         if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2288                 return generic_setxattr(dentry, name, value, size, flags);
2289
2290         err = shmem_xattr_validate(name);
2291         if (err)
2292                 return err;
2293
2294         return simple_xattr_set(&info->xattrs, name, value, size, flags);
2295 }
2296
2297 static int shmem_removexattr(struct dentry *dentry, const char *name)
2298 {
2299         struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2300         int err;
2301
2302         /*
2303          * If this is a request for a synthetic attribute in the system.*
2304          * namespace use the generic infrastructure to resolve a handler
2305          * for it via sb->s_xattr.
2306          */
2307         if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2308                 return generic_removexattr(dentry, name);
2309
2310         err = shmem_xattr_validate(name);
2311         if (err)
2312                 return err;
2313
2314         return simple_xattr_remove(&info->xattrs, name);
2315 }
2316
2317 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2318 {
2319         struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2320         return simple_xattr_list(&info->xattrs, buffer, size);
2321 }
2322 #endif /* CONFIG_TMPFS_XATTR */
2323
2324 static const struct inode_operations shmem_short_symlink_operations = {
2325         .readlink       = generic_readlink,
2326         .follow_link    = shmem_follow_short_symlink,
2327 #ifdef CONFIG_TMPFS_XATTR
2328         .setxattr       = shmem_setxattr,
2329         .getxattr       = shmem_getxattr,
2330         .listxattr      = shmem_listxattr,
2331         .removexattr    = shmem_removexattr,
2332 #endif
2333 };
2334
2335 static const struct inode_operations shmem_symlink_inode_operations = {
2336         .readlink       = generic_readlink,
2337         .follow_link    = shmem_follow_link,
2338         .put_link       = shmem_put_link,
2339 #ifdef CONFIG_TMPFS_XATTR
2340         .setxattr       = shmem_setxattr,
2341         .getxattr       = shmem_getxattr,
2342         .listxattr      = shmem_listxattr,
2343         .removexattr    = shmem_removexattr,
2344 #endif
2345 };
2346
/*
 * get_parent export operation: no parent lookup is implemented, so the
 * answer for any @child is ERR_PTR(-ESTALE).
 */
static struct dentry *shmem_get_parent(struct dentry *child)
{
	struct dentry *parent = ERR_PTR(-ESTALE);

	return parent;
}
2351
2352 static int shmem_match(struct inode *ino, void *vfh)
2353 {
2354         __u32 *fh = vfh;
2355         __u64 inum = fh[2];
2356         inum = (inum << 32) | fh[1];
2357         return ino->i_ino == inum && fh[0] == ino->i_generation;
2358 }
2359
2360 static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2361                 struct fid *fid, int fh_len, int fh_type)
2362 {
2363         struct inode *inode;
2364         struct dentry *dentry = NULL;
2365         u64 inum;
2366
2367         if (fh_len < 3)
2368                 return NULL;
2369
2370         inum = fid->raw[2];
2371         inum = (inum << 32) | fid->raw[1];
2372
2373         inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2374                         shmem_match, fid->raw);
2375         if (inode) {
2376                 dentry = d_find_alias(inode);
2377                 iput(inode);
2378         }
2379
2380         return dentry;
2381 }
2382
2383 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
2384                                 struct inode *parent)
2385 {
2386         if (*len < 3) {
2387                 *len = 3;
2388                 return FILEID_INVALID;
2389         }
2390
2391         if (inode_unhashed(inode)) {
2392                 /* Unfortunately insert_inode_hash is not idempotent,
2393                  * so as we hash inodes here rather than at creation
2394                  * time, we need a lock to ensure we only try
2395                  * to do it once
2396                  */
2397                 static DEFINE_SPINLOCK(lock);
2398                 spin_lock(&lock);
2399                 if (inode_unhashed(inode))
2400                         __insert_inode_hash(inode,
2401                                             inode->i_ino + inode->i_generation);
2402                 spin_unlock(&lock);
2403         }
2404
2405         fh[0] = inode->i_generation;
2406         fh[1] = inode->i_ino;
2407         fh[2] = ((__u64)inode->i_ino) >> 32;
2408
2409         *len = 3;
2410         return 1;
2411 }
2412
2413 static const struct export_operations shmem_export_ops = {
2414         .get_parent     = shmem_get_parent,
2415         .encode_fh      = shmem_encode_fh,
2416         .fh_to_dentry   = shmem_fh_to_dentry,
2417 };
2418
2419 static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2420                                bool remount)
2421 {
2422         char *this_char, *value, *rest;
2423         struct mempolicy *mpol = NULL;
2424         uid_t uid;
2425         gid_t gid;
2426
2427         while (options != NULL) {
2428                 this_char = options;
2429                 for (;;) {
2430                         /*
2431                          * NUL-terminate this option: unfortunately,
2432                          * mount options form a comma-separated list,
2433                          * but mpol's nodelist may also contain commas.
2434                          */
2435                         options = strchr(options, ',');
2436                         if (options == NULL)
2437                                 break;
2438                         options++;
2439                         if (!isdigit(*options)) {
2440                                 options[-1] = '\0';
2441                                 break;
2442                         }
2443                 }
2444                 if (!*this_char)
2445                         continue;
2446                 if ((value = strchr(this_char,'=')) != NULL) {
2447                         *value++ = 0;
2448                 } else {
2449                         printk(KERN_ERR
2450                             "tmpfs: No value for mount option '%s'\n",
2451                             this_char);
2452                         goto error;
2453                 }
2454
2455                 if (!strcmp(this_char,"size")) {
2456                         unsigned long long size;
2457                         size = memparse(value,&rest);
2458                         if (*rest == '%') {
2459                                 size <<= PAGE_SHIFT;
2460                                 size *= totalram_pages;
2461                                 do_div(size, 100);
2462                                 rest++;
2463                         }
2464                         if (*rest)
2465                                 goto bad_val;
2466                         sbinfo->max_blocks =
2467                                 DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
2468                 } else if (!strcmp(this_char,"nr_blocks")) {
2469                         sbinfo->max_blocks = memparse(value, &rest);
2470                         if (*rest)
2471                                 goto bad_val;
2472                 } else if (!strcmp(this_char,"nr_inodes")) {
2473                         sbinfo->max_inodes = memparse(value, &rest);
2474                         if (*rest)
2475                                 goto bad_val;
2476                 } else if (!strcmp(this_char,"mode")) {
2477                         if (remount)
2478                                 continue;
2479                         sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
2480                         if (*rest)
2481                                 goto bad_val;
2482                 } else if (!strcmp(this_char,"uid")) {
2483                         if (remount)
2484                                 continue;
2485                         uid = simple_strtoul(value, &rest, 0);
2486                         if (*rest)
2487                                 goto bad_val;
2488                         sbinfo->uid = make_kuid(current_user_ns(), uid);
2489                         if (!uid_valid(sbinfo->uid))
2490                                 goto bad_val;
2491                 } else if (!strcmp(this_char,"gid")) {
2492                         if (remount)
2493                                 continue;
2494                         gid = simple_strtoul(value, &rest, 0);
2495                         if (*rest)
2496                                 goto bad_val;
2497                         sbinfo->gid = make_kgid(current_user_ns(), gid);
2498                         if (!gid_valid(sbinfo->gid))
2499                                 goto bad_val;
2500                 } else if (!strcmp(this_char,"mpol")) {
2501                         mpol_put(mpol);
2502                         mpol = NULL;
2503                         if (mpol_parse_str(value, &mpol))
2504                                 goto bad_val;
2505                 } else {
2506                         printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2507                                this_char);
2508                         goto error;
2509                 }
2510         }
2511         sbinfo->mpol = mpol;
2512         return 0;
2513
2514 bad_val:
2515         printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2516                value, this_char);
2517 error:
2518         mpol_put(mpol);
2519         return 1;
2520
2521 }
2522
2523 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2524 {
2525         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2526         struct shmem_sb_info config = *sbinfo;
2527         unsigned long inodes;
2528         int error = -EINVAL;
2529
2530         config.mpol = NULL;
2531         if (shmem_parse_options(data, &config, true))
2532                 return error;
2533
2534         spin_lock(&sbinfo->stat_lock);
2535         inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2536         if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
2537                 goto out;
2538         if (config.max_inodes < inodes)
2539                 goto out;
2540         /*
2541          * Those tests disallow limited->unlimited while any are in use;
2542          * but we must separately disallow unlimited->limited, because
2543          * in that case we have no record of how much is already in use.
2544          */
2545         if (config.max_blocks && !sbinfo->max_blocks)
2546                 goto out;
2547         if (config.max_inodes && !sbinfo->max_inodes)
2548                 goto out;
2549
2550         error = 0;
2551         sbinfo->max_blocks  = config.max_blocks;
2552         sbinfo->max_inodes  = config.max_inodes;
2553         sbinfo->free_inodes = config.max_inodes - inodes;
2554
2555         /*
2556          * Preserve previous mempolicy unless mpol remount option was specified.
2557          */
2558         if (config.mpol) {
2559                 mpol_put(sbinfo->mpol);
2560                 sbinfo->mpol = config.mpol;     /* transfers initial ref */
2561         }
2562 out:
2563         spin_unlock(&sbinfo->stat_lock);
2564         return error;
2565 }
2566
2567 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
2568 {
2569         struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
2570
2571         if (sbinfo->max_blocks != shmem_default_max_blocks())
2572                 seq_printf(seq, ",size=%luk",
2573                         sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
2574         if (sbinfo->max_inodes != shmem_default_max_inodes())
2575                 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
2576         if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
2577                 seq_printf(seq, ",mode=%03ho", sbinfo->mode);
2578         if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
2579                 seq_printf(seq, ",uid=%u",
2580                                 from_kuid_munged(&init_user_ns, sbinfo->uid));
2581         if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
2582                 seq_printf(seq, ",gid=%u",
2583                                 from_kgid_munged(&init_user_ns, sbinfo->gid));
2584         shmem_show_mpol(seq, sbinfo->mpol);
2585         return 0;
2586 }
2587 #endif /* CONFIG_TMPFS */
2588
2589 static void shmem_put_super(struct super_block *sb)
2590 {
2591         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2592
2593         percpu_counter_destroy(&sbinfo->used_blocks);
2594         mpol_put(sbinfo->mpol);
2595         kfree(sbinfo);
2596         sb->s_fs_info = NULL;
2597 }
2598
2599 int shmem_fill_super(struct super_block *sb, void *data, int silent)
2600 {
2601         struct inode *inode;
2602         struct shmem_sb_info *sbinfo;
2603         int err = -ENOMEM;
2604
2605         /* Round up to L1_CACHE_BYTES to resist false sharing */
2606         sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
2607                                 L1_CACHE_BYTES), GFP_KERNEL);
2608         if (!sbinfo)
2609                 return -ENOMEM;
2610
2611         sbinfo->mode = S_IRWXUGO | S_ISVTX;
2612         sbinfo->uid = current_fsuid();
2613         sbinfo->gid = current_fsgid();
2614         sb->s_fs_info = sbinfo;
2615
2616 #ifdef CONFIG_TMPFS
2617         /*
2618          * Per default we only allow half of the physical ram per
2619          * tmpfs instance, limiting inodes to one per page of lowmem;
2620          * but the internal instance is left unlimited.
2621          */
2622         if (!(sb->s_flags & MS_KERNMOUNT)) {
2623                 sbinfo->max_blocks = shmem_default_max_blocks();
2624                 sbinfo->max_inodes = shmem_default_max_inodes();
2625                 if (shmem_parse_options(data, sbinfo, false)) {
2626                         err = -EINVAL;
2627                         goto failed;
2628                 }
2629         } else {
2630                 sb->s_flags |= MS_NOUSER;
2631         }
2632         sb->s_export_op = &shmem_export_ops;
2633         sb->s_flags |= MS_NOSEC;
2634 #else
2635         sb->s_flags |= MS_NOUSER;
2636 #endif
2637
2638         spin_lock_init(&sbinfo->stat_lock);
2639         if (percpu_counter_init(&sbinfo->used_blocks, 0))
2640                 goto failed;
2641         sbinfo->free_inodes = sbinfo->max_inodes;
2642
2643         sb->s_maxbytes = MAX_LFS_FILESIZE;
2644         sb->s_blocksize = PAGE_CACHE_SIZE;
2645         sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2646         sb->s_magic = TMPFS_MAGIC;
2647         sb->s_op = &shmem_ops;
2648         sb->s_time_gran = 1;
2649 #ifdef CONFIG_TMPFS_XATTR
2650         sb->s_xattr = shmem_xattr_handlers;
2651 #endif
2652 #ifdef CONFIG_TMPFS_POSIX_ACL
2653         sb->s_flags |= MS_POSIXACL;
2654 #endif
2655
2656         inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
2657         if (!inode)
2658                 goto failed;
2659         inode->i_uid = sbinfo->uid;
2660         inode->i_gid = sbinfo->gid;
2661         sb->s_root = d_make_root(inode);
2662         if (!sb->s_root)
2663                 goto failed;
2664         return 0;
2665
2666 failed:
2667         shmem_put_super(sb);
2668         return err;
2669 }
2670
2671 static struct kmem_cache *shmem_inode_cachep;
2672
2673 static struct inode *shmem_alloc_inode(struct super_block *sb)
2674 {
2675         struct shmem_inode_info *info;
2676         info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2677         if (!info)
2678                 return NULL;
2679         return &info->vfs_inode;
2680 }
2681
2682 static void shmem_destroy_callback(struct rcu_head *head)
2683 {
2684         struct inode *inode = container_of(head, struct inode, i_rcu);
2685         kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2686 }
2687
2688 static void shmem_destroy_inode(struct inode *inode)
2689 {
2690         if (S_ISREG(inode->i_mode))
2691                 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2692         call_rcu(&inode->i_rcu, shmem_destroy_callback);
2693 }
2694
2695 static void shmem_init_inode(void *foo)
2696 {
2697         struct shmem_inode_info *info = foo;
2698         inode_init_once(&info->vfs_inode);
2699 }
2700
2701 static int shmem_init_inodecache(void)
2702 {
2703         shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2704                                 sizeof(struct shmem_inode_info),
2705                                 0, SLAB_PANIC, shmem_init_inode);
2706         return 0;
2707 }
2708
2709 static void shmem_destroy_inodecache(void)
2710 {
2711         kmem_cache_destroy(shmem_inode_cachep);
2712 }
2713
2714 static const struct address_space_operations shmem_aops = {
2715         .writepage      = shmem_writepage,
2716         .set_page_dirty = __set_page_dirty_no_writeback,
2717 #ifdef CONFIG_TMPFS
2718         .write_begin    = shmem_write_begin,
2719         .write_end      = shmem_write_end,
2720 #endif
2721         .migratepage    = migrate_page,
2722         .error_remove_page = generic_error_remove_page,
2723 };
2724
2725 static const struct file_operations shmem_file_operations = {
2726         .mmap           = shmem_mmap,
2727 #ifdef CONFIG_TMPFS
2728         .llseek         = shmem_file_llseek,
2729         .read           = new_sync_read,
2730         .write          = new_sync_write,
2731         .read_iter      = shmem_file_read_iter,
2732         .write_iter     = generic_file_write_iter,
2733         .fsync          = noop_fsync,
2734         .splice_read    = shmem_file_splice_read,
2735         .splice_write   = iter_file_splice_write,
2736         .fallocate      = shmem_fallocate,
2737 #endif
2738 };
2739
2740 static const struct inode_operations shmem_inode_operations = {
2741         .setattr        = shmem_setattr,
2742 #ifdef CONFIG_TMPFS_XATTR
2743         .setxattr       = shmem_setxattr,
2744         .getxattr       = shmem_getxattr,
2745         .listxattr      = shmem_listxattr,
2746         .removexattr    = shmem_removexattr,
2747         .set_acl        = simple_set_acl,
2748 #endif
2749 };
2750
2751 static const struct inode_operations shmem_dir_inode_operations = {
2752 #ifdef CONFIG_TMPFS
2753         .create         = shmem_create,
2754         .lookup         = simple_lookup,
2755         .link           = shmem_link,
2756         .unlink         = shmem_unlink,
2757         .symlink        = shmem_symlink,
2758         .mkdir          = shmem_mkdir,
2759         .rmdir          = shmem_rmdir,
2760         .mknod          = shmem_mknod,
2761         .rename         = shmem_rename,
2762         .tmpfile        = shmem_tmpfile,
2763 #endif
2764 #ifdef CONFIG_TMPFS_XATTR
2765         .setxattr       = shmem_setxattr,
2766         .getxattr       = shmem_getxattr,
2767         .listxattr      = shmem_listxattr,
2768         .removexattr    = shmem_removexattr,
2769 #endif
2770 #ifdef CONFIG_TMPFS_POSIX_ACL
2771         .setattr        = shmem_setattr,
2772         .set_acl        = simple_set_acl,
2773 #endif
2774 };
2775
2776 static const struct inode_operations shmem_special_inode_operations = {
2777 #ifdef CONFIG_TMPFS_XATTR
2778         .setxattr       = shmem_setxattr,
2779         .getxattr       = shmem_getxattr,
2780         .listxattr      = shmem_listxattr,
2781         .removexattr    = shmem_removexattr,
2782 #endif
2783 #ifdef CONFIG_TMPFS_POSIX_ACL
2784         .setattr        = shmem_setattr,
2785         .set_acl        = simple_set_acl,
2786 #endif
2787 };
2788
2789 static const struct super_operations shmem_ops = {
2790         .alloc_inode    = shmem_alloc_inode,
2791         .destroy_inode  = shmem_destroy_inode,
2792 #ifdef CONFIG_TMPFS
2793         .statfs         = shmem_statfs,
2794         .remount_fs     = shmem_remount_fs,
2795         .show_options   = shmem_show_options,
2796 #endif
2797         .evict_inode    = shmem_evict_inode,
2798         .drop_inode     = generic_delete_inode,
2799         .put_super      = shmem_put_super,
2800 };
2801
2802 static const struct vm_operations_struct shmem_vm_ops = {
2803         .fault          = shmem_fault,
2804         .map_pages      = filemap_map_pages,
2805 #ifdef CONFIG_NUMA
2806         .set_policy     = shmem_set_policy,
2807         .get_policy     = shmem_get_policy,
2808 #endif
2809         .remap_pages    = generic_file_remap_pages,
2810 };
2811
2812 static struct dentry *shmem_mount(struct file_system_type *fs_type,
2813         int flags, const char *dev_name, void *data)
2814 {
2815         return mount_nodev(fs_type, flags, data, shmem_fill_super);
2816 }
2817
2818 static struct file_system_type shmem_fs_type = {
2819         .owner          = THIS_MODULE,
2820         .name           = "tmpfs",
2821         .mount          = shmem_mount,
2822         .kill_sb        = kill_litter_super,
2823         .fs_flags       = FS_USERNS_MOUNT,
2824 };
2825
2826 int __init shmem_init(void)
2827 {
2828         int error;
2829
2830         /* If rootfs called this, don't re-init */
2831         if (shmem_inode_cachep)
2832                 return 0;
2833
2834         error = bdi_init(&shmem_backing_dev_info);
2835         if (error)
2836                 goto out4;
2837
2838         error = shmem_init_inodecache();
2839         if (error)
2840                 goto out3;
2841
2842         error = register_filesystem(&shmem_fs_type);
2843         if (error) {
2844                 printk(KERN_ERR "Could not register tmpfs\n");
2845                 goto out2;
2846         }
2847
2848         shm_mnt = kern_mount(&shmem_fs_type);
2849         if (IS_ERR(shm_mnt)) {
2850                 error = PTR_ERR(shm_mnt);
2851                 printk(KERN_ERR "Could not kern_mount tmpfs\n");
2852                 goto out1;
2853         }
2854         return 0;
2855
2856 out1:
2857         unregister_filesystem(&shmem_fs_type);
2858 out2:
2859         shmem_destroy_inodecache();
2860 out3:
2861         bdi_destroy(&shmem_backing_dev_info);
2862 out4:
2863         shm_mnt = ERR_PTR(error);
2864         return error;
2865 }
2866
2867 #else /* !CONFIG_SHMEM */
2868
2869 /*
2870  * tiny-shmem: simple shmemfs and tmpfs using ramfs code
2871  *
2872  * This is intended for small system where the benefits of the full
2873  * shmem code (swap-backed and resource-limited) are outweighed by
2874  * their complexity. On systems without swap this code should be
2875  * effectively equivalent, but much lighter weight.
2876  */
2877
2878 static struct file_system_type shmem_fs_type = {
2879         .name           = "tmpfs",
2880         .mount          = ramfs_mount,
2881         .kill_sb        = kill_litter_super,
2882         .fs_flags       = FS_USERNS_MOUNT,
2883 };
2884
2885 int __init shmem_init(void)
2886 {
2887         BUG_ON(register_filesystem(&shmem_fs_type) != 0);
2888
2889         shm_mnt = kern_mount(&shmem_fs_type);
2890         BUG_ON(IS_ERR(shm_mnt));
2891
2892         return 0;
2893 }
2894
2895 int shmem_unuse(swp_entry_t swap, struct page *page)
2896 {
2897         return 0;
2898 }
2899
/* tiny-shmem stub: ignores all arguments and reports success (0). */
int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
	return 0;
}
2904
/* tiny-shmem stub: intentionally empty. */
void shmem_unlock_mapping(struct address_space *mapping)
{
}
2908
2909 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
2910 {
2911         truncate_inode_pages_range(inode->i_mapping, lstart, lend);
2912 }
2913 EXPORT_SYMBOL_GPL(shmem_truncate_range);
2914
/*
 * tiny-shmem: map the names used by the common code below onto their
 * generic/ramfs counterparts.  Size accounting is compiled out:
 * shmem_acct_size() always yields 0 and shmem_unacct_size() is a no-op.
 */
#define shmem_vm_ops				generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
#define shmem_get_inode(sb, dir, mode, dev, flags) \
	ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size)		(0)
#define shmem_unacct_size(flags, size)		do { } while (0)
2920
2921 #endif /* CONFIG_SHMEM */
2922
2923 /* common code */
2924
2925 static struct dentry_operations anon_ops = {
2926         .d_dname = simple_dname
2927 };
2928
2929 static struct file *__shmem_file_setup(const char *name, loff_t size,
2930                                        unsigned long flags, unsigned int i_flags)
2931 {
2932         struct file *res;
2933         struct inode *inode;
2934         struct path path;
2935         struct super_block *sb;
2936         struct qstr this;
2937
2938         if (IS_ERR(shm_mnt))
2939                 return ERR_CAST(shm_mnt);
2940
2941         if (size < 0 || size > MAX_LFS_FILESIZE)
2942                 return ERR_PTR(-EINVAL);
2943
2944         if (shmem_acct_size(flags, size))
2945                 return ERR_PTR(-ENOMEM);
2946
2947         res = ERR_PTR(-ENOMEM);
2948         this.name = name;
2949         this.len = strlen(name);
2950         this.hash = 0; /* will go */
2951         sb = shm_mnt->mnt_sb;
2952         path.mnt = mntget(shm_mnt);
2953         path.dentry = d_alloc_pseudo(sb, &this);
2954         if (!path.dentry)
2955                 goto put_memory;
2956         d_set_d_op(path.dentry, &anon_ops);
2957
2958         res = ERR_PTR(-ENOSPC);
2959         inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
2960         if (!inode)
2961                 goto put_memory;
2962
2963         inode->i_flags |= i_flags;
2964         d_instantiate(path.dentry, inode);
2965         inode->i_size = size;
2966         clear_nlink(inode);     /* It is unlinked */
2967         res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
2968         if (IS_ERR(res))
2969                 goto put_path;
2970
2971         res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2972                   &shmem_file_operations);
2973         if (IS_ERR(res))
2974                 goto put_path;
2975
2976         return res;
2977
2978 put_memory:
2979         shmem_unacct_size(flags, size);
2980 put_path:
2981         path_put(&path);
2982         return res;
2983 }
2984
2985 /**
2986  * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
2987  *      kernel internal.  There will be NO LSM permission checks against the
2988  *      underlying inode.  So users of this interface must do LSM checks at a
2989  *      higher layer.  The one user is the big_key implementation.  LSM checks
2990  *      are provided at the key level rather than the inode level.
2991  * @name: name for dentry (to be seen in /proc/<pid>/maps
2992  * @size: size to be set for the file
2993  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2994  */
2995 struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
2996 {
2997         return __shmem_file_setup(name, size, flags, S_PRIVATE);
2998 }
2999
3000 /**
3001  * shmem_file_setup - get an unlinked file living in tmpfs
3002  * @name: name for dentry (to be seen in /proc/<pid>/maps
3003  * @size: size to be set for the file
3004  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
3005  */
3006 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
3007 {
3008         return __shmem_file_setup(name, size, flags, 0);
3009 }
3010 EXPORT_SYMBOL_GPL(shmem_file_setup);
3011
3012 /**
3013  * shmem_zero_setup - setup a shared anonymous mapping
3014  * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
3015  */
3016 int shmem_zero_setup(struct vm_area_struct *vma)
3017 {
3018         struct file *file;
3019         loff_t size = vma->vm_end - vma->vm_start;
3020
3021         file = shmem_file_setup("dev/zero", size, vma->vm_flags);
3022         if (IS_ERR(file))
3023                 return PTR_ERR(file);
3024
3025         if (vma->vm_file)
3026                 fput(vma->vm_file);
3027         vma->vm_file = file;
3028         vma->vm_ops = &shmem_vm_ops;
3029         return 0;
3030 }
3031
3032 /**
3033  * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
3034  * @mapping:    the page's address_space
3035  * @index:      the page index
3036  * @gfp:        the page allocator flags to use if allocating
3037  *
3038  * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
3039  * with any new page allocations done using the specified allocation flags.
3040  * But read_cache_page_gfp() uses the ->readpage() method: which does not
3041  * suit tmpfs, since it may have pages in swapcache, and needs to find those
3042  * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3043  *
3044  * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
3045  * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
3046  */
3047 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3048                                          pgoff_t index, gfp_t gfp)
3049 {
3050 #ifdef CONFIG_SHMEM
3051         struct inode *inode = mapping->host;
3052         struct page *page;
3053         int error;
3054
3055         BUG_ON(mapping->a_ops != &shmem_aops);
3056         error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
3057         if (error)
3058                 page = ERR_PTR(error);
3059         else
3060                 unlock_page(page);
3061         return page;
3062 #else
3063         /*
3064          * The tiny !SHMEM case uses ramfs without swap
3065          */
3066         return read_cache_page_gfp(mapping, index, gfp);
3067 #endif
3068 }
3069 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);