1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
7 #include <linux/spinlock.h>
8 #include <linux/completion.h>
9 #include <linux/buffer_head.h>
10 #include <linux/blkdev.h>
11 #include <linux/gfs2_ondisk.h>
12 #include <linux/crc32.h>
13 #include <linux/iomap.h>
14 #include <linux/ktime.h>
30 #include "trace_gfs2.h"
32 /* This doesn't need to be that large as max 64 bit pointers in a 4k
33 * block is 512, so __u16 is fine for that. It saves stack space to
/* NOTE(review): the "struct metapath {" opener is not visible in this fragment. */
37 struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; /* cached buffer at each tree level */
38 __u16 mp_list[GFS2_MAX_META_HEIGHT]; /* pointer index within each level's block */
39 int mp_fheight; /* find_metapath height */
40 int mp_aheight; /* actual height (lookup height) */
43 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
46 * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio
 * @ip: the inode being unstuffed
48 * @dibh: the dinode buffer
49 * @block: the block number that was allocated
 * @folio: the folio that will hold the former inline data
 *
 * Copies the inline data that follows the on-disk dinode header into
 * @folio, zero-fills the remainder, and marks the folio dirty.  For
 * jdata inodes the folio's buffer is mapped to @block and added to the
 * current transaction; otherwise the inode goes on the ordered-write
 * list.  NOTE(review): intermediate lines are missing from this
 * fragment, so the error paths are not visible here.
 */
54 static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh,
55 u64 block, struct folio *folio)
57 struct inode *inode = &ip->i_inode;
59 if (!folio_test_uptodate(folio)) {
60 void *kaddr = kmap_local_folio(folio, 0);
61 u64 dsize = i_size_read(inode);
 /* Inline data lives immediately after the dinode header. */
63 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
64 memset(kaddr + dsize, 0, folio_size(folio) - dsize);
67 folio_mark_uptodate(folio);
70 if (gfs2_is_jdata(ip)) {
71 struct buffer_head *bh = folio_buffers(folio);
 /* No buffers yet: attach uptodate buffers sized to the fs block. */
74 bh = create_empty_buffers(folio,
75 BIT(inode->i_blkbits), BIT(BH_Uptodate));
77 if (!buffer_mapped(bh))
78 map_bh(bh, inode->i_sb, block);
80 set_buffer_uptodate(bh);
81 gfs2_trans_add_data(ip->i_gl, bh); /* journal the data buffer */
83 folio_mark_dirty(folio);
84 gfs2_ordered_add_inode(ip); /* non-jdata: ordered-write list */
/*
 * __gfs2_unstuff_inode - move a stuffed inode's inline data into a real block
 * @ip: the inode to unstuff
 * @folio: page-cache folio backing block 0 of the file
 *
 * If the file is non-empty, allocates one block, copies the inline data
 * into it (via gfs2_dir_get_new_buffer for directories or
 * gfs2_unstuffer_folio for regular files -- the isdir branch lines are
 * missing from this fragment), then clears the dinode tail, points the
 * first dinode pointer at the new block and sets the tree height to 1.
 */
90 static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio)
92 struct buffer_head *bh, *dibh;
93 struct gfs2_dinode *di;
95 int isdir = gfs2_is_dir(ip);
98 error = gfs2_meta_inode_buffer(ip, &dibh);
102 if (i_size_read(&ip->i_inode)) {
103 /* Get a free block, fill it with the stuffed data,
104 and write it out to disk */
107 error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
111 gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
112 error = gfs2_dir_get_new_buffer(ip, block, &bh);
 /* Copy everything past the dinode header into the new block. */
115 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
116 dibh, sizeof(struct gfs2_dinode));
119 error = gfs2_unstuffer_folio(ip, dibh, block, folio);
125 /* Set up the pointer to the new block */
127 gfs2_trans_add_meta(ip->i_gl, dibh);
128 di = (struct gfs2_dinode *)dibh->b_data;
129 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
131 if (i_size_read(&ip->i_inode)) {
 /* First pointer slot follows the dinode header. */
132 *(__be64 *)(di + 1) = cpu_to_be64(block);
133 gfs2_add_inode_blocks(&ip->i_inode, 1);
134 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
138 di->di_height = cpu_to_be16(1); /* tree now has one level of indirection */
146 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
147 * @ip: The GFS2 inode to unstuff
 *
149 * This routine unstuffs a dinode and returns it to a "normal" state such
150 * that the height can be grown in the traditional way.
 *
 * Takes i_rw_mutex for writing, grabs folio 0 of the inode's mapping and
 * delegates to __gfs2_unstuff_inode.  The folio unlock/put and error
 * checks between the visible lines are missing from this fragment.
 */
155 int gfs2_unstuff_dinode(struct gfs2_inode *ip)
157 struct inode *inode = &ip->i_inode;
161 down_write(&ip->i_rw_mutex);
162 folio = filemap_grab_folio(inode->i_mapping, 0);
163 error = PTR_ERR(folio); /* filemap_grab_folio returns ERR_PTR on failure */
166 error = __gfs2_unstuff_inode(ip, folio);
170 up_write(&ip->i_rw_mutex);
175 * find_metapath - Find path through the metadata tree
176 * @sdp: The superblock
177 * @block: The disk block to look up
178 * @mp: The metapath to return the result in
179 * @height: The pre-calculated height of the metadata tree
 *
181 * This routine returns a struct metapath structure that defines a path
182 * through the metadata of inode "ip" to get to block "block".
 *
185 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
186 * filesystem with a blocksize of 4096.
 *
188 * find_metapath() would return a struct metapath structure set to:
189 * mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 *
191 * That means that in order to get to the block containing the byte at
192 * offset 101342453, we would load the indirect block pointed to by pointer
193 * 0 in the dinode. We would then load the indirect block pointed to by
194 * pointer 48 in that indirect block. We would then load the data block
195 * pointed to by pointer 165 in that indirect block.
 *
 * (The ASCII diagram below is partially garbled by missing lines.)
197 * ----------------------------------------
202 * ----------------------------------------
206 * ----------------------------------------
210 * |0 5 6 7 8 9 0 1 2|
211 * ----------------------------------------
215 * ----------------------------------------
220 * ----------------------------------------
224 * ----------------------------------------
225 * | Data block containing offset |
229 * ----------------------------------------
 */
233 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
234 struct metapath *mp, unsigned int height)
238 mp->mp_fheight = height;
 /* Peel off one pointer index per level, lowest level first. */
239 for (i = height; i--;)
240 mp->mp_list[i] = do_div(block, sdp->sd_inptrs); /* do_div: block /= inptrs, returns remainder */
/*
 * metapath_branch_start - height at which a new branch of the tree starts
 * NOTE(review): the return statements are not visible in this fragment;
 * only the mp_list[0] == 0 test survives.  Confirm against full source.
 */
243 static inline unsigned int metapath_branch_start(const struct metapath *mp)
245 if (mp->mp_list[0] == 0)
251 * metaptr1 - Return the first possible metadata pointer in a metapath buffer
252 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 */
255 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
257 struct buffer_head *bh = mp->mp_bh[height];
 /* NOTE(review): the height test selecting between these two returns
  * (dinode header vs. meta header offset) is missing from this fragment. */
259 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
260 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
264 * metapointer - Return pointer to start of metadata in a buffer
265 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 *
268 * Return a pointer to the block number of the next height of the metadata
269 * tree given a buffer containing the pointer to the current height of the
 * metadata tree (i.e. the slot selected by mp_list[height]).
 */
273 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
275 __be64 *p = metaptr1(height, mp);
276 return p + mp->mp_list[height];
/* metaend - one-past-the-end of the pointers in the buffer at @height. */
279 static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
281 const struct buffer_head *bh = mp->mp_bh[height];
282 return (const __be64 *)(bh->b_data + bh->b_size);
/*
 * clone_metapath - duplicate a metapath, taking a reference on each buffer
 * The struct copy itself is on a missing line; only the get_bh() loop
 * over the allocated heights is visible here.
 */
285 static void clone_metapath(struct metapath *clone, struct metapath *mp)
290 for (hgt = 0; hgt < mp->mp_aheight; hgt++)
291 get_bh(clone->mp_bh[hgt]);
/*
 * gfs2_metapath_ra - issue readahead for a range of indirect pointers
 * @gl: the inode glock (used to get buffers)
 * @start: first pointer to read ahead
 * @end: one past the last pointer
 *
 * Best-effort: buffers that are already locked or uptodate are skipped
 * (the skip/unlock/brelse lines are missing from this fragment).
 */
294 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
298 for (t = start; t < end; t++) {
299 struct buffer_head *rabh;
304 rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
305 if (trylock_buffer(rabh)) {
306 if (!buffer_uptodate(rabh)) {
307 rabh->b_end_io = end_buffer_read_sync;
 /* Async read-ahead: REQ_RAHEAD may be dropped by the block layer. */
308 submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
/*
 * __fillup_metapath - read indirect blocks from height @x down toward @h
 * Follows the on-disk pointer at each level and reads the next indirect
 * block into mp->mp_bh[].  Updates mp_aheight to the deepest level
 * actually reached.  The surrounding loop/termination lines are missing
 * from this fragment.
 */
318 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
319 unsigned int x, unsigned int h)
322 __be64 *ptr = metapointer(x, mp);
323 u64 dblock = be64_to_cpu(*ptr);
328 ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]);
332 mp->mp_aheight = x + 1;
337 * lookup_metapath - Walk the metadata tree to a specific point
 * @ip: The inode
 * @mp: The metapath
 *
341 * Assumes that the inode's buffer has already been looked up and
342 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
343 * by find_metapath().
 *
345 * If this function encounters part of the tree which has not been
346 * allocated, it returns the current height of the tree at the point
347 * at which it found the unallocated block. Blocks which are found are
348 * added to the mp->mp_bh[] list.
 */
353 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
355 return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
359 * fillup_metapath - fill up buffers for the metadata path to a specific height
 * @ip: The inode
 * @mp: The metapath
362 * @h: The height to which it should be mapped
 *
364 * Similar to lookup_metapath, but does lookups for a range of heights
 * starting from the first missing buffer rather than from the dinode.
 *
366 * Returns: error or the number of buffers filled
 */
369 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
375 /* find the first buffer we need to look up. */
376 for (x = h - 1; x > 0; x--) {
 /* (the mp_bh[x] presence test inside this loop is missing here) */
381 ret = __fillup_metapath(ip, mp, x, h);
384 return mp->mp_aheight - x - 1;
/*
 * metapath_to_block - convert a metapath back into a logical block number
 * Inverse of find_metapath for the allocated part of the path: sums
 * mp_list[hgt] * inptrs^(fheight-1-hgt) over the levels below aheight.
 */
387 static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
389 sector_t factor = 1, block = 0;
392 for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
393 if (hgt < mp->mp_aheight)
394 block += mp->mp_list[hgt] * factor;
395 factor *= sdp->sd_inptrs;
/*
 * release_metapath - drop all buffer references held by a metapath
 * Safe on a partially-filled path; stops at the first NULL slot.
 */
400 static void release_metapath(struct metapath *mp)
404 for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
405 if (mp->mp_bh[i] == NULL)
407 brelse(mp->mp_bh[i]);
413 * gfs2_extent_length - Returns length of an extent of blocks
414 * @bh: The metadata block
415 * @ptr: Current position in @bh
416 * @limit: Max extent length to return
417 * @eob: Set to 1 if we hit "end of block"
 *
419 * Returns: The length of the extent (minimum of one block)
 *
 * Scans consecutive pointers for block numbers that increase by one per
 * slot (the per-iteration increment/limit checks are on missing lines).
 */
422 static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
424 const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
425 const __be64 *first = ptr;
426 u64 d = be64_to_cpu(*ptr); /* expected next physical block */
434 } while(be64_to_cpu(*ptr) == d);
/* Result of one gfs2_metadata_walker invocation: stop the walk, descend
 * into an indirect block, or keep scanning at the current level. */
440 enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };
 
443 * gfs2_metadata_walker - walk an indirect block
444 * @mp: Metapath to indirect block
445 * @ptrs: Number of pointers to look at
 *
447 * When returning WALK_FOLLOW, the walker must update @mp to point at the right
448 * indirect block to follow.
 */
450 typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
454 * gfs2_walk_metadata - walk a tree of indirect blocks
 * @inode: The inode
456 * @mp: Starting point of walk
457 * @max_len: Maximum number of blocks to walk
458 * @walker: Called during the walk
 *
460 * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
461 * past the end of metadata, and a negative error code otherwise.
 *
 * NOTE(review): the main loop structure (labels, some break/continue and
 * height-change bookkeeping) is partially missing from this fragment.
 */
464 static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
465 u64 max_len, gfs2_metadata_walker walker)
467 struct gfs2_inode *ip = GFS2_I(inode);
468 struct gfs2_sbd *sdp = GFS2_SB(inode);
 
474 * The walk starts in the lowest allocated indirect block, which may be
475 * before the position indicated by @mp. Adjust @max_len accordingly
476 * to avoid a short walk.
 
478 for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
479 max_len += mp->mp_list[hgt] * factor;
480 mp->mp_list[hgt] = 0;
481 factor *= sdp->sd_inptrs;
 
485 u16 start = mp->mp_list[hgt];
486 enum walker_status status;
490 /* Walk indirect block. */
 /* Level 0 is the dinode (sd_diptrs slots); deeper levels hold sd_inptrs. */
491 ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
494 ptrs = DIV_ROUND_UP_ULL(max_len, factor); /* don't walk past max_len */
495 status = walker(mp, ptrs);
 /* Walker advanced mp_list but must not claim full height here. */
500 BUG_ON(mp->mp_aheight == mp->mp_fheight);
501 ptrs = mp->mp_list[hgt] - start; /* blocks consumed by the walker */
510 if (status == WALK_FOLLOW)
511 goto fill_up_metapath;
 
514 /* Decrease height of metapath. */
515 brelse(mp->mp_bh[hgt]);
516 mp->mp_bh[hgt] = NULL;
517 mp->mp_list[hgt] = 0;
521 factor *= sdp->sd_inptrs;
 
523 /* Advance in metadata tree. */
524 (mp->mp_list[hgt])++;
526 if (mp->mp_list[hgt] >= sdp->sd_inptrs)
529 if (mp->mp_list[hgt] >= sdp->sd_diptrs)
 
534 /* Increase height of metapath. */
535 ret = fillup_metapath(ip, mp, ip->i_height - 1);
540 do_div(factor, sdp->sd_inptrs);
541 mp->mp_aheight = hgt + 1;
/*
 * gfs2_hole_walker - walker callback that stops at the first allocated block
 * Scans the bottom allocated indirect block for a non-zero pointer; when
 * one is found at full height the walk stops (WALK_STOP / WALK_FOLLOW
 * lines are missing from this fragment), otherwise the scan continues.
 */
546 static enum walker_status gfs2_hole_walker(struct metapath *mp,
549 const __be64 *start, *ptr, *end;
552 hgt = mp->mp_aheight - 1;
553 start = metapointer(hgt, mp);
556 for (ptr = start; ptr < end; ptr++) {
558 mp->mp_list[hgt] += ptr - start; /* record where the hole ended */
559 if (mp->mp_aheight == mp->mp_fheight)
564 return WALK_CONTINUE;
568 * gfs2_hole_size - figure out the size of a hole
 * @inode: The inode
570 * @lblock: The logical starting block number
571 * @len: How far to look (in blocks)
572 * @mp: The metapath at lblock
573 * @iomap: The iomap to store the hole size in
 *
575 * This function modifies @mp.
 *
577 * Returns: errno on error
 */
579 static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
580 struct metapath *mp, struct iomap *iomap)
582 struct metapath clone;
 /* Walk a clone so the caller's metapath stays positioned at lblock. */
586 clone_metapath(&clone, mp);
587 ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
592 hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
595 iomap->length = hole_size << inode->i_blkbits;
599 release_metapath(&clone);
/*
 * gfs2_indirect_init - create and initialise a new indirect block
 * @mp: the metapath; mp_bh[i] receives the new buffer
 * @gl: the inode glock
 * @i: height of the new block (>= 1; parent at i - 1)
 * @offset: pointer slot in the parent to link through (see *ptr below;
 *          the "+ offset" term appears to be on a missing line -- confirm)
 * @bn: disk block number for the new indirect block
 */
603 static inline void gfs2_indirect_init(struct metapath *mp,
604 struct gfs2_glock *gl, unsigned int i,
605 unsigned offset, u64 bn)
 /* Parent's first pointer slot: dinode header at height 1, meta header deeper. */
607 __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
608 ((i > 1) ? sizeof(struct gfs2_meta_header) :
609 sizeof(struct gfs2_dinode)));
611 BUG_ON(mp->mp_bh[i] != NULL);
612 mp->mp_bh[i] = gfs2_meta_new(gl, bn);
613 gfs2_trans_add_meta(gl, mp->mp_bh[i]);
614 gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
615 gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
617 *ptr = cpu_to_be64(bn); /* link the new block into its parent */
/* States for the __gfs2_iomap_alloc state machine; the enum opener (and
 * the ALLOC_DATA constant it presumably declares) is on missing lines. */
622 ALLOC_GROW_DEPTH = 1,
623 ALLOC_GROW_HEIGHT = 2,
624 /* ALLOC_UNSTUFF = 3, TBD and rather complicated */
628 * __gfs2_iomap_alloc - Build a metadata tree of the requested height
629 * @inode: The GFS2 inode
630 * @iomap: The iomap structure
631 * @mp: The metapath, with proper height information calculated
 *
633 * In this routine we may have to alloc:
634 * i) Indirect blocks to grow the metadata tree height
635 * ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
638 * This function is called after __gfs2_iomap_get, which works out the
639 * total number of blocks which we need via gfs2_alloc_size.
 *
641 * We then do the actual allocation asking for an extent at a time (if
642 * enough contiguous free blocks are available, there will only be one
643 * allocation request per call) and uses the state machine to initialise
644 * the blocks in order.
 *
646 * Right now, this function will allocate at most one indirect block
647 * worth of data -- with a default block size of 4K, that's slightly
648 * less than 2M. If this limitation is ever removed to allow huge
649 * allocations, we would probably still want to limit the iomap size we
650 * return to avoid stalling other tasks during huge writes; the next
651 * iomap iteration would then find the blocks already allocated.
 *
653 * Returns: errno on error
 *
 * NOTE(review): the switch statement wrapper, several loop bodies and
 * error paths are on missing lines in this fragment.
 */
656 static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
659 struct gfs2_inode *ip = GFS2_I(inode);
660 struct gfs2_sbd *sdp = GFS2_SB(inode);
661 struct buffer_head *dibh = mp->mp_bh[0];
663 unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
664 size_t dblks = iomap->length >> inode->i_blkbits; /* data blocks wanted */
665 const unsigned end_of_metadata = mp->mp_fheight - 1;
667 enum alloc_state state;
671 BUG_ON(mp->mp_aheight < 1);
672 BUG_ON(dibh == NULL);
675 gfs2_trans_add_meta(ip->i_gl, dibh);
677 down_write(&ip->i_rw_mutex);
679 if (mp->mp_fheight == mp->mp_aheight) {
680 /* Bottom indirect block exists */
683 /* Need to allocate indirect blocks */
684 if (mp->mp_fheight == ip->i_height) {
685 /* Writing into existing tree, extend tree down */
686 iblks = mp->mp_fheight - mp->mp_aheight;
687 state = ALLOC_GROW_DEPTH;
689 /* Building up tree height */
690 state = ALLOC_GROW_HEIGHT;
691 iblks = mp->mp_fheight - ip->i_height;
692 branch_start = metapath_branch_start(mp);
693 iblks += (mp->mp_fheight - branch_start);
 
697 /* start of the second part of the function (state machine) */
699 blks = dblks + iblks; /* total blocks to allocate */
703 ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
 /* Metadata (and jdata data) blocks need their revokes cancelled. */
707 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
708 gfs2_trans_remove_revoke(sdp, bn, n);
710 /* Growing height of tree */
711 case ALLOC_GROW_HEIGHT:
713 ptr = (__be64 *)(dibh->b_data +
714 sizeof(struct gfs2_dinode));
717 for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
719 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
720 if (i - 1 == mp->mp_fheight - ip->i_height) {
 /* Top of the new tree: move the old top down into it. */
722 gfs2_buffer_copy_tail(mp->mp_bh[i],
723 sizeof(struct gfs2_meta_header),
724 dibh, sizeof(struct gfs2_dinode));
725 gfs2_buffer_clear_tail(dibh,
726 sizeof(struct gfs2_dinode) +
728 ptr = (__be64 *)(mp->mp_bh[i]->b_data +
729 sizeof(struct gfs2_meta_header));
731 state = ALLOC_GROW_DEPTH;
732 for(i = branch_start; i < mp->mp_fheight; i++) {
733 if (mp->mp_bh[i] == NULL)
735 brelse(mp->mp_bh[i]);
742 fallthrough; /* To branching from existing tree */
743 case ALLOC_GROW_DEPTH:
744 if (i > 1 && i < mp->mp_fheight)
745 gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
746 for (; i < mp->mp_fheight && n > 0; i++, n--)
747 gfs2_indirect_init(mp, ip->i_gl, i,
748 mp->mp_list[i-1], bn++);
749 if (i == mp->mp_fheight)
753 fallthrough; /* To tree complete, adding data blocks */
756 BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
757 gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
759 ptr = metapointer(end_of_metadata, mp);
760 iomap->addr = bn << inode->i_blkbits;
761 iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
763 *ptr++ = cpu_to_be64(bn++); /* fill data pointers in the bottom block */
766 } while (iomap->addr == IOMAP_NULL_ADDR);
 
768 iomap->type = IOMAP_MAPPED;
769 iomap->length = (u64)dblks << inode->i_blkbits;
770 ip->i_height = mp->mp_fheight;
771 gfs2_add_inode_blocks(&ip->i_inode, alloced);
772 gfs2_dinode_out(ip, dibh->b_data);
774 up_write(&ip->i_rw_mutex);
/* Private iomap flag: reading past this extent requires a metadata read. */
778 #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
 
781 * gfs2_alloc_size - Compute the maximum allocation size
 * @inode: The inode
 * @mp: The metapath
784 * @size: Requested size in blocks
 *
786 * Compute the maximum size of the next allocation at @mp.
 *
788 * Returns: size in blocks
 */
790 static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
792 struct gfs2_inode *ip = GFS2_I(inode);
793 struct gfs2_sbd *sdp = GFS2_SB(inode);
794 const __be64 *first, *ptr, *end;
 
797 * For writes to stuffed files, this function is called twice via
798 * __gfs2_iomap_get, before and after unstuffing. The size we return the
799 * first time needs to be large enough to get the reservation and
800 * allocation sizes right. The size we return the second time must
801 * be exact or else __gfs2_iomap_alloc won't do the right thing.
 
804 if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
 /* Limited by pointers remaining in the bottom-level block. */
805 unsigned int maxsize = mp->mp_fheight > 1 ?
806 sdp->sd_inptrs : sdp->sd_diptrs;
807 maxsize -= mp->mp_list[mp->mp_fheight - 1];
 
 /* Fully-allocated path: count unallocated slots at the bottom level. */
813 first = metapointer(ip->i_height - 1, mp);
814 end = metaend(ip->i_height - 1, mp);
815 if (end - first > size)
817 for (ptr = first; ptr < end; ptr++) {
825 * __gfs2_iomap_get - Map blocks from an inode to disk blocks
 * @inode: The inode
827 * @pos: Starting position in bytes
828 * @length: Length to map, in bytes
829 * @flags: iomap flags
830 * @iomap: The iomap structure
 * @mp: The metapath
 *
 * Returns: errno
 *
 * NOTE(review): several goto labels, unlock paths and the unstuffed_
 * size/height edge handling are on missing lines in this fragment.
 */
835 static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
836 unsigned flags, struct iomap *iomap,
839 struct gfs2_inode *ip = GFS2_I(inode);
840 struct gfs2_sbd *sdp = GFS2_SB(inode);
841 loff_t size = i_size_read(inode);
844 sector_t lblock_stop;
848 struct buffer_head *dibh = NULL, *bh;
854 down_read(&ip->i_rw_mutex);
856 ret = gfs2_meta_inode_buffer(ip, &dibh);
 
 /* Stuffed inode: data is inline in the dinode block. */
861 if (gfs2_is_stuffed(ip)) {
862 if (flags & IOMAP_WRITE) {
863 loff_t max_size = gfs2_max_stuffed_size(ip);
865 if (pos + length > max_size)
867 iomap->length = max_size;
870 if (flags & IOMAP_REPORT) {
875 iomap->length = length;
879 iomap->length = size;
 /* Inline data is addressed just past the on-disk dinode header. */
881 iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
882 sizeof(struct gfs2_dinode);
883 iomap->type = IOMAP_INLINE;
884 iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
 
 /* Unstuffed: translate the byte range to logical blocks. */
889 lblock = pos >> inode->i_blkbits;
890 iomap->offset = lblock << inode->i_blkbits;
891 lblock_stop = (pos + length - 1) >> inode->i_blkbits;
892 len = lblock_stop - lblock + 1;
893 iomap->length = len << inode->i_blkbits;
 
 /* Height needed to address lblock; may exceed the current tree. */
895 height = ip->i_height;
896 while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
898 find_metapath(sdp, lblock, mp, height);
899 if (height > ip->i_height || gfs2_is_stuffed(ip))
 
902 ret = lookup_metapath(ip, mp);
906 if (mp->mp_aheight != ip->i_height) /* hit an unallocated block */
 
909 ptr = metapointer(ip->i_height - 1, mp);
913 bh = mp->mp_bh[ip->i_height - 1];
914 len = gfs2_extent_length(bh, ptr, len, &eob);
 
916 iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
917 iomap->length = len << inode->i_blkbits;
918 iomap->type = IOMAP_MAPPED;
919 iomap->flags |= IOMAP_F_MERGED;
921 iomap->flags |= IOMAP_F_GFS2_BOUNDARY; /* extent ends at block boundary */
 
924 iomap->bdev = inode->i_sb->s_bdev;
926 up_read(&ip->i_rw_mutex);
 
 /* Hole handling, reached via missing goto labels above. */
930 if (flags & IOMAP_REPORT) {
933 else if (height == ip->i_height)
934 ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
936 iomap->length = size - iomap->offset;
937 } else if (flags & IOMAP_WRITE) {
940 if (flags & IOMAP_DIRECT)
941 goto out; /* (see gfs2_file_direct_write) */
943 len = gfs2_alloc_size(inode, mp, len);
944 alloc_size = len << inode->i_blkbits;
945 if (alloc_size < iomap->length)
946 iomap->length = alloc_size;
948 if (pos < size && height == ip->i_height)
949 ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
952 iomap->addr = IOMAP_NULL_ADDR;
953 iomap->type = IOMAP_HOLE;
/*
 * gfs2_iomap_get_folio - iomap folio_ops hook: start a transaction, get folio
 * Opens a transaction sized for the blocks this write will touch before
 * taking the folio; the folio-error/trans-end cleanup path is on missing
 * lines in this fragment.
 */
957 static struct folio *
958 gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
960 struct inode *inode = iter->inode;
961 unsigned int blockmask = i_blocksize(inode) - 1;
962 struct gfs2_sbd *sdp = GFS2_SB(inode);
 /* Number of fs blocks spanned by [pos, pos + len). */
967 blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
968 status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
970 return ERR_PTR(status);
972 folio = iomap_get_folio(iter, pos, len);
/*
 * gfs2_iomap_put_folio - iomap folio_ops hook: journal data, end transaction
 * Adds the copied range's buffers to the transaction (jdata), marks the
 * inode dirty if the transaction created buffers, and ends the
 * transaction begun in gfs2_iomap_get_folio.  The folio unlock/put and
 * gfs2_trans_end calls are on missing lines in this fragment.
 */
978 static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
979 unsigned copied, struct folio *folio)
981 struct gfs2_trans *tr = current->journal_info;
982 struct gfs2_inode *ip = GFS2_I(inode);
983 struct gfs2_sbd *sdp = GFS2_SB(inode);
985 if (!gfs2_is_stuffed(ip))
986 gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos),
992 if (tr->tr_num_buf_new)
993 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
/* Folio hooks used by buffered iomap writes on stuffed/jdata inodes. */
998 static const struct iomap_folio_ops gfs2_iomap_folio_ops = {
999 .get_folio = gfs2_iomap_get_folio,
1000 .put_folio = gfs2_iomap_put_folio,
/*
 * gfs2_iomap_begin_write - prepare a write mapping, allocating if needed
 * @inode: the inode being written
 * @pos: byte offset of the write
 * @length: byte length of the write
 * @flags: iomap flags
 * @iomap: mapping produced by __gfs2_iomap_get, updated here
 * @mp: metapath positioned by __gfs2_iomap_get
 *
 * For writes into a hole (or that must unstuff the inode), reserves
 * quota and rgrp space, starts a transaction, unstuffs and/or allocates
 * blocks, then tears the transaction down again (folio ops will start
 * their own).  Error-path labels and several checks sit on missing lines.
 */
1003 static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
1004 loff_t length, unsigned flags,
1005 struct iomap *iomap,
1006 struct metapath *mp)
1008 struct gfs2_inode *ip = GFS2_I(inode);
1009 struct gfs2_sbd *sdp = GFS2_SB(inode);
 /* Writing past the inline capacity forces an unstuff. */
1013 unstuff = gfs2_is_stuffed(ip) &&
1014 pos + length > gfs2_max_stuffed_size(ip);
1016 if (unstuff || iomap->type == IOMAP_HOLE) {
1017 unsigned int data_blocks, ind_blocks;
1018 struct gfs2_alloc_parms ap = {};
1019 unsigned int rblocks;
1020 struct gfs2_trans *tr;
1022 gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
1024 ap.target = data_blocks + ind_blocks;
1025 ret = gfs2_quota_lock_check(ip, &ap);
1029 ret = gfs2_inplace_reserve(ip, &ap);
 /* Size the transaction: dinode + indirect (+ data for jdata). */
1033 rblocks = RES_DINODE + ind_blocks;
1034 if (gfs2_is_jdata(ip))
1035 rblocks += data_blocks;
1036 if (ind_blocks || data_blocks)
1037 rblocks += RES_STATFS + RES_QUOTA;
1038 if (inode == sdp->sd_rindex)
1039 rblocks += 2 * RES_STATFS;
1040 rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
1042 ret = gfs2_trans_begin(sdp, rblocks,
1043 iomap->length >> inode->i_blkbits);
1045 goto out_trans_fail;
1048 ret = gfs2_unstuff_dinode(ip);
 /* Re-run the lookup: unstuffing changed the metadata tree. */
1051 release_metapath(mp);
1052 ret = __gfs2_iomap_get(inode, iomap->offset,
1053 iomap->length, flags, iomap, mp);
1058 if (iomap->type == IOMAP_HOLE) {
1059 ret = __gfs2_iomap_alloc(inode, iomap, mp);
 /* Allocation failed: undo reservation, punch any partial alloc. */
1061 gfs2_trans_end(sdp);
1062 gfs2_inplace_release(ip);
1063 punch_hole(ip, iomap->offset, iomap->length);
1068 tr = current->journal_info;
1069 if (tr->tr_num_buf_new)
1070 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1072 gfs2_trans_end(sdp);
 /* Buffered writes to stuffed/jdata inodes need per-folio transactions. */
1075 if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
1076 iomap->folio_ops = &gfs2_iomap_folio_ops;
1080 gfs2_trans_end(sdp);
1082 gfs2_inplace_release(ip);
1084 gfs2_quota_unlock(ip);
/*
 * gfs2_iomap_begin - iomap_ops->iomap_begin for GFS2
 * Maps [pos, pos + length) and, for write/zero requests that hit a hole
 * or stuffed data, delegates allocation to gfs2_iomap_begin_write.
 * Several case labels and goto targets are on missing lines here.
 */
1088 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
1089 unsigned flags, struct iomap *iomap,
1090 struct iomap *srcmap)
1092 struct gfs2_inode *ip = GFS2_I(inode);
1093 struct metapath mp = { .mp_aheight = 1, };
1096 if (gfs2_is_jdata(ip))
1097 iomap->flags |= IOMAP_F_BUFFER_HEAD; /* jdata path uses buffer heads */
1099 trace_gfs2_iomap_start(ip, pos, length, flags);
1100 ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
1104 switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1106 if (flags & IOMAP_DIRECT) {
 
1108 * Silently fall back to buffered I/O for stuffed files
1109 * or if we've got a hole (see gfs2_file_direct_write).
 
1111 if (iomap->type != IOMAP_MAPPED)
1117 if (iomap->type == IOMAP_HOLE)
1124 ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
1127 release_metapath(&mp);
1128 trace_gfs2_iomap_end(ip, iomap, ret);
/*
 * gfs2_iomap_end - iomap_ops->iomap_end for GFS2
 * Completes a write mapping: releases reservations/quota, trims blocks
 * allocated beyond what was actually written, and dirties the inode.
 * The early-return goto targets for non-write cases are on missing lines.
 */
1132 static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
1133 ssize_t written, unsigned flags, struct iomap *iomap)
1135 struct gfs2_inode *ip = GFS2_I(inode);
1136 struct gfs2_sbd *sdp = GFS2_SB(inode);
1138 switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1140 if (flags & IOMAP_DIRECT)
1144 if (iomap->type == IOMAP_HOLE)
1151 if (!gfs2_is_stuffed(ip))
1152 gfs2_ordered_add_inode(ip);
1154 if (inode == sdp->sd_rindex)
1155 adjust_fs_space(inode); /* rindex writes change fs statistics */
1157 gfs2_inplace_release(ip);
1159 if (ip->i_qadata && ip->i_qadata->qa_qd_num)
1160 gfs2_quota_unlock(ip);
 /* Short write into freshly allocated space: free the excess. */
1162 if (length != written && (iomap->flags & IOMAP_F_NEW)) {
1163 /* Deallocate blocks that were just allocated. */
1164 loff_t hstart = round_up(pos + written, i_blocksize(inode));
1165 loff_t hend = iomap->offset + iomap->length;
1167 if (hstart < hend) {
1168 truncate_pagecache_range(inode, hstart, hend - 1);
1169 punch_hole(ip, hstart, hend - hstart);
1173 if (unlikely(!written))
1176 if (iomap->flags & IOMAP_F_SIZE_CHANGED)
1177 mark_inode_dirty(inode);
1178 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); /* glock must flush on demote */
/* iomap operations used for GFS2 buffered and direct I/O. */
1182 const struct iomap_ops gfs2_iomap_ops = {
1183 .iomap_begin = gfs2_iomap_begin,
1184 .iomap_end = gfs2_iomap_end,
1188 * gfs2_block_map - Map one or more blocks of an inode to a disk block
 * @inode: The inode
1190 * @lblock: The logical block number
1191 * @bh_map: The bh to be mapped
1192 * @create: True if its ok to alloc blocks to satify the request
 *
1194 * The size of the requested mapping is defined in bh_map->b_size.
 *
1196 * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
1197 * when @lblock is not mapped. Sets buffer_mapped(bh_map) and
1198 * bh_map->b_size to indicate the size of the mapping when @lblock and
1199 * successive blocks are mapped, up to the requested size.
 *
1201 * Sets buffer_boundary() if a read of metadata will be required
1202 * before the next block can be mapped. Sets buffer_new() if new
1203 * blocks were allocated.
 *
 * Returns: errno
 */
1208 int gfs2_block_map(struct inode *inode, sector_t lblock,
1209 struct buffer_head *bh_map, int create)
1211 struct gfs2_inode *ip = GFS2_I(inode);
1212 loff_t pos = (loff_t)lblock << inode->i_blkbits;
1213 loff_t length = bh_map->b_size;
1214 struct iomap iomap = { };
1217 clear_buffer_mapped(bh_map);
1218 clear_buffer_new(bh_map);
1219 clear_buffer_boundary(bh_map);
1220 trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
 /* !create -> lookup only; create -> allocate on hole. */
1223 ret = gfs2_iomap_get(inode, pos, length, &iomap);
1225 ret = gfs2_iomap_alloc(inode, pos, length, &iomap);
 /* Trim the iomap to the caller's bh size; a clipped extent no
  * longer ends at a metadata boundary. */
1229 if (iomap.length > bh_map->b_size) {
1230 iomap.length = bh_map->b_size;
1231 iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
1233 if (iomap.addr != IOMAP_NULL_ADDR)
1234 map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
1235 bh_map->b_size = iomap.length;
1236 if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
1237 set_buffer_boundary(bh_map);
1238 if (iomap.flags & IOMAP_F_NEW)
1239 set_buffer_new(bh_map);
1242 trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
/*
 * gfs2_get_extent - look up the disk extent at a logical block (no alloc)
 * @inode: the inode
 * @lblock: logical block to look up
 * @dblock: receives the disk block number (semantics for unmapped blocks
 *          are on missing lines -- confirm against full source)
 * @extlen: in: max extent length to consider; out: extent length found
 */
1246 int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
1247 unsigned int *extlen)
1249 unsigned int blkbits = inode->i_blkbits;
1250 struct iomap iomap = { };
1254 ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits,
1258 if (iomap.type != IOMAP_MAPPED)
1260 *dblock = iomap.addr >> blkbits;
1261 len = iomap.length >> blkbits;
/*
 * gfs2_alloc_extent - like gfs2_get_extent, but allocates missing blocks
 * @inode: the inode
 * @lblock: logical block to map
 * @dblock: receives the disk block number
 * @extlen: in: max extent length; out: extent length mapped
 * @new: set to true when the extent was freshly allocated
 */
1267 int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
1268 unsigned int *extlen, bool *new)
1270 unsigned int blkbits = inode->i_blkbits;
1271 struct iomap iomap = { };
1275 ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits,
1279 if (iomap.type != IOMAP_MAPPED)
1281 *dblock = iomap.addr >> blkbits;
1282 len = iomap.length >> blkbits;
1285 *new = iomap.flags & IOMAP_F_NEW;
1290 * NOTE: Never call gfs2_block_zero_range with an open transaction because it
1291 * uses iomap write to perform its actions, which begin their own transactions
1292 * (iomap_begin, get_folio, etc.)
 */
1294 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
1295 unsigned int length)
 /* Enforce the rule above: no transaction may be open on entry. */
1297 BUG_ON(current->journal_info);
1298 return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
/* Max revokes per truncate transaction; bounds each pagecache chunk. */
1301 #define GFS2_JTRUNC_REVOKES 8192
 
1304 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1305 * @inode: The inode being truncated
1306 * @oldsize: The original (larger) size
1307 * @newsize: The new smaller size
 *
1309 * With jdata files, we have to journal a revoke for each block which is
1310 * truncated. As a result, we need to split this into separate transactions
1311 * if the number of pages being truncated gets too large.
 */
1314 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1316 struct gfs2_sbd *sdp = GFS2_SB(inode);
1317 u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1321 while (oldsize != newsize) {
1322 struct gfs2_trans *tr;
1325 chunk = oldsize - newsize;
1326 if (chunk > max_chunk)
 /* Align each chunk so truncation lands on a page boundary. */
1329 offs = oldsize & ~PAGE_MASK;
1330 if (offs && chunk > PAGE_SIZE)
1331 chunk = offs + ((chunk - offs) & PAGE_MASK);
1333 truncate_pagecache(inode, oldsize - chunk);
 /* Cycle the transaction between chunks to bound journal usage. */
1336 tr = current->journal_info;
1337 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1340 gfs2_trans_end(sdp);
1341 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
/*
 * trunc_start - first phase of a shrinking truncate
 * @inode: the inode
 * @newsize: the new (smaller) size
 *
 * Zeroes the partial tail block, updates i_size/timestamps in the
 * dinode under a transaction, flags TRUNC_IN_PROG, and truncates the
 * page cache (chunked via gfs2_journaled_truncate for jdata).  Error
 * paths and some conditionals sit on missing lines in this fragment.
 */
1349 static int trunc_start(struct inode *inode, u64 newsize)
1351 struct gfs2_inode *ip = GFS2_I(inode);
1352 struct gfs2_sbd *sdp = GFS2_SB(inode);
1353 struct buffer_head *dibh = NULL;
1354 int journaled = gfs2_is_jdata(ip);
1355 u64 oldsize = inode->i_size;
 /* Zero from newsize to the end of its block (done before the
  * transaction: gfs2_block_zero_range starts its own). */
1358 if (!gfs2_is_stuffed(ip)) {
1359 unsigned int blocksize = i_blocksize(inode);
1360 unsigned int offs = newsize & (blocksize - 1);
1362 error = gfs2_block_zero_range(inode, newsize,
 /* jdata needs revoke headroom; ordinary inodes just the dinode. */
1369 error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1371 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1375 error = gfs2_meta_inode_buffer(ip, &dibh);
1379 gfs2_trans_add_meta(ip->i_gl, dibh);
1381 if (gfs2_is_stuffed(ip))
1382 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1384 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; /* resumable after crash */
1386 i_size_write(inode, newsize);
1387 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1388 gfs2_dinode_out(ip, dibh->b_data);
1391 error = gfs2_journaled_truncate(inode, oldsize, newsize);
1393 truncate_pagecache(inode, newsize);
1397 if (current->journal_info)
1398 gfs2_trans_end(sdp);
/*
 * gfs2_iomap_get - public lookup-only wrapper around __gfs2_iomap_get
 * Maps [pos, pos + length) without allocating; releases the metapath.
 */
1402 int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
1403 struct iomap *iomap)
1405 struct metapath mp = { .mp_aheight = 1, };
1408 ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp);
1409 release_metapath(&mp);
/*
 * gfs2_iomap_alloc - public wrapper that maps and allocates on a hole
 * Looks up [pos, pos + length) with IOMAP_WRITE semantics and, when the
 * result is a hole, allocates blocks for it; releases the metapath.
 */
1413 int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
1414 struct iomap *iomap)
1416 struct metapath mp = { .mp_aheight = 1, };
1419 ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1420 if (!ret && iomap->type == IOMAP_HOLE)
1421 ret = __gfs2_iomap_alloc(inode, iomap, &mp);
1422 release_metapath(&mp);
/*
 * NOTE(review): this extract is lossy — numerous lines (braces, error
 * checks like "if (ret) goto ...", labels, loop setup for bstart/blen)
 * are missing.  The comments below describe only what the visible lines
 * establish; confirm against the full file before relying on them.
 */
1427 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1429 * @rd_gh: holder of resource group glock
1430 * @bh: buffer head to sweep
1431 * @start: starting point in bh
1432 * @end: end point in bh
1433 * @meta: true if bh points to metadata (rather than data)
1434 * @btotal: place to keep count of total blocks freed
1436 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1437 * free, and free them all. However, we do it one rgrp at a time. If this
1438 * block has references to multiple rgrps, we break it into individual
1439 * transactions. This allows other processes to use the rgrps while we're
1440 * focused on a single one, for better concurrency / performance.
1441 * At every transaction boundary, we rewrite the inode into the journal.
1442 * That way the bitmaps are kept consistent with the inode and we can recover
1443 * if we're interrupted by power-outages.
1445 * Returns: 0, or return code if an error occurred.
1446 * *btotal has the total number of blocks freed
1448 static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1449 struct buffer_head *bh, __be64 *start, __be64 *end,
1450 bool meta, u32 *btotal)
1452 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1453 struct gfs2_rgrpd *rgd;
1454 struct gfs2_trans *tr;
1456 int blks_outside_rgrp;
1457 u64 bn, bstart, isize_blks;
1458 s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1460 bool buf_in_tr = false; /* buffer was added to transaction */
/* If the caller already holds an rgrp glock, reuse that rgrp. */
1464 if (gfs2_holder_initialized(rd_gh)) {
1465 rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1466 gfs2_assert_withdraw(sdp,
1467 gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1469 blks_outside_rgrp = 0;
/* Walk every big-endian block pointer in [start, end). */
1473 for (p = start; p < end; p++) {
1476 bn = be64_to_cpu(*p);
/* Pointers outside the current rgrp are counted and retried later. */
1479 if (!rgrp_contains_block(rgd, bn)) {
1480 blks_outside_rgrp++;
1484 rgd = gfs2_blk2rgrpd(sdp, bn, true);
1485 if (unlikely(!rgd)) {
1489 ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1490 LM_FLAG_NODE_SCOPE, rd_gh);
1494 /* Must be done with the rgrp glock held: */
1495 if (gfs2_rs_active(&ip->i_res) &&
1496 rgd == ip->i_res.rs_rgd)
1497 gfs2_rs_deltree(&ip->i_res);
1500 /* The size of our transactions will be unknown until we
1501 actually process all the metadata blocks that relate to
1502 the rgrp. So we estimate. We know it can't be more than
1503 the dinode's i_blocks and we don't want to exceed the
1504 journal flush threshold, sd_log_thresh2. */
1505 if (current->journal_info == NULL) {
1506 unsigned int jblocks_rqsted, revokes;
1508 jblocks_rqsted = rgd->rd_length + RES_DINODE +
1510 isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1511 if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1513 atomic_read(&sdp->sd_log_thresh2);
1515 jblocks_rqsted += isize_blks;
1516 revokes = jblocks_rqsted;
1518 revokes += end - start;
1519 else if (ip->i_depth)
1520 revokes += sdp->sd_inptrs;
1521 ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1524 down_write(&ip->i_rw_mutex);
1526 /* check if we will exceed the transaction blocks requested */
1527 tr = current->journal_info;
1528 if (tr->tr_num_buf_new + RES_STATFS +
1529 RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1530 /* We set blks_outside_rgrp to ensure the loop will
1531 be repeated for the same rgrp, but with a new
1533 blks_outside_rgrp++;
1534 /* This next part is tricky. If the buffer was added
1535 to the transaction, we've already set some block
1536 pointers to 0, so we better follow through and free
1537 them, or we will introduce corruption (so break).
1538 This may be impossible, or at least rare, but I
1539 decided to cover the case regardless.
1541 If the buffer was not added to the transaction
1542 (this call), doing so would exceed our transaction
1543 size, so we need to end the transaction and start a
1544 new one (so goto). */
1551 gfs2_trans_add_meta(ip->i_gl, bh);
/* Extent accumulation: contiguous runs are freed in one call. */
1554 if (bstart + blen == bn) {
1559 __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1561 gfs2_add_inode_blocks(&ip->i_inode, -blen);
1567 __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1569 gfs2_add_inode_blocks(&ip->i_inode, -blen);
1572 if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1573 outside the rgrp we just processed,
1574 do it all over again. */
1575 if (current->journal_info) {
1576 struct buffer_head *dibh;
1578 ret = gfs2_meta_inode_buffer(ip, &dibh);
1582 /* Every transaction boundary, we rewrite the dinode
1583 to keep its di_blocks current in case of failure. */
1584 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1585 gfs2_trans_add_meta(ip->i_gl, dibh);
1586 gfs2_dinode_out(ip, dibh->b_data);
1588 up_write(&ip->i_rw_mutex);
1589 gfs2_trans_end(sdp);
1592 gfs2_glock_dq_uninit(rd_gh);
/*
 * mp_eq_to_hgt - compare a metapath's index list against @list up to
 * (but excluding) height @h.  Presumably returns false on mismatch and
 * true otherwise — the return statements are missing from this extract.
 */
1600 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1602 if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
/* NOTE(review): lossy extract — the scan loop body, ptr advance and the
 * return statements are missing from the visible lines. */
1608 * find_nonnull_ptr - find a non-null pointer given a metapath and height
1609 * @sdp: The superblock
1610 * @mp: starting metapath
1611 * @h: desired height to search
1612 * @end_list: See punch_hole().
1613 * @end_aligned: See punch_hole().
1615 * Assumes the metapath is valid (with buffers) out to height h.
1616 * Returns: true if a non-null pointer was found in the metapath buffer
1617 * false if all remaining pointers are NULL in the buffer
1619 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1621 __u16 *end_list, unsigned int end_aligned)
1623 struct buffer_head *bh = mp->mp_bh[h];
1624 __be64 *first, *ptr, *end;
/* Scan from the current index at height h to the end of the buffer
 * (clipped at end_list[h] when this path coincides with the hole end). */
1626 first = metaptr1(h, mp);
1627 ptr = first + mp->mp_list[h];
1628 end = (__be64 *)(bh->b_data + bh->b_size);
1629 if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1630 bool keep_end = h < end_aligned;
1631 end = first + end_list[h] + keep_end;
1635 if (*ptr) { /* if we have a non-null pointer */
/* Record where the non-null pointer was found. */
1636 mp->mp_list[h] = ptr - first;
1638 if (h < GFS2_MAX_META_HEIGHT)
/* States of the punch_hole() deallocation state machine. */
1647 enum dealloc_states {
1648 DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */
1649 DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */
1650 DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. */
1651 DEALLOC_DONE = 3, /* process complete */
/*
 * metapointer_range - compute the [*start, *end) pointer range to sweep in
 * the buffer at @height, clipped to the hole's start/end index lists when
 * the current path coincides with them.  keep_start/keep_end skip one extra
 * pointer at heights that are not aligned (those pointers must be kept for
 * remaining data).  NOTE(review): lossy extract — the default assignment of
 * *start and a start_list NULL check may be among the missing lines.
 */
1655 metapointer_range(struct metapath *mp, int height,
1656 __u16 *start_list, unsigned int start_aligned,
1657 __u16 *end_list, unsigned int end_aligned,
1658 __be64 **start, __be64 **end)
1660 struct buffer_head *bh = mp->mp_bh[height];
1663 first = metaptr1(height, mp);
1665 if (mp_eq_to_hgt(mp, start_list, height)) {
1666 bool keep_start = height < start_aligned;
1667 *start = first + start_list[height] + keep_start;
1669 *end = (__be64 *)(bh->b_data + bh->b_size);
1670 if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1671 bool keep_end = height < end_aligned;
1672 *end = first + end_list[height] + keep_end;
/*
 * walk_done - has the metapath index at @height walked past the last
 * pointer to process?  The limit is end_list[height] (+1 if this height is
 * below end_aligned) when the path matches the hole end, otherwise the
 * pointer count for this height (sd_inptrs for indirect blocks, sd_diptrs
 * for the dinode at height 0).  NOTE(review): lossy extract — the end_list
 * NULL check and the declaration of "end" appear to be missing.
 */
1676 static inline bool walk_done(struct gfs2_sbd *sdp,
1677 struct metapath *mp, int height,
1678 __u16 *end_list, unsigned int end_aligned)
1683 bool keep_end = height < end_aligned;
1684 if (!mp_eq_to_hgt(mp, end_list, height))
1686 end = end_list[height] + keep_end;
1688 end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1689 return mp->mp_list[height] >= end;
/*
 * NOTE(review): lossy extract — many lines of this function (braces,
 * "if (ret) goto out;" checks, switch/case scaffolding, label lines,
 * variable declarations such as btotal/prev_bnr/state) are missing.
 * Comments below only annotate what the visible lines establish.
 */
1693 * punch_hole - deallocate blocks in a file
1694 * @ip: inode to truncate
1695 * @offset: the start of the hole
1696 * @length: the size of the hole (or 0 for truncate)
1698 * Punch a hole into a file or truncate a file at a given position. This
1699 * function operates in whole blocks (@offset and @length are rounded
1700 * accordingly); partially filled blocks must be cleared otherwise.
1702 * This function works from the bottom up, and from the right to the left. In
1703 * other words, it strips off the highest layer (data) before stripping any of
1704 * the metadata. Doing it this way is best in case the operation is interrupted
1705 * by power failure, etc. The dinode is rewritten in every transaction to
1706 * guarantee integrity.
1708 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1710 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1711 u64 maxsize = sdp->sd_heightsize[ip->i_height];
1712 struct metapath mp = {};
1713 struct buffer_head *dibh, *bh;
1714 struct gfs2_holder rd_gh;
1715 unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
/* First block fully inside the hole (round offset up to a block). */
1716 u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1717 __u16 start_list[GFS2_MAX_META_HEIGHT];
1718 __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1719 unsigned int start_aligned, end_aligned;
1720 unsigned int strip_h = ip->i_height - 1;
1723 int mp_h; /* metapath buffers are read in to this height */
1725 __be64 *start, *end;
1727 if (offset >= maxsize) {
1729 * The starting point lies beyond the allocated metadata;
1730 * there are no blocks to deallocate.
1736 * The start position of the hole is defined by lblock, start_list, and
1737 * start_aligned. The end position of the hole is defined by lend,
1738 * end_list, and end_aligned.
1740 * start_aligned and end_aligned define down to which height the start
1741 * and end positions are aligned to the metadata tree (i.e., the
1742 * position is a multiple of the metadata granularity at the height
1743 * above). This determines at which heights additional meta pointers
1744 * needs to be preserved for the remaining data.
/* length != 0: compute the end-of-hole metapath (truncate has no end). */
1748 u64 end_offset = offset + length;
1752 * Clip the end at the maximum file size for the given height:
1753 * that's how far the metadata goes; files bigger than that
1754 * will have additional layers of indirection.
1756 if (end_offset > maxsize)
1757 end_offset = maxsize;
1758 lend = end_offset >> bsize_shift;
1763 find_metapath(sdp, lend, &mp, ip->i_height);
1764 end_list = __end_list;
1765 memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
/* Determine end_aligned: deepest height with trailing zero indices. */
1767 for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
/* Build the start-of-hole metapath and start_aligned likewise. */
1774 find_metapath(sdp, lblock, &mp, ip->i_height);
1775 memcpy(start_list, mp.mp_list, sizeof(start_list));
1777 for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1778 if (start_list[mp_h])
1781 start_aligned = mp_h;
1783 ret = gfs2_meta_inode_buffer(ip, &dibh);
1788 ret = lookup_metapath(ip, &mp);
1792 /* issue read-ahead on metadata */
1793 for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1794 metapointer_range(&mp, mp_h, start_list, start_aligned,
1795 end_list, end_aligned, &start, &end);
1796 gfs2_metapath_ra(ip->i_gl, start, end);
1799 if (mp.mp_aheight == ip->i_height)
1800 state = DEALLOC_MP_FULL; /* We have a complete metapath */
1802 state = DEALLOC_FILL_MP; /* deal with partial metapath */
1804 ret = gfs2_rindex_update(sdp);
1808 ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1811 gfs2_holder_mark_uninitialized(&rd_gh);
/* Main state machine: alternate between stripping full metapaths,
 * lowering the strip height, and filling in partial metapaths. */
1815 while (state != DEALLOC_DONE) {
1817 /* Truncate a full metapath at the given strip height.
1818 * Note that strip_h == mp_h in order to be in this state. */
1819 case DEALLOC_MP_FULL:
1820 bh = mp.mp_bh[mp_h];
1821 gfs2_assert_withdraw(sdp, bh);
/* Guard against processing the same buffer twice in a row. */
1822 if (gfs2_assert_withdraw(sdp,
1823 prev_bnr != bh->b_blocknr)) {
1824 fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u,"
1825 "s_h:%u, mp_h:%u\n",
1826 (unsigned long long)ip->i_no_addr,
1827 prev_bnr, ip->i_height, strip_h, mp_h);
1829 prev_bnr = bh->b_blocknr;
1831 if (gfs2_metatype_check(sdp, bh,
1832 (mp_h ? GFS2_METATYPE_IN :
1833 GFS2_METATYPE_DI))) {
1839 * Below, passing end_aligned as 0 gives us the
1840 * metapointer range excluding the end point: the end
1841 * point is the first metapath we must not deallocate!
1844 metapointer_range(&mp, mp_h, start_list, start_aligned,
1845 end_list, 0 /* end_aligned */,
1847 ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1849 mp_h != ip->i_height - 1,
1852 /* If we hit an error or just swept dinode buffer,
1855 state = DEALLOC_DONE;
1858 state = DEALLOC_MP_LOWER;
1861 /* lower the metapath strip height */
1862 case DEALLOC_MP_LOWER:
1863 /* We're done with the current buffer, so release it,
1864 unless it's the dinode buffer. Then back up to the
1865 previous pointer. */
1867 brelse(mp.mp_bh[mp_h]);
1868 mp.mp_bh[mp_h] = NULL;
1870 /* If we can't get any lower in height, we've stripped
1871 off all we can. Next step is to back up and start
1872 stripping the previous level of metadata. */
1875 memcpy(mp.mp_list, start_list, sizeof(start_list));
1877 state = DEALLOC_FILL_MP;
1880 mp.mp_list[mp_h] = 0;
1881 mp_h--; /* search one metadata height down */
1883 if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1885 /* Here we've found a part of the metapath that is not
1886 * allocated. We need to search at that height for the
1887 * next non-null pointer. */
1888 if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1889 state = DEALLOC_FILL_MP;
1892 /* No more non-null pointers at this height. Back up
1893 to the previous height and try again. */
1894 break; /* loop around in the same state */
1896 /* Fill the metapath with buffers to the given height. */
1897 case DEALLOC_FILL_MP:
1898 /* Fill the buffers out to the current height. */
1899 ret = fillup_metapath(ip, &mp, mp_h);
1903 /* On the first pass, issue read-ahead on metadata. */
1904 if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1905 unsigned int height = mp.mp_aheight - 1;
1907 /* No read-ahead for data blocks. */
1908 if (mp.mp_aheight - 1 == strip_h)
1911 for (; height >= mp.mp_aheight - ret; height--) {
1912 metapointer_range(&mp, height,
1913 start_list, start_aligned,
1914 end_list, end_aligned,
1916 gfs2_metapath_ra(ip->i_gl, start, end);
1920 /* If buffers found for the entire strip height */
1921 if (mp.mp_aheight - 1 == strip_h) {
1922 state = DEALLOC_MP_FULL;
1925 if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1926 mp_h = mp.mp_aheight - 1;
1928 /* If we find a non-null block pointer, crawl a bit
1929 higher up in the metapath and try again, otherwise
1930 we need to look lower for a new starting point. */
1931 if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1934 state = DEALLOC_MP_LOWER;
/* Final accounting: update statfs/quota and rewrite the dinode once
 * more inside a closing transaction. */
1940 if (current->journal_info == NULL) {
1941 ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1945 down_write(&ip->i_rw_mutex);
1947 gfs2_statfs_change(sdp, 0, +btotal, 0);
1948 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1950 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1951 gfs2_trans_add_meta(ip->i_gl, dibh);
1952 gfs2_dinode_out(ip, dibh->b_data);
1953 up_write(&ip->i_rw_mutex);
1954 gfs2_trans_end(sdp);
/* Cleanup: drop rgrp glock, close any open transaction, release
 * quota hold and the metapath buffers. */
1958 if (gfs2_holder_initialized(&rd_gh))
1959 gfs2_glock_dq_uninit(&rd_gh);
1960 if (current->journal_info) {
1961 up_write(&ip->i_rw_mutex);
1962 gfs2_trans_end(sdp);
1965 gfs2_quota_unhold(ip);
1967 release_metapath(&mp);
/*
 * trunc_end - finish a truncate: clear TRUNC_IN_PROG, update timestamps,
 * and (for a now-empty file) reset height/goal and clear the dinode tail.
 * Runs under i_rw_mutex inside a RES_DINODE transaction.
 * NOTE(review): lossy extract — error checks, braces and the return are
 * missing from the visible lines.
 */
1971 static int trunc_end(struct gfs2_inode *ip)
1973 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1974 struct buffer_head *dibh;
1977 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1981 down_write(&ip->i_rw_mutex);
1983 error = gfs2_meta_inode_buffer(ip, &dibh);
1987 if (!i_size_read(&ip->i_inode)) {
/* Empty file: point the allocation goal back at the dinode itself. */
1989 ip->i_goal = ip->i_no_addr;
1990 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1991 gfs2_ordered_del_inode(ip);
1993 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1994 ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1996 gfs2_trans_add_meta(ip->i_gl, dibh);
1997 gfs2_dinode_out(ip, dibh->b_data);
2001 up_write(&ip->i_rw_mutex);
2002 gfs2_trans_end(sdp);
/* NOTE(review): lossy extract — @inode kernel-doc line, error checks and
 * return are missing from the visible lines. */
2007 * do_shrink - make a file smaller
2009 * @newsize: the size to make the file
2011 * Called with an exclusive lock on @inode. The @size must
2012 * be equal to or smaller than the current inode size.
2017 static int do_shrink(struct inode *inode, u64 newsize)
2019 struct gfs2_inode *ip = GFS2_I(inode);
/* Three phases: trim the tail block (trunc_start), deallocate whole
 * blocks past newsize (punch_hole with length 0 == truncate), finalize
 * (trunc_end).  Stuffed inodes skip the deallocation phase. */
2022 error = trunc_start(inode, newsize);
2025 if (gfs2_is_stuffed(ip))
2028 error = punch_hole(ip, newsize, 0);
2030 error = trunc_end(ip);
/* NOTE(review): lossy extract — the "unstuff" flag variable, error checks,
 * braces and labels between the visible goto targets are missing. */
2036 * do_grow - Touch and update inode size
2038 * @size: The new size
2040 * This function updates the timestamps on the inode and
2041 * may also increase the size of the inode. This function
2042 * must not be called with @size any smaller than the current
2045 * Although it is not strictly required to unstuff files here,
2046 * earlier versions of GFS2 have a bug in the stuffed file reading
2047 * code which will result in a buffer overrun if the size is larger
2048 * than the max stuffed file size. In order to prevent this from
2049 * occurring, such files are unstuffed, but in other cases we can
2050 * just update the inode size directly.
2052 * Returns: 0 on success, or -ve on error
2055 static int do_grow(struct inode *inode, u64 size)
2057 struct gfs2_inode *ip = GFS2_I(inode);
2058 struct gfs2_sbd *sdp = GFS2_SB(inode);
2059 struct gfs2_alloc_parms ap = { .target = 1, };
2060 struct buffer_head *dibh;
/* Growing past the stuffed limit requires a quota check and a one-block
 * reservation before the inode can be unstuffed. */
2064 if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2065 error = gfs2_quota_lock_check(ip, &ap);
2069 error = gfs2_inplace_reserve(ip, &ap);
2071 goto do_grow_qunlock;
2075 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2077 gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2078 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2081 goto do_grow_release;
2084 error = gfs2_unstuff_dinode(ip);
2089 error = gfs2_meta_inode_buffer(ip, &dibh);
2093 truncate_setsize(inode, size);
2094 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
2095 gfs2_trans_add_meta(ip->i_gl, dibh);
2096 gfs2_dinode_out(ip, dibh->b_data);
2100 gfs2_trans_end(sdp);
/* Unwind targets for the reservation/quota taken above. */
2103 gfs2_inplace_release(ip);
2105 gfs2_quota_unlock(ip);
/* NOTE(review): lossy extract — error checks, qa_put cleanup and return
 * are missing from the visible lines. */
2111 * gfs2_setattr_size - make a file a given size
2113 * @newsize: the size to make the file
2115 * The file size can grow, shrink, or stay the same size. This
2116 * is called holding i_rwsem and an exclusive glock on the inode
2122 int gfs2_setattr_size(struct inode *inode, u64 newsize)
2124 struct gfs2_inode *ip = GFS2_I(inode);
2127 BUG_ON(!S_ISREG(inode->i_mode));
2129 ret = inode_newsize_ok(inode, newsize);
/* Wait for in-flight direct I/O before changing the size. */
2133 inode_dio_wait(inode);
2135 ret = gfs2_qa_get(ip);
2139 if (newsize >= inode->i_size) {
2140 ret = do_grow(inode, newsize);
2144 ret = do_shrink(inode, newsize);
/*
 * gfs2_truncatei_resume - resume an interrupted truncate (TRUNC_IN_PROG):
 * deallocate everything past the current i_size, then run trunc_end.
 * NOTE(review): lossy extract — braces/error check/return are missing.
 */
2151 int gfs2_truncatei_resume(struct gfs2_inode *ip)
2154 error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2156 error = trunc_end(ip);
/*
 * gfs2_file_dealloc - free all of an inode's data/metadata blocks by
 * punching a hole covering the whole file (offset 0, length 0 == truncate).
 */
2160 int gfs2_file_dealloc(struct gfs2_inode *ip)
2162 return punch_hole(ip, 0, 0);
/* NOTE(review): lossy extract — @jd kernel-doc line, kfree(jext) and
 * braces are missing from the visible lines. */
2166 * gfs2_free_journal_extents - Free cached journal bmap info
2171 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2173 struct gfs2_journal_extent *jext;
/* Pop and free every cached extent on the journal's extent list. */
2175 while(!list_empty(&jd->extent_list)) {
2176 jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2177 list_del(&jext->list);
/* NOTE(review): lossy extract — the early "return 0" after the merge, the
 * NULL check after kzalloc and final return are missing. */
2183 * gfs2_add_jextent - Add or merge a new extent to extent cache
2184 * @jd: The journal descriptor
2185 * @lblock: The logical block at start of new extent
2186 * @dblock: The physical block at start of new extent
2187 * @blocks: Size of extent in fs blocks
2189 * Returns: 0 on success or -ENOMEM
2192 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2194 struct gfs2_journal_extent *jext;
/* If the new extent is physically contiguous with the last cached one,
 * just extend it instead of allocating a new entry. */
2196 if (!list_empty(&jd->extent_list)) {
2197 jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2198 if ((jext->dblock + jext->blocks) == dblock) {
2199 jext->blocks += blocks;
2204 jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2207 jext->dblock = dblock;
2208 jext->lblock = lblock;
2209 jext->blocks = blocks;
2210 list_add_tail(&jext->list, &jd->extent_list);
/* NOTE(review): lossy extract — loop scaffolding (do/while over size),
 * declarations of lblock/size/rc/start/end, the fail label and return are
 * missing from the visible lines. */
2216 * gfs2_map_journal_extents - Cache journal bmap info
2217 * @sdp: The super block
2218 * @jd: The journal to map
2220 * Create a reusable "extent" mapping from all logical
2221 * blocks to all physical blocks for the given journal. This will save
2222 * us time when writing journal blocks. Most journals will have only one
2223 * extent that maps all their logical blocks. That's because gfs2.mkfs
2224 * arranges the journal blocks sequentially to maximize performance.
2225 * So the extent would map the first block for the entire file length.
2226 * However, gfs2_jadd can happen while file activity is happening, so
2227 * those journals may not be sequential. Less likely is the case where
2228 * the users created their own journals by mounting the metafs and
2229 * laying it out. But it's still possible. These journals might have
2232 * Returns: 0 on success, or error on failure
2235 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2239 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2240 struct buffer_head bh;
2241 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
/* Time the mapping pass so it can be reported below. */
2246 start = ktime_get();
2247 lblock_stop = i_size_read(jd->jd_inode) >> shift;
2248 size = (lblock_stop - lblock) << shift;
2250 WARN_ON(!list_empty(&jd->extent_list));
/* Map each logical range via gfs2_block_map and cache it as an extent;
 * bh.b_size on return covers the whole contiguous mapping. */
2256 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2257 if (rc || !buffer_mapped(&bh))
2259 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2263 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2267 fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2268 jd->nr_extents, ktime_ms_delta(end, start));
/* Failure path: report where mapping stopped and drop partial cache. */
2272 fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2274 (unsigned long long)(i_size_read(jd->jd_inode) - size),
2276 fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2277 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2278 bh.b_state, (unsigned long long)bh.b_size);
2279 gfs2_free_journal_extents(jd);
/* NOTE(review): lossy extract — the len==0 early return, the do/while loop
 * around the gfs2_block_map calls, and the final return are missing. */
2284 * gfs2_write_alloc_required - figure out if a write will require an allocation
2285 * @ip: the file being written to
2286 * @offset: the offset to write to
2287 * @len: the number of bytes being written
2289 * Returns: 1 if an alloc is required, 0 otherwise
2292 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2295 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2296 struct buffer_head bh;
2298 u64 lblock, lblock_stop, size;
/* Stuffed inode: allocation is needed only if the write would exceed
 * the maximum stuffed size. */
2304 if (gfs2_is_stuffed(ip)) {
2305 if (offset + len > gfs2_max_stuffed_size(ip))
2310 shift = sdp->sd_sb.sb_bsize_shift;
2311 BUG_ON(gfs2_is_dir(ip));
2312 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2313 lblock = offset >> shift;
2314 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
/* Writes past EOF always allocate, except for the rindex inode. */
2315 if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2318 size = (lblock_stop - lblock) << shift;
/* Probe each range with a no-create block map; an unmapped buffer
 * means allocation is required. */
2322 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2323 if (!buffer_mapped(&bh))
2326 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
/*
 * stuffed_zero_range - zero a byte range of a stuffed (inline) inode by
 * clearing it directly in the dinode buffer, within the current transaction.
 * The range is clipped to i_size; offsets at/after i_size are a no-op.
 * NOTE(review): lossy extract — error checks, brelse and return missing.
 */
2332 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2334 struct gfs2_inode *ip = GFS2_I(inode);
2335 struct buffer_head *dibh;
2338 if (offset >= inode->i_size)
2340 if (offset + length > inode->i_size)
2341 length = inode->i_size - offset;
2343 error = gfs2_meta_inode_buffer(ip, &dibh);
2346 gfs2_trans_add_meta(ip->i_gl, dibh);
/* Inline data lives right after the on-disk dinode header. */
2347 memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
/*
 * gfs2_journaled_truncate_range - truncate the page cache over a range of a
 * jdata file in chunks bounded by GFS2_JTRUNC_REVOKES, restarting the
 * transaction between chunks so revoke space is never exhausted.
 * NOTE(review): lossy extract — the outer while loop, chunk computation,
 * offset/length advance and return are missing from the visible lines.
 */
2353 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2356 struct gfs2_sbd *sdp = GFS2_SB(inode);
2357 loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2361 struct gfs2_trans *tr;
2366 if (chunk > max_chunk)
/* Align each chunk to page boundaries after the first partial page. */
2369 offs = offset & ~PAGE_MASK;
2370 if (offs && chunk > PAGE_SIZE)
2371 chunk = offs + ((chunk - offs) & PAGE_MASK);
2373 truncate_pagecache_range(inode, offset, chunk);
/* Only restart the transaction when it actually touched something. */
2377 tr = current->journal_info;
2378 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2381 gfs2_trans_end(sdp);
2382 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
/*
 * __gfs2_punch_hole - fallocate(PUNCH_HOLE) implementation: zero the
 * partial blocks at each end, drop the page cache over the range, then
 * deallocate the whole blocks in between via punch_hole().
 * NOTE(review): lossy extract — error checks, "out" label, the start/end
 * declarations and the final return are missing from the visible lines.
 */
2389 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2391 struct inode *inode = file_inode(file);
2392 struct gfs2_inode *ip = GFS2_I(inode);
2393 struct gfs2_sbd *sdp = GFS2_SB(inode);
2394 unsigned int blocksize = i_blocksize(inode);
/* Unstuffed files: zero the sub-block head and tail of the hole. */
2398 if (!gfs2_is_stuffed(ip)) {
2399 unsigned int start_off, end_len;
2401 start_off = offset & (blocksize - 1);
2402 end_len = (offset + length) & (blocksize - 1);
2404 unsigned int len = length;
2405 if (length > blocksize - start_off)
2406 len = blocksize - start_off;
2407 error = gfs2_block_zero_range(inode, offset, len);
2410 if (start_off + length < blocksize)
2414 error = gfs2_block_zero_range(inode,
2415 offset + length - end_len, end_len);
/* Flush dirty pages over the (block-rounded) range before truncating. */
2421 start = round_down(offset, blocksize);
2422 end = round_up(offset + length, blocksize) - 1;
2423 error = filemap_write_and_wait_range(inode->i_mapping, start, end);
2427 if (gfs2_is_jdata(ip))
2428 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2429 GFS2_JTRUNC_REVOKES);
2431 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2435 if (gfs2_is_stuffed(ip)) {
2436 error = stuffed_zero_range(inode, offset, length);
2441 if (gfs2_is_jdata(ip)) {
2442 BUG_ON(!current->journal_info);
2443 gfs2_journaled_truncate_range(inode, offset, length);
2445 truncate_pagecache_range(inode, offset, offset + length - 1);
2447 file_update_time(file);
2448 mark_inode_dirty(inode);
2450 if (current->journal_info)
2451 gfs2_trans_end(sdp);
2453 if (!gfs2_is_stuffed(ip))
2454 error = punch_hole(ip, offset, length);
2457 if (current->journal_info)
2458 gfs2_trans_end(sdp);
/*
 * gfs2_map_blocks - iomap writeback ->map_blocks hook.  Reuses the cached
 * mapping in wpc->iomap when @offset still falls inside it; otherwise
 * resets the cache and looks up a new mapping via gfs2_iomap_get.
 * Stuffed inodes should never reach writeback (WARN + error).
 * NOTE(review): lossy extract — the ret declaration and return missing.
 */
2462 static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
2467 if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
2470 if (offset >= wpc->iomap.offset &&
2471 offset < wpc->iomap.offset + wpc->iomap.length)
2474 memset(&wpc->iomap, 0, sizeof(wpc->iomap));
2475 ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
2479 const struct iomap_writeback_ops gfs2_writeback_ops = {
2480 .map_blocks = gfs2_map_blocks,