fs/gfs2/bmap.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
   4  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
   5  */
   6
   7 #include <linux/spinlock.h>
   8 #include <linux/completion.h>
   9 #include <linux/buffer_head.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/gfs2_ondisk.h>
  12 #include <linux/crc32.h>
  13 #include <linux/iomap.h>
  14 #include <linux/ktime.h>
  15
  16 #include "gfs2.h"
  17 #include "incore.h"
  18 #include "bmap.h"
  19 #include "glock.h"
  20 #include "inode.h"
  21 #include "meta_io.h"
  22 #include "quota.h"
  23 #include "rgrp.h"
  24 #include "log.h"
  25 #include "super.h"
  26 #include "trans.h"
  27 #include "dir.h"
  28 #include "util.h"
  29 #include "aops.h"
  30 #include "trace_gfs2.h"
  31
  32 /* This doesn't need to be that large as max 64 bit pointers in a 4k
  33  * block is 512, so __u16 is fine for that. It saves stack space to
  34  * keep it small.
  35  */
  36 struct metapath {
  37         struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
  38         __u16 mp_list[GFS2_MAX_META_HEIGHT];
  39         int mp_fheight; /* find_metapath height */
  40         int mp_aheight; /* actual height (lookup height) */
  41 };
  42
  43 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
  44
  45 /**
  46  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
  47  * @ip: the inode
  48  * @dibh: the dinode buffer
  49  * @block: the block number that was allocated
  50  * @page: The (optional) page. This is looked up if @page is NULL
  51  *
  52  * Returns: errno
  53  */
  54
  55 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
  56                                u64 block, struct page *page)
  57 {
  58         struct inode *inode = &ip->i_inode;
  59
  60         if (!PageUptodate(page)) {
  61                 void *kaddr = kmap(page);
  62                 u64 dsize = i_size_read(inode);
  63
  64                 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
  65                 memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
  66                 kunmap(page);
  67
  68                 SetPageUptodate(page);
  69         }
  70
  71         if (gfs2_is_jdata(ip)) {
  72                 struct buffer_head *bh;
  73
  74                 if (!page_has_buffers(page))
  75                         create_empty_buffers(page, BIT(inode->i_blkbits),
  76                                              BIT(BH_Uptodate));
  77
  78                 bh = page_buffers(page);
  79                 if (!buffer_mapped(bh))
  80                         map_bh(bh, inode->i_sb, block);
  81
  82                 set_buffer_uptodate(bh);
  83                 gfs2_trans_add_data(ip->i_gl, bh);
  84         } else {
  85                 set_page_dirty(page);
  86                 gfs2_ordered_add_inode(ip);
  87         }
  88
  89         return 0;
  90 }
  91
  92 static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
  93 {
  94         struct buffer_head *bh, *dibh;
  95         struct gfs2_dinode *di;
  96         u64 block = 0;
  97         int isdir = gfs2_is_dir(ip);
  98         int error;
  99
 100         error = gfs2_meta_inode_buffer(ip, &dibh);
 101         if (error)
 102                 return error;
 103
 104         if (i_size_read(&ip->i_inode)) {
 105                 /* Get a free block, fill it with the stuffed data,
 106                    and write it out to disk */
 107
 108                 unsigned int n = 1;
 109                 error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
 110                 if (error)
 111                         goto out_brelse;
 112                 if (isdir) {
 113                         gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
 114                         error = gfs2_dir_get_new_buffer(ip, block, &bh);
 115                         if (error)
 116                                 goto out_brelse;
 117                         gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
 118                                               dibh, sizeof(struct gfs2_dinode));
 119                         brelse(bh);
 120                 } else {
 121                         error = gfs2_unstuffer_page(ip, dibh, block, page);
 122                         if (error)
 123                                 goto out_brelse;
 124                 }
 125         }
 126
 127         /*  Set up the pointer to the new block  */
 128
 129         gfs2_trans_add_meta(ip->i_gl, dibh);
 130         di = (struct gfs2_dinode *)dibh->b_data;
 131         gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 132
 133         if (i_size_read(&ip->i_inode)) {
 134                 *(__be64 *)(di + 1) = cpu_to_be64(block);
 135                 gfs2_add_inode_blocks(&ip->i_inode, 1);
 136                 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 137         }
 138
 139         ip->i_height = 1;
 140         di->di_height = cpu_to_be16(1);
 141
 142 out_brelse:
 143         brelse(dibh);
 144         return error;
 145 }
 146
 147 /**
 148  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 149  * @ip: The GFS2 inode to unstuff
 150  *
 151  * This routine unstuffs a dinode and returns it to a "normal" state such
 152  * that the height can be grown in the traditional way.
 153  *
 154  * Returns: errno
 155  */
 156
 157 int gfs2_unstuff_dinode(struct gfs2_inode *ip)
 158 {
 159         struct inode *inode = &ip->i_inode;
 160         struct page *page;
 161         int error;
 162
 163         down_write(&ip->i_rw_mutex);
 164         page = grab_cache_page(inode->i_mapping, 0);
 165         error = -ENOMEM;
 166         if (!page)
 167                 goto out;
 168         error = __gfs2_unstuff_inode(ip, page);
 169         unlock_page(page);
 170         put_page(page);
 171 out:
 172         up_write(&ip->i_rw_mutex);
 173         return error;
 174 }
 175
 176 /**
 177  * find_metapath - Find path through the metadata tree
 178  * @sdp: The superblock
 179  * @block: The disk block to look up
 180  * @mp: The metapath to return the result in
 181  * @height: The pre-calculated height of the metadata tree
 182  *
 183  *   This routine returns a struct metapath structure that defines a path
 184  *   through the metadata of inode "ip" to get to block "block".
 185  *
 186  *   Example:
 187  *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
 188  *   filesystem with a blocksize of 4096.
 189  *
 190  *   find_metapath() would return a struct metapath structure set to:
 191  *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 192  *
 193  *   That means that in order to get to the block containing the byte at
 194  *   offset 101342453, we would load the indirect block pointed to by pointer
 195  *   0 in the dinode.  We would then load the indirect block pointed to by
 196  *   pointer 48 in that indirect block.  We would then load the data block
 197  *   pointed to by pointer 165 in that indirect block.
 198  *
 199  *             ----------------------------------------
 200  *             | Dinode |                             |
 201  *             |        |                            4|
 202  *             |        |0 1 2 3 4 5                 9|
 203  *             |        |                            6|
 204  *             ----------------------------------------
 205  *                       |
 206  *                       |
 207  *                       V
 208  *             ----------------------------------------
 209  *             | Indirect Block                       |
 210  *             |                                     5|
 211  *             |            4 4 4 4 4 5 5            1|
 212  *             |0           5 6 7 8 9 0 1            2|
 213  *             ----------------------------------------
 214  *                                |
 215  *                                |
 216  *                                V
 217  *             ----------------------------------------
 218  *             | Indirect Block                       |
 219  *             |                         1 1 1 1 1   5|
 220  *             |                         6 6 6 6 6   1|
 221  *             |0                        3 4 5 6 7   2|
 222  *             ----------------------------------------
 223  *                                           |
 224  *                                           |
 225  *                                           V
 226  *             ----------------------------------------
 227  *             | Data block containing offset         |
 228  *             |            101342453                 |
 229  *             |                                      |
 230  *             |                                      |
 231  *             ----------------------------------------
 232  *
 233  */
 234
 235 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
 236                           struct metapath *mp, unsigned int height)
 237 {
 238         unsigned int i;
 239
 240         mp->mp_fheight = height;
 241         for (i = height; i--;)
 242                 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
 243 }
 244
 245 static inline unsigned int metapath_branch_start(const struct metapath *mp)
 246 {
 247         if (mp->mp_list[0] == 0)
 248                 return 2;
 249         return 1;
 250 }
 251
 252 /**
 253  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 254  * @height: The metadata height (0 = dinode)
 255  * @mp: The metapath
 256  */
 257 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
 258 {
 259         struct buffer_head *bh = mp->mp_bh[height];
 260         if (height == 0)
 261                 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
 262         return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
 263 }
 264
 265 /**
 266  * metapointer - Return pointer to start of metadata in a buffer
 267  * @height: The metadata height (0 = dinode)
 268  * @mp: The metapath
 269  *
 270  * Return a pointer to the block number of the next height of the metadata
 271  * tree given a buffer containing the pointer to the current height of the
 272  * metadata tree.
 273  */
 274
 275 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 276 {
 277         __be64 *p = metaptr1(height, mp);
 278         return p + mp->mp_list[height];
 279 }
 280
 281 static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
 282 {
 283         const struct buffer_head *bh = mp->mp_bh[height];
 284         return (const __be64 *)(bh->b_data + bh->b_size);
 285 }
 286
 287 static void clone_metapath(struct metapath *clone, struct metapath *mp)
 288 {
 289         unsigned int hgt;
 290
 291         *clone = *mp;
 292         for (hgt = 0; hgt < mp->mp_aheight; hgt++)
 293                 get_bh(clone->mp_bh[hgt]);
 294 }
 295
 296 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
 297 {
 298         const __be64 *t;
 299
 300         for (t = start; t < end; t++) {
 301                 struct buffer_head *rabh;
 302
 303                 if (!*t)
 304                         continue;
 305
 306                 rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
 307                 if (trylock_buffer(rabh)) {
 308                         if (!buffer_uptodate(rabh)) {
 309                                 rabh->b_end_io = end_buffer_read_sync;
 310                                 submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
 311                                           REQ_PRIO, rabh);
 312                                 continue;
 313                         }
 314                         unlock_buffer(rabh);
 315                 }
 316                 brelse(rabh);
 317         }
 318 }
 319
 320 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
 321                              unsigned int x, unsigned int h)
 322 {
 323         for (; x < h; x++) {
 324                 __be64 *ptr = metapointer(x, mp);
 325                 u64 dblock = be64_to_cpu(*ptr);
 326                 int ret;
 327
 328                 if (!dblock)
 329                         break;
 330                 ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]);
 331                 if (ret)
 332                         return ret;
 333         }
 334         mp->mp_aheight = x + 1;
 335         return 0;
 336 }
 337
 338 /**
 339  * lookup_metapath - Walk the metadata tree to a specific point
 340  * @ip: The inode
 341  * @mp: The metapath
 342  *
 343  * Assumes that the inode's buffer has already been looked up and
 344  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 345  * by find_metapath().
 346  *
 347  * If this function encounters part of the tree which has not been
 348  * allocated, it returns the current height of the tree at the point
 349  * at which it found the unallocated block. Blocks which are found are
 350  * added to the mp->mp_bh[] list.
 351  *
 352  * Returns: error
 353  */
 354
 355 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 356 {
 357         return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
 358 }
 359
 360 /**
 361  * fillup_metapath - fill up buffers for the metadata path to a specific height
 362  * @ip: The inode
 363  * @mp: The metapath
 364  * @h: The height to which it should be mapped
 365  *
 366  * Similar to lookup_metapath, but does lookups for a range of heights
 367  *
 368  * Returns: error or the number of buffers filled
 369  */
 370
 371 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 372 {
 373         unsigned int x = 0;
 374         int ret;
 375
 376         if (h) {
 377                 /* find the first buffer we need to look up. */
 378                 for (x = h - 1; x > 0; x--) {
 379                         if (mp->mp_bh[x])
 380                                 break;
 381                 }
 382         }
 383         ret = __fillup_metapath(ip, mp, x, h);
 384         if (ret)
 385                 return ret;
 386         return mp->mp_aheight - x - 1;
 387 }
 388
 389 static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
 390 {
 391         sector_t factor = 1, block = 0;
 392         int hgt;
 393
 394         for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
 395                 if (hgt < mp->mp_aheight)
 396                         block += mp->mp_list[hgt] * factor;
 397                 factor *= sdp->sd_inptrs;
 398         }
 399         return block;
 400 }
 401
 402 static void release_metapath(struct metapath *mp)
 403 {
 404         int i;
 405
 406         for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
 407                 if (mp->mp_bh[i] == NULL)
 408                         break;
 409                 brelse(mp->mp_bh[i]);
 410                 mp->mp_bh[i] = NULL;
 411         }
 412 }
 413
 414 /**
 415  * gfs2_extent_length - Returns length of an extent of blocks
 416  * @bh: The metadata block
 417  * @ptr: Current position in @bh
 418  * @limit: Max extent length to return
 419  * @eob: Set to 1 if we hit "end of block"
 420  *
 421  * Returns: The length of the extent (minimum of one block)
 422  */
 423
 424 static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
 425 {
 426         const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
 427         const __be64 *first = ptr;
 428         u64 d = be64_to_cpu(*ptr);
 429
 430         *eob = 0;
 431         do {
 432                 ptr++;
 433                 if (ptr >= end)
 434                         break;
 435                 d++;
 436         } while(be64_to_cpu(*ptr) == d);
 437         if (ptr >= end)
 438                 *eob = 1;
 439         return ptr - first;
 440 }
 441
 442 enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };
 443
 444 /*
 445  * gfs2_metadata_walker - walk an indirect block
 446  * @mp: Metapath to indirect block
 447  * @ptrs: Number of pointers to look at
 448  *
 449  * When returning WALK_FOLLOW, the walker must update @mp to point at the right
 450  * indirect block to follow.
 451  */
 452 typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
 453                                                    unsigned int ptrs);
 454
 455 /*
 456  * gfs2_walk_metadata - walk a tree of indirect blocks
 457  * @inode: The inode
 458  * @mp: Starting point of walk
 459  * @max_len: Maximum number of blocks to walk
 460  * @walker: Called during the walk
 461  *
 462  * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
 463  * past the end of metadata, and a negative error code otherwise.
 464  */
 465
  466 static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
  467                 u64 max_len, gfs2_metadata_walker walker)
  468 {
  469         struct gfs2_inode *ip = GFS2_I(inode);
  470         struct gfs2_sbd *sdp = GFS2_SB(inode);
              /* Number of blocks covered by one pointer at height hgt. */
  471         u64 factor = 1;
  472         unsigned int hgt;
  473         int ret;
  474
  475         /*
  476          * The walk starts in the lowest allocated indirect block, which may be
  477          * before the position indicated by @mp.  Adjust @max_len accordingly
  478          * to avoid a short walk.
  479          */
  480         for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
  481                 max_len += mp->mp_list[hgt] * factor;
  482                 mp->mp_list[hgt] = 0;
  483                 factor *= sdp->sd_inptrs;
  484         }
              /*
               * Given mp_fheight >= mp_aheight, hgt is now mp_aheight - 1: the
               * lowest height that @mp has a position for.
               */
  485
  486         for (;;) {
  487                 u16 start = mp->mp_list[hgt];
  488                 enum walker_status status;
  489                 unsigned int ptrs;
  490                 u64 len;
  491
  492                 /* Walk indirect block. */
                      /*
                       * Pointers left in this block from @start onwards: sd_inptrs
                       * slots at heights >= 1, sd_diptrs slots at height 0.
                       */
  493                 ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
  494                 len = ptrs * factor;
                      /* Do not let the walker look beyond the remaining max_len. */
  495                 if (len > max_len)
  496                         ptrs = DIV_ROUND_UP_ULL(max_len, factor);
  497                 status = walker(mp, ptrs);
  498                 switch (status) {
  499                 case WALK_STOP:
  500                         return 1;
  501                 case WALK_FOLLOW:
                              /* There is nothing below the bottom height to follow. */
  502                         BUG_ON(mp->mp_aheight == mp->mp_fheight);
                              /*
                               * The walker advanced mp_list[hgt]; only the pointers
                               * it skipped count as walked.
                               */
  503                         ptrs = mp->mp_list[hgt] - start;
  504                         len = ptrs * factor;
  505                         break;
  506                 case WALK_CONTINUE:
  507                         break;
  508                 }
                      /* max_len used up: leave the loop and return 0. */
  509                 if (len >= max_len)
  510                         break;
  511                 max_len -= len;
  512                 if (status == WALK_FOLLOW)
  513                         goto fill_up_metapath;
  514
  515 lower_metapath:
  516                 /* Decrease height of metapath. */
  517                 brelse(mp->mp_bh[hgt]);
  518                 mp->mp_bh[hgt] = NULL;
  519                 mp->mp_list[hgt] = 0;
                      /* The dinode itself was just finished: end of metadata. */
  520                 if (!hgt)
  521                         break;
  522                 hgt--;
  523                 factor *= sdp->sd_inptrs;
  524
  525                 /* Advance in metadata tree. */
  526                 (mp->mp_list[hgt])++;
  527                 if (hgt) {
                              /* Ran off the end of this indirect block: go up again. */
  528                         if (mp->mp_list[hgt] >= sdp->sd_inptrs)
  529                                 goto lower_metapath;
  530                 } else {
                              /* Ran off the end of the dinode's pointers: done. */
  531                         if (mp->mp_list[hgt] >= sdp->sd_diptrs)
  532                                 break;
  533                 }
  534
  535 fill_up_metapath:
  536                 /* Increase height of metapath. */
  537                 ret = fillup_metapath(ip, mp, ip->i_height - 1);
  538                 if (ret < 0)
  539                         return ret;
                      /*
                       * ret is the number of additional heights that were read;
                       * factor shrinks by sd_inptrs for each of them.
                       */
  540                 hgt += ret;
  541                 for (; ret; ret--)
  542                         do_div(factor, sdp->sd_inptrs);
  543                 mp->mp_aheight = hgt + 1;
  544         }
  545         return 0;
  546 }
 547
 548 static enum walker_status gfs2_hole_walker(struct metapath *mp,
 549                                            unsigned int ptrs)
 550 {
 551         const __be64 *start, *ptr, *end;
 552         unsigned int hgt;
 553
 554         hgt = mp->mp_aheight - 1;
 555         start = metapointer(hgt, mp);
 556         end = start + ptrs;
 557
 558         for (ptr = start; ptr < end; ptr++) {
 559                 if (*ptr) {
 560                         mp->mp_list[hgt] += ptr - start;
 561                         if (mp->mp_aheight == mp->mp_fheight)
 562                                 return WALK_STOP;
 563                         return WALK_FOLLOW;
 564                 }
 565         }
 566         return WALK_CONTINUE;
 567 }
 568
 569 /**
 570  * gfs2_hole_size - figure out the size of a hole
 571  * @inode: The inode
 572  * @lblock: The logical starting block number
 573  * @len: How far to look (in blocks)
 574  * @mp: The metapath at lblock
 575  * @iomap: The iomap to store the hole size in
 576  *
 577  * This function modifies @mp.
 578  *
 579  * Returns: errno on error
 580  */
 581 static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
 582                           struct metapath *mp, struct iomap *iomap)
 583 {
 584         struct metapath clone;
 585         u64 hole_size;
 586         int ret;
 587
 588         clone_metapath(&clone, mp);
 589         ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
 590         if (ret < 0)
 591                 goto out;
 592
 593         if (ret == 1)
 594                 hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
 595         else
 596                 hole_size = len;
 597         iomap->length = hole_size << inode->i_blkbits;
 598         ret = 0;
 599
 600 out:
 601         release_metapath(&clone);
 602         return ret;
 603 }
 604
 605 static inline void gfs2_indirect_init(struct metapath *mp,
 606                                       struct gfs2_glock *gl, unsigned int i,
 607                                       unsigned offset, u64 bn)
 608 {
 609         __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
 610                        ((i > 1) ? sizeof(struct gfs2_meta_header) :
 611                                  sizeof(struct gfs2_dinode)));
 612         BUG_ON(i < 1);
 613         BUG_ON(mp->mp_bh[i] != NULL);
 614         mp->mp_bh[i] = gfs2_meta_new(gl, bn);
 615         gfs2_trans_add_meta(gl, mp->mp_bh[i]);
 616         gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 617         gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
 618         ptr += offset;
 619         *ptr = cpu_to_be64(bn);
 620 }
 621
 622 enum alloc_state {
 623         ALLOC_DATA = 0,
 624         ALLOC_GROW_DEPTH = 1,
 625         ALLOC_GROW_HEIGHT = 2,
 626         /* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
 627 };
 628
 629 /**
 630  * __gfs2_iomap_alloc - Build a metadata tree of the requested height
 631  * @inode: The GFS2 inode
 632  * @iomap: The iomap structure
 633  * @mp: The metapath, with proper height information calculated
 634  *
 635  * In this routine we may have to alloc:
 636  *   i) Indirect blocks to grow the metadata tree height
 637  *  ii) Indirect blocks to fill in lower part of the metadata tree
 638  * iii) Data blocks
 639  *
 640  * This function is called after __gfs2_iomap_get, which works out the
 641  * total number of blocks which we need via gfs2_alloc_size.
 642  *
 643  * We then do the actual allocation asking for an extent at a time (if
 644  * enough contiguous free blocks are available, there will only be one
 645  * allocation request per call) and uses the state machine to initialise
 646  * the blocks in order.
 647  *
 648  * Right now, this function will allocate at most one indirect block
 649  * worth of data -- with a default block size of 4K, that's slightly
 650  * less than 2M.  If this limitation is ever removed to allow huge
 651  * allocations, we would probably still want to limit the iomap size we
 652  * return to avoid stalling other tasks during huge writes; the next
 653  * iomap iteration would then find the blocks already allocated.
 654  *
 655  * Returns: errno on error
 656  */
 657
  658 static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
  659                               struct metapath *mp)
  660 {
  661         struct gfs2_inode *ip = GFS2_I(inode);
  662         struct gfs2_sbd *sdp = GFS2_SB(inode);
  663         struct buffer_head *dibh = mp->mp_bh[0];
  664         u64 bn;
              /*
               * n: blocks obtained by the current gfs2_alloc_blocks() call that
               *    are still unassigned; i: height of the next indirect block to
               *    set up; alloced: running total; iblks: indirect blocks needed.
               */
  665         unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
              /* Number of data blocks requested through iomap->length. */
  666         size_t dblks = iomap->length >> inode->i_blkbits;
              /* Height of the block that holds the data block pointers. */
  667         const unsigned end_of_metadata = mp->mp_fheight - 1;
  668         int ret;
  669         enum alloc_state state;
  670         __be64 *ptr;
              /* Saved copy of the dinode's first pointer, see ALLOC_GROW_HEIGHT. */
  671         __be64 zero_bn = 0;
  672
  673         BUG_ON(mp->mp_aheight < 1);
  674         BUG_ON(dibh == NULL);
  675         BUG_ON(dblks < 1);
  676
  677         gfs2_trans_add_meta(ip->i_gl, dibh);
  678
  679         down_write(&ip->i_rw_mutex);
  680
  681         if (mp->mp_fheight == mp->mp_aheight) {
  682                 /* Bottom indirect block exists */
  683                 state = ALLOC_DATA;
  684         } else {
  685                 /* Need to allocate indirect blocks */
  686                 if (mp->mp_fheight == ip->i_height) {
  687                         /* Writing into existing tree, extend tree down */
  688                         iblks = mp->mp_fheight - mp->mp_aheight;
  689                         state = ALLOC_GROW_DEPTH;
  690                 } else {
  691                         /* Building up tree height */
  692                         state = ALLOC_GROW_HEIGHT;
                              /*
                               * One indirect block per added height, plus the
                               * blocks of the new branch from branch_start down.
                               */
  693                         iblks = mp->mp_fheight - ip->i_height;
  694                         branch_start = metapath_branch_start(mp);
  695                         iblks += (mp->mp_fheight - branch_start);
  696                 }
  697         }
  698
  699         /* start of the second part of the function (state machine) */
  700
  701         blks = dblks + iblks;
  702         i = mp->mp_aheight;
  703         do {
                      /*
                       * Ask for everything still missing; alloced is advanced by n
                       * afterwards, so n presumably comes back as the length of
                       * the extent actually allocated, starting at bn.
                       */
  704                 n = blks - alloced;
  705                 ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
  706                 if (ret)
  707                         goto out;
  708                 alloced += n;
                      /* Done whenever the extent may hold metadata, and for jdata. */
  709                 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
  710                         gfs2_trans_remove_revoke(sdp, bn, n);
  711                 switch (state) {
  712                 /* Growing height of tree */
  713                 case ALLOC_GROW_HEIGHT:
  714                         if (i == 1) {
                                      /*
                                       * gfs2_indirect_init() at i == 1 overwrites
                                       * the dinode's first pointer; remember it so
                                       * that it can be restored further down.
                                       */
  715                                 ptr = (__be64 *)(dibh->b_data +
  716                                                  sizeof(struct gfs2_dinode));
  717                                 zero_bn = *ptr;
  718                         }
                              /* Chain the new top heights, each via slot 0 of its parent. */
  719                         for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
  720                              i++, n--)
  721                                 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
  722                         if (i - 1 == mp->mp_fheight - ip->i_height) {
                                      /*
                                       * All added heights exist: move the dinode's
                                       * old pointer area into the lowest new block,
                                       * leaving only the first dinode pointer.
                                       */
  723                                 i--;
  724                                 gfs2_buffer_copy_tail(mp->mp_bh[i],
  725                                                 sizeof(struct gfs2_meta_header),
  726                                                 dibh, sizeof(struct gfs2_dinode));
  727                                 gfs2_buffer_clear_tail(dibh,
  728                                                 sizeof(struct gfs2_dinode) +
  729                                                 sizeof(__be64));
  730                                 ptr = (__be64 *)(mp->mp_bh[i]->b_data +
  731                                         sizeof(struct gfs2_meta_header));
  732                                 *ptr = zero_bn;
  733                                 state = ALLOC_GROW_DEPTH;
                                      /* Drop the buffers from branch_start down. */
  734                                 for(i = branch_start; i < mp->mp_fheight; i++) {
  735                                         if (mp->mp_bh[i] == NULL)
  736                                                 break;
  737                                         brelse(mp->mp_bh[i]);
  738                                         mp->mp_bh[i] = NULL;
  739                                 }
  740                                 i = branch_start;
  741                         }
                              /* Extent used up: go back and allocate more. */
  742                         if (n == 0)
  743                                 break;
  744                         fallthrough;    /* To branching from existing tree */
  745                 case ALLOC_GROW_DEPTH:
                              /* The parent at height i - 1 gets a new pointer written. */
  746                         if (i > 1 && i < mp->mp_fheight)
  747                                 gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
  748                         for (; i < mp->mp_fheight && n > 0; i++, n--)
  749                                 gfs2_indirect_init(mp, ip->i_gl, i,
  750                                                    mp->mp_list[i-1], bn++);
  751                         if (i == mp->mp_fheight)
  752                                 state = ALLOC_DATA;
  753                         if (n == 0)
  754                                 break;
  755                         fallthrough;    /* To tree complete, adding data blocks */
  756                 case ALLOC_DATA:
  757                         BUG_ON(n > dblks);
  758                         BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
  759                         gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
                              /* What is left of this extent becomes the data blocks. */
  760                         dblks = n;
  761                         ptr = metapointer(end_of_metadata, mp);
  762                         iomap->addr = bn << inode->i_blkbits;
  763                         iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
  764                         while (n-- > 0)
  765                                 *ptr++ = cpu_to_be64(bn++);
  766                         break;
  767                 }
              /*
               * iomap->addr is only assigned in ALLOC_DATA, so this repeats until
               * data blocks have been placed (assumes the caller passed in
               * iomap->addr == IOMAP_NULL_ADDR -- TODO confirm against callers).
               */
  768         } while (iomap->addr == IOMAP_NULL_ADDR);
  769
  770         iomap->type = IOMAP_MAPPED;
  771         iomap->length = (u64)dblks << inode->i_blkbits;
  772         ip->i_height = mp->mp_fheight;
  773         gfs2_add_inode_blocks(&ip->i_inode, alloced);
  774         gfs2_dinode_out(ip, dibh->b_data);
  775 out:
  776         up_write(&ip->i_rw_mutex);
  777         return ret;
  778 }
 779
 780 #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
 781
 782 /**
 783  * gfs2_alloc_size - Compute the maximum allocation size
 784  * @inode: The inode
 785  * @mp: The metapath
 786  * @size: Requested size in blocks
 787  *
 788  * Compute the maximum size of the next allocation at @mp.
 789  *
 790  * Returns: size in blocks
 791  */
 792 static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
 793 {
 794         struct gfs2_inode *ip = GFS2_I(inode);
 795         struct gfs2_sbd *sdp = GFS2_SB(inode);
 796         const __be64 *first, *ptr, *end;
 797
 798         /*
 799          * For writes to stuffed files, this function is called twice via
 800          * __gfs2_iomap_get, before and after unstuffing. The size we return the
 801          * first time needs to be large enough to get the reservation and
 802          * allocation sizes right.  The size we return the second time must
 803          * be exact or else __gfs2_iomap_alloc won't do the right thing.
 804          */
 805
 806         if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
 807                 unsigned int maxsize = mp->mp_fheight > 1 ?
 808                         sdp->sd_inptrs : sdp->sd_diptrs;
 809                 maxsize -= mp->mp_list[mp->mp_fheight - 1];
 810                 if (size > maxsize)
 811                         size = maxsize;
 812                 return size;
 813         }
 814
 815         first = metapointer(ip->i_height - 1, mp);
 816         end = metaend(ip->i_height - 1, mp);
 817         if (end - first > size)
 818                 end = first + size;
 819         for (ptr = first; ptr < end; ptr++) {
 820                 if (*ptr)
 821                         break;
 822         }
 823         return ptr - first;
 824 }
 825
/**
 * __gfs2_iomap_get - Map blocks from an inode to disk blocks
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length to map, in bytes
 * @flags: iomap flags
 * @iomap: The iomap structure
 * @mp: The metapath
 *
 * Fills in @iomap with an inline (stuffed), mapped or hole description of
 * the range starting at @pos.  ip->i_rw_mutex is held for reading while the
 * lookup runs.  The dinode buffer is stored in @mp->mp_bh[0] and is not
 * released here; the callers in this file call release_metapath() on @mp
 * afterwards.
 *
 * Returns: errno (-EINVAL for a zero @length, -ENOENT for an IOMAP_REPORT
 *          request at or beyond i_size)
 */
static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
                            unsigned flags, struct iomap *iomap,
                            struct metapath *mp)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        loff_t size = i_size_read(inode);
        __be64 *ptr;
        sector_t lblock;
        sector_t lblock_stop;
        int ret;
        int eob;
        u64 len;
        struct buffer_head *dibh = NULL, *bh;
        u8 height;

        if (!length)
                return -EINVAL;

        down_read(&ip->i_rw_mutex);

        ret = gfs2_meta_inode_buffer(ip, &dibh);
        if (ret)
                goto unlock;
        /* The metapath now owns the dinode buffer reference. */
        mp->mp_bh[0] = dibh;

        if (gfs2_is_stuffed(ip)) {
                if (flags & IOMAP_WRITE) {
                        loff_t max_size = gfs2_max_stuffed_size(ip);

                        /* Write does not fit inline: map it block-based. */
                        if (pos + length > max_size)
                                goto unstuff;
                        iomap->length = max_size;
                } else {
                        if (pos >= size) {
                                if (flags & IOMAP_REPORT) {
                                        ret = -ENOENT;
                                        goto unlock;
                                } else {
                                        /* Reading past EOF: report a hole. */
                                        iomap->offset = pos;
                                        iomap->length = length;
                                        goto hole_found;
                                }
                        }
                        iomap->length = size;
                }
                /* Inline data lives in the dinode block, after the header. */
                iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
                              sizeof(struct gfs2_dinode);
                iomap->type = IOMAP_INLINE;
                iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
                goto out;
        }

unstuff:
        /* Expand the byte range to whole filesystem blocks. */
        lblock = pos >> inode->i_blkbits;
        iomap->offset = lblock << inode->i_blkbits;
        lblock_stop = (pos + length - 1) >> inode->i_blkbits;
        len = lblock_stop - lblock + 1;
        iomap->length = len << inode->i_blkbits;

        /* Find the smallest tree height able to address lblock. */
        height = ip->i_height;
        while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
                height++;
        find_metapath(sdp, lblock, mp, height);
        /* Tree too short, or inode still stuffed: nothing is mapped there. */
        if (height > ip->i_height || gfs2_is_stuffed(ip))
                goto do_alloc;

        ret = lookup_metapath(ip, mp);
        if (ret)
                goto unlock;

        /* The lookup stopped short of the bottom of the tree. */
        if (mp->mp_aheight != ip->i_height)
                goto do_alloc;

        ptr = metapointer(ip->i_height - 1, mp);
        if (*ptr == 0)
                goto do_alloc;

        bh = mp->mp_bh[ip->i_height - 1];
        len = gfs2_extent_length(bh, ptr, len, &eob);

        iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
        iomap->length = len << inode->i_blkbits;
        iomap->type = IOMAP_MAPPED;
        iomap->flags |= IOMAP_F_MERGED;
        if (eob)
                iomap->flags |= IOMAP_F_GFS2_BOUNDARY;

out:
        iomap->bdev = inode->i_sb->s_bdev;
unlock:
        up_read(&ip->i_rw_mutex);
        return ret;

do_alloc:
        /* No block is mapped at lblock; size the hole according to @flags. */
        if (flags & IOMAP_REPORT) {
                if (pos >= size)
                        ret = -ENOENT;
                else if (height == ip->i_height)
                        ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
                else
                        iomap->length = size - iomap->offset;
        } else if (flags & IOMAP_WRITE) {
                u64 alloc_size;

                /* Note: this skips the hole_found assignments below. */
                if (flags & IOMAP_DIRECT)
                        goto out;  /* (see gfs2_file_direct_write) */

                /* Never report more than one allocation can cover. */
                len = gfs2_alloc_size(inode, mp, len);
                alloc_size = len << inode->i_blkbits;
                if (alloc_size < iomap->length)
                        iomap->length = alloc_size;
        } else {
                if (pos < size && height == ip->i_height)
                        ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
        }
hole_found:
        iomap->addr = IOMAP_NULL_ADDR;
        iomap->type = IOMAP_HOLE;
        goto out;
}
 958
 959 static struct folio *
 960 gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
 961 {
 962         struct inode *inode = iter->inode;
 963         unsigned int blockmask = i_blocksize(inode) - 1;
 964         struct gfs2_sbd *sdp = GFS2_SB(inode);
 965         unsigned int blocks;
 966         struct folio *folio;
 967         int status;
 968
 969         blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
 970         status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
 971         if (status)
 972                 return ERR_PTR(status);
 973
 974         folio = iomap_get_folio(iter, pos, len);
 975         if (IS_ERR(folio))
 976                 gfs2_trans_end(sdp);
 977         return folio;
 978 }
 979
 980 static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
 981                                  unsigned copied, struct folio *folio)
 982 {
 983         struct gfs2_trans *tr = current->journal_info;
 984         struct gfs2_inode *ip = GFS2_I(inode);
 985         struct gfs2_sbd *sdp = GFS2_SB(inode);
 986
 987         if (!gfs2_is_stuffed(ip))
 988                 gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos),
 989                                         copied);
 990
 991         folio_unlock(folio);
 992         folio_put(folio);
 993
 994         if (tr->tr_num_buf_new)
 995                 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 996
 997         gfs2_trans_end(sdp);
 998 }
 999
/*
 * Folio callbacks that wrap each folio access in its own transaction.
 * gfs2_iomap_begin_write() installs them for stuffed and jdata inodes.
 */
static const struct iomap_folio_ops gfs2_iomap_folio_ops = {
        .get_folio = gfs2_iomap_get_folio,
        .put_folio = gfs2_iomap_put_folio,
};
1004
/*
 * gfs2_iomap_begin_write - prepare a buffered write or zeroing operation
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length of the operation, in bytes
 * @flags: iomap flags
 * @iomap: The mapping previously filled in by __gfs2_iomap_get()
 * @mp: The metapath matching @iomap
 *
 * If the inode must be unstuffed or @iomap describes a hole, take the quota
 * lock, reserve blocks, and perform the unstuffing and/or allocation inside
 * one transaction.  On success the quota lock and the block reservation taken
 * here are left in place; gfs2_iomap_end() contains the matching
 * gfs2_inplace_release() and gfs2_quota_unlock() calls.  On failure
 * everything taken here is released again.
 *
 * Returns: errno
 */
static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
                                  loff_t length, unsigned flags,
                                  struct iomap *iomap,
                                  struct metapath *mp)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        bool unstuff;
        int ret;

        /* A stuffed inode must be unstuffed if the write no longer fits. */
        unstuff = gfs2_is_stuffed(ip) &&
                  pos + length > gfs2_max_stuffed_size(ip);

        if (unstuff || iomap->type == IOMAP_HOLE) {
                unsigned int data_blocks, ind_blocks;
                struct gfs2_alloc_parms ap = {};
                unsigned int rblocks;
                struct gfs2_trans *tr;

                gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
                                       &ind_blocks);
                ap.target = data_blocks + ind_blocks;
                ret = gfs2_quota_lock_check(ip, &ap);
                if (ret)
                        return ret;

                ret = gfs2_inplace_reserve(ip, &ap);
                if (ret)
                        goto out_qunlock;

                /* Work out how many journal blocks the transaction needs. */
                rblocks = RES_DINODE + ind_blocks;
                if (gfs2_is_jdata(ip))
                        rblocks += data_blocks;
                if (ind_blocks || data_blocks)
                        rblocks += RES_STATFS + RES_QUOTA;
                if (inode == sdp->sd_rindex)
                        rblocks += 2 * RES_STATFS;
                rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);

                ret = gfs2_trans_begin(sdp, rblocks,
                                       iomap->length >> inode->i_blkbits);
                if (ret)
                        goto out_trans_fail;

                if (unstuff) {
                        ret = gfs2_unstuff_dinode(ip);
                        if (ret)
                                goto out_trans_end;
                        /* The old metapath is stale now; look the range up again. */
                        release_metapath(mp);
                        ret = __gfs2_iomap_get(inode, iomap->offset,
                                               iomap->length, flags, iomap, mp);
                        if (ret)
                                goto out_trans_end;
                }

                if (iomap->type == IOMAP_HOLE) {
                        ret = __gfs2_iomap_alloc(inode, iomap, mp);
                        if (ret) {
                                /*
                                 * End the transaction and drop the reservation
                                 * first, then punch out whatever the failed
                                 * allocation left behind in the range.
                                 */
                                gfs2_trans_end(sdp);
                                gfs2_inplace_release(ip);
                                punch_hole(ip, iomap->offset, iomap->length);
                                goto out_qunlock;
                        }
                }

                tr = current->journal_info;
                if (tr->tr_num_buf_new)
                        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);

                gfs2_trans_end(sdp);
        }

        /* Stuffed and jdata inodes need a transaction around each folio. */
        if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
                iomap->folio_ops = &gfs2_iomap_folio_ops;
        return 0;

        /* Error unwinding, in reverse order of acquisition. */
out_trans_end:
        gfs2_trans_end(sdp);
out_trans_fail:
        gfs2_inplace_release(ip);
out_qunlock:
        gfs2_quota_unlock(ip);
        return ret;
}
1089
1090 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
1091                             unsigned flags, struct iomap *iomap,
1092                             struct iomap *srcmap)
1093 {
1094         struct gfs2_inode *ip = GFS2_I(inode);
1095         struct metapath mp = { .mp_aheight = 1, };
1096         int ret;
1097
1098         if (gfs2_is_jdata(ip))
1099                 iomap->flags |= IOMAP_F_BUFFER_HEAD;
1100
1101         trace_gfs2_iomap_start(ip, pos, length, flags);
1102         ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
1103         if (ret)
1104                 goto out_unlock;
1105
1106         switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1107         case IOMAP_WRITE:
1108                 if (flags & IOMAP_DIRECT) {
1109                         /*
1110                          * Silently fall back to buffered I/O for stuffed files
1111                          * or if we've got a hole (see gfs2_file_direct_write).
1112                          */
1113                         if (iomap->type != IOMAP_MAPPED)
1114                                 ret = -ENOTBLK;
1115                         goto out_unlock;
1116                 }
1117                 break;
1118         case IOMAP_ZERO:
1119                 if (iomap->type == IOMAP_HOLE)
1120                         goto out_unlock;
1121                 break;
1122         default:
1123                 goto out_unlock;
1124         }
1125
1126         ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
1127
1128 out_unlock:
1129         release_metapath(&mp);
1130         trace_gfs2_iomap_end(ip, iomap, ret);
1131         return ret;
1132 }
1133
1134 static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
1135                           ssize_t written, unsigned flags, struct iomap *iomap)
1136 {
1137         struct gfs2_inode *ip = GFS2_I(inode);
1138         struct gfs2_sbd *sdp = GFS2_SB(inode);
1139
1140         switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1141         case IOMAP_WRITE:
1142                 if (flags & IOMAP_DIRECT)
1143                         return 0;
1144                 break;
1145         case IOMAP_ZERO:
1146                  if (iomap->type == IOMAP_HOLE)
1147                          return 0;
1148                  break;
1149         default:
1150                  return 0;
1151         }
1152
1153         if (!gfs2_is_stuffed(ip))
1154                 gfs2_ordered_add_inode(ip);
1155
1156         if (inode == sdp->sd_rindex)
1157                 adjust_fs_space(inode);
1158
1159         gfs2_inplace_release(ip);
1160
1161         if (ip->i_qadata && ip->i_qadata->qa_qd_num)
1162                 gfs2_quota_unlock(ip);
1163
1164         if (length != written && (iomap->flags & IOMAP_F_NEW)) {
1165                 /* Deallocate blocks that were just allocated. */
1166                 loff_t hstart = round_up(pos + written, i_blocksize(inode));
1167                 loff_t hend = iomap->offset + iomap->length;
1168
1169                 if (hstart < hend) {
1170                         truncate_pagecache_range(inode, hstart, hend - 1);
1171                         punch_hole(ip, hstart, hend - hstart);
1172                 }
1173         }
1174
1175         if (unlikely(!written))
1176                 return 0;
1177
1178         if (iomap->flags & IOMAP_F_SIZE_CHANGED)
1179                 mark_inode_dirty(inode);
1180         set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
1181         return 0;
1182 }
1183
/*
 * iomap operations for gfs2 inodes.  Within this file they are passed to
 * iomap_zero_range() by gfs2_block_zero_range().
 */
const struct iomap_ops gfs2_iomap_ops = {
        .iomap_begin = gfs2_iomap_begin,
        .iomap_end = gfs2_iomap_end,
};
1188
1189 /**
1190  * gfs2_block_map - Map one or more blocks of an inode to a disk block
1191  * @inode: The inode
1192  * @lblock: The logical block number
1193  * @bh_map: The bh to be mapped
1194  * @create: True if its ok to alloc blocks to satify the request
1195  *
1196  * The size of the requested mapping is defined in bh_map->b_size.
1197  *
1198  * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
1199  * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
1200  * bh_map->b_size to indicate the size of the mapping when @lblock and
1201  * successive blocks are mapped, up to the requested size.
1202  *
1203  * Sets buffer_boundary() if a read of metadata will be required
1204  * before the next block can be mapped. Sets buffer_new() if new
1205  * blocks were allocated.
1206  *
1207  * Returns: errno
1208  */
1209
1210 int gfs2_block_map(struct inode *inode, sector_t lblock,
1211                    struct buffer_head *bh_map, int create)
1212 {
1213         struct gfs2_inode *ip = GFS2_I(inode);
1214         loff_t pos = (loff_t)lblock << inode->i_blkbits;
1215         loff_t length = bh_map->b_size;
1216         struct iomap iomap = { };
1217         int ret;
1218
1219         clear_buffer_mapped(bh_map);
1220         clear_buffer_new(bh_map);
1221         clear_buffer_boundary(bh_map);
1222         trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
1223
1224         if (!create)
1225                 ret = gfs2_iomap_get(inode, pos, length, &iomap);
1226         else
1227                 ret = gfs2_iomap_alloc(inode, pos, length, &iomap);
1228         if (ret)
1229                 goto out;
1230
1231         if (iomap.length > bh_map->b_size) {
1232                 iomap.length = bh_map->b_size;
1233                 iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
1234         }
1235         if (iomap.addr != IOMAP_NULL_ADDR)
1236                 map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
1237         bh_map->b_size = iomap.length;
1238         if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
1239                 set_buffer_boundary(bh_map);
1240         if (iomap.flags & IOMAP_F_NEW)
1241                 set_buffer_new(bh_map);
1242
1243 out:
1244         trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
1245         return ret;
1246 }
1247
1248 int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
1249                     unsigned int *extlen)
1250 {
1251         unsigned int blkbits = inode->i_blkbits;
1252         struct iomap iomap = { };
1253         unsigned int len;
1254         int ret;
1255
1256         ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits,
1257                              &iomap);
1258         if (ret)
1259                 return ret;
1260         if (iomap.type != IOMAP_MAPPED)
1261                 return -EIO;
1262         *dblock = iomap.addr >> blkbits;
1263         len = iomap.length >> blkbits;
1264         if (len < *extlen)
1265                 *extlen = len;
1266         return 0;
1267 }
1268
1269 int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
1270                       unsigned int *extlen, bool *new)
1271 {
1272         unsigned int blkbits = inode->i_blkbits;
1273         struct iomap iomap = { };
1274         unsigned int len;
1275         int ret;
1276
1277         ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits,
1278                                &iomap);
1279         if (ret)
1280                 return ret;
1281         if (iomap.type != IOMAP_MAPPED)
1282                 return -EIO;
1283         *dblock = iomap.addr >> blkbits;
1284         len = iomap.length >> blkbits;
1285         if (len < *extlen)
1286                 *extlen = len;
1287         *new = iomap.flags & IOMAP_F_NEW;
1288         return 0;
1289 }
1290
1291 /*
1292  * NOTE: Never call gfs2_block_zero_range with an open transaction because it
1293  * uses iomap write to perform its actions, which begin their own transactions
1294  * (iomap_begin, get_folio, etc.)
1295  */
1296 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
1297                                  unsigned int length)
1298 {
1299         BUG_ON(current->journal_info);
1300         return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
1301 }
1302
/*
 * Revoke count requested for each transaction used while truncating a jdata
 * file.  gfs2_journaled_truncate() also multiplies it by the block size to
 * bound how many bytes it truncates per step.
 */
#define GFS2_JTRUNC_REVOKES 8192
1304
1305 /**
1306  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1307  * @inode: The inode being truncated
1308  * @oldsize: The original (larger) size
1309  * @newsize: The new smaller size
1310  *
1311  * With jdata files, we have to journal a revoke for each block which is
1312  * truncated. As a result, we need to split this into separate transactions
1313  * if the number of pages being truncated gets too large.
1314  */
1315
1316 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1317 {
1318         struct gfs2_sbd *sdp = GFS2_SB(inode);
1319         u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1320         u64 chunk;
1321         int error;
1322
1323         while (oldsize != newsize) {
1324                 struct gfs2_trans *tr;
1325                 unsigned int offs;
1326
1327                 chunk = oldsize - newsize;
1328                 if (chunk > max_chunk)
1329                         chunk = max_chunk;
1330
1331                 offs = oldsize & ~PAGE_MASK;
1332                 if (offs && chunk > PAGE_SIZE)
1333                         chunk = offs + ((chunk - offs) & PAGE_MASK);
1334
1335                 truncate_pagecache(inode, oldsize - chunk);
1336                 oldsize -= chunk;
1337
1338                 tr = current->journal_info;
1339                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1340                         continue;
1341
1342                 gfs2_trans_end(sdp);
1343                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1344                 if (error)
1345                         return error;
1346         }
1347
1348         return 0;
1349 }
1350
/*
 * trunc_start - first phase of shrinking an inode to @newsize
 * @inode: The inode
 * @newsize: The new, smaller size in bytes
 *
 * Zeroes the tail of the block containing @newsize (non-stuffed inodes
 * only), then, inside a transaction, updates the dinode with the new size
 * and timestamps and truncates the page cache.  For non-stuffed inodes
 * GFS2_DIF_TRUNC_IN_PROG is set in i_diskflags.
 *
 * Returns: errno
 */
static int trunc_start(struct inode *inode, u64 newsize)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct buffer_head *dibh = NULL;
        int journaled = gfs2_is_jdata(ip);
        u64 oldsize = inode->i_size;
        int error;

        /*
         * Must happen before the transaction below is started:
         * gfs2_block_zero_range() may not run inside a transaction.
         */
        if (!gfs2_is_stuffed(ip)) {
                unsigned int blocksize = i_blocksize(inode);
                unsigned int offs = newsize & (blocksize - 1);
                if (offs) {
                        error = gfs2_block_zero_range(inode, newsize,
                                                      blocksize - offs);
                        if (error)
                                return error;
                }
        }
        if (journaled)
                error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
        else
                error = gfs2_trans_begin(sdp, RES_DINODE, 0);
        if (error)
                return error;

        error = gfs2_meta_inode_buffer(ip, &dibh);
        if (error)
                goto out;

        gfs2_trans_add_meta(ip->i_gl, dibh);

        /* Stuffed data lives in the dinode block: clear what lies past newsize. */
        if (gfs2_is_stuffed(ip))
                gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
        else
                ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;

        i_size_write(inode, newsize);
        inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
        gfs2_dinode_out(ip, dibh->b_data);

        if (journaled)
                error = gfs2_journaled_truncate(inode, oldsize, newsize);
        else
                truncate_pagecache(inode, newsize);

out:
        /* dibh is still NULL here if gfs2_meta_inode_buffer() failed. */
        brelse(dibh);
        /*
         * gfs2_journaled_truncate() can return after a failed
         * gfs2_trans_begin(); presumably no transaction is current then.
         */
        if (current->journal_info)
                gfs2_trans_end(sdp);
        return error;
}
1403
1404 int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
1405                    struct iomap *iomap)
1406 {
1407         struct metapath mp = { .mp_aheight = 1, };
1408         int ret;
1409
1410         ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp);
1411         release_metapath(&mp);
1412         return ret;
1413 }
1414
1415 int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
1416                      struct iomap *iomap)
1417 {
1418         struct metapath mp = { .mp_aheight = 1, };
1419         int ret;
1420
1421         ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1422         if (!ret && iomap->type == IOMAP_HOLE)
1423                 ret = __gfs2_iomap_alloc(inode, iomap, &mp);
1424         release_metapath(&mp);
1425         return ret;
1426 }
1427
/**
 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
 * @ip: inode
 * @rd_gh: holder of resource group glock
 * @bh: buffer head to sweep
 * @start: starting point in bh
 * @end: end point in bh
 * @meta: true if bh points to metadata (rather than data)
 * @btotal: place to keep count of total blocks freed
 *
 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
 * free, and free them all. However, we do it one rgrp at a time. If this
 * block has references to multiple rgrps, we break it into individual
 * transactions. This allows other processes to use the rgrps while we're
 * focused on a single one, for better concurrency / performance.
 * At every transaction boundary, we rewrite the inode into the journal.
 * That way the bitmaps are kept consistent with the inode and we can recover
 * if we're interrupted by power-outages.
 *
 * NOTE(review): when the sweep finishes with no pointers left outside the
 * current rgrp, this returns without ending the transaction, releasing
 * ip->i_rw_mutex or dropping @rd_gh if they were taken here; presumably the
 * caller finishes them - confirm against the caller.
 *
 * Returns: 0, or return code if an error occurred.
 *          *btotal has the total number of blocks freed
 */
static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
                              struct buffer_head *bh, __be64 *start, __be64 *end,
                              bool meta, u32 *btotal)
{
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_rgrpd *rgd;
        struct gfs2_trans *tr;
        __be64 *p;
        int blks_outside_rgrp;
        u64 bn, bstart, isize_blks;
        s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
        int ret = 0;
        bool buf_in_tr = false; /* buffer was added to transaction */

more_rgrps:
        /* Continue with the rgrp already held in rd_gh, if there is one. */
        rgd = NULL;
        if (gfs2_holder_initialized(rd_gh)) {
                rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
                gfs2_assert_withdraw(sdp,
                             gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
        }
        blks_outside_rgrp = 0;
        /* bstart/blen track the current run of contiguous blocks to free. */
        bstart = 0;
        blen = 0;

        for (p = start; p < end; p++) {
                if (!*p)
                        continue;
                bn = be64_to_cpu(*p);

                if (rgd) {
                        /* Leave blocks of other rgrps for a later pass. */
                        if (!rgrp_contains_block(rgd, bn)) {
                                blks_outside_rgrp++;
                                continue;
                        }
                } else {
                        /* First block of this pass: it selects the rgrp. */
                        rgd = gfs2_blk2rgrpd(sdp, bn, true);
                        if (unlikely(!rgd)) {
                                ret = -EIO;
                                goto out;
                        }
                        ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
                                                 LM_FLAG_NODE_SCOPE, rd_gh);
                        if (ret)
                                goto out;

                        /* Must be done with the rgrp glock held: */
                        if (gfs2_rs_active(&ip->i_res) &&
                            rgd == ip->i_res.rs_rgd)
                                gfs2_rs_deltree(&ip->i_res);
                }

                /* The size of our transactions will be unknown until we
                   actually process all the metadata blocks that relate to
                   the rgrp. So we estimate. We know it can't be more than
                   the dinode's i_blocks and we don't want to exceed the
                   journal flush threshold, sd_log_thresh2. */
                if (current->journal_info == NULL) {
                        unsigned int jblocks_rqsted, revokes;

                        jblocks_rqsted = rgd->rd_length + RES_DINODE +
                                RES_INDIRECT;
                        isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
                        if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
                                jblocks_rqsted +=
                                        atomic_read(&sdp->sd_log_thresh2);
                        else
                                jblocks_rqsted += isize_blks;
                        revokes = jblocks_rqsted;
                        if (meta)
                                revokes += end - start;
                        else if (ip->i_depth)
                                revokes += sdp->sd_inptrs;
                        ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
                        if (ret)
                                goto out_unlock;
                        /* Taken together with the transaction; dropped with it below. */
                        down_write(&ip->i_rw_mutex);
                }
                /* check if we will exceed the transaction blocks requested */
                tr = current->journal_info;
                if (tr->tr_num_buf_new + RES_STATFS +
                    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
                        /* We set blks_outside_rgrp to ensure the loop will
                           be repeated for the same rgrp, but with a new
                           transaction. */
                        blks_outside_rgrp++;
                        /* This next part is tricky. If the buffer was added
                           to the transaction, we've already set some block
                           pointers to 0, so we better follow through and free
                           them, or we will introduce corruption (so break).
                           This may be impossible, or at least rare, but I
                           decided to cover the case regardless.

                           If the buffer was not added to the transaction
                           (this call), doing so would exceed our transaction
                           size, so we need to end the transaction and start a
                           new one (so goto). */

                        if (buf_in_tr)
                                break;
                        goto out_unlock;
                }

                gfs2_trans_add_meta(ip->i_gl, bh);
                buf_in_tr = true;
                *p = 0;
                /* Block extends the current run: keep accumulating. */
                if (bstart + blen == bn) {
                        blen++;
                        continue;
                }
                /* Run interrupted: free what was accumulated so far. */
                if (bstart) {
                        __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
                        (*btotal) += blen;
                        gfs2_add_inode_blocks(&ip->i_inode, -blen);
                }
                bstart = bn;
                blen = 1;
        }
        /* Free the final run, if any. */
        if (bstart) {
                __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
                (*btotal) += blen;
                gfs2_add_inode_blocks(&ip->i_inode, -blen);
        }
out_unlock:
        if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
                                            outside the rgrp we just processed,
                                            do it all over again. */
                if (current->journal_info) {
                        struct buffer_head *dibh;

                        ret = gfs2_meta_inode_buffer(ip, &dibh);
                        if (ret)
                                goto out;

                        /* Every transaction boundary, we rewrite the dinode
                           to keep its di_blocks current in case of failure. */
                        inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
                        gfs2_trans_add_meta(ip->i_gl, dibh);
                        gfs2_dinode_out(ip, dibh->b_data);
                        brelse(dibh);
                        up_write(&ip->i_rw_mutex);
                        gfs2_trans_end(sdp);
                        buf_in_tr = false;
                }
                /* Drop this rgrp so the next pass can pick another one. */
                gfs2_glock_dq_uninit(rd_gh);
                cond_resched();
                goto more_rgrps;
        }
out:
        return ret;
}
1601
1602 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1603 {
1604         if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1605                 return false;
1606         return true;
1607 }
1608
1609 /**
1610  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1611  * @sdp: The superblock
1612  * @mp: starting metapath
1613  * @h: desired height to search
1614  * @end_list: See punch_hole().
1615  * @end_aligned: See punch_hole().
1616  *
1617  * Assumes the metapath is valid (with buffers) out to height h.
1618  * Returns: true if a non-null pointer was found in the metapath buffer
1619  *          false if all remaining pointers are NULL in the buffer
1620  */
1621 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1622                              unsigned int h,
1623                              __u16 *end_list, unsigned int end_aligned)
1624 {
1625         struct buffer_head *bh = mp->mp_bh[h];
1626         __be64 *first, *ptr, *end;
1627
1628         first = metaptr1(h, mp);
1629         ptr = first + mp->mp_list[h];
1630         end = (__be64 *)(bh->b_data + bh->b_size);
1631         if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1632                 bool keep_end = h < end_aligned;
1633                 end = first + end_list[h] + keep_end;
1634         }
1635
1636         while (ptr < end) {
1637                 if (*ptr) { /* if we have a non-null pointer */
1638                         mp->mp_list[h] = ptr - first;
1639                         h++;
1640                         if (h < GFS2_MAX_META_HEIGHT)
1641                                 mp->mp_list[h] = 0;
1642                         return true;
1643                 }
1644                 ptr++;
1645         }
1646         return false;
1647 }
1648
/* States of the deallocation state machine in punch_hole(). */
enum dealloc_states {
        /* Strip a metapath with all buffers read in */
        DEALLOC_MP_FULL = 0,
        /* lower the metapath strip height */
        DEALLOC_MP_LOWER = 1,
        /* Fill in the metapath to the given height. */
        DEALLOC_FILL_MP = 2,
        /* process complete */
        DEALLOC_DONE = 3,
};
1655
1656 static inline void
1657 metapointer_range(struct metapath *mp, int height,
1658                   __u16 *start_list, unsigned int start_aligned,
1659                   __u16 *end_list, unsigned int end_aligned,
1660                   __be64 **start, __be64 **end)
1661 {
1662         struct buffer_head *bh = mp->mp_bh[height];
1663         __be64 *first;
1664
1665         first = metaptr1(height, mp);
1666         *start = first;
1667         if (mp_eq_to_hgt(mp, start_list, height)) {
1668                 bool keep_start = height < start_aligned;
1669                 *start = first + start_list[height] + keep_start;
1670         }
1671         *end = (__be64 *)(bh->b_data + bh->b_size);
1672         if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1673                 bool keep_end = height < end_aligned;
1674                 *end = first + end_list[height] + keep_end;
1675         }
1676 }
1677
1678 static inline bool walk_done(struct gfs2_sbd *sdp,
1679                              struct metapath *mp, int height,
1680                              __u16 *end_list, unsigned int end_aligned)
1681 {
1682         __u16 end;
1683
1684         if (end_list) {
1685                 bool keep_end = height < end_aligned;
1686                 if (!mp_eq_to_hgt(mp, end_list, height))
1687                         return false;
1688                 end = end_list[height] + keep_end;
1689         } else
1690                 end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1691         return mp->mp_list[height] >= end;
1692 }
1693
1694 /**
1695  * punch_hole - deallocate blocks in a file
1696  * @ip: inode to truncate
1697  * @offset: the start of the hole
1698  * @length: the size of the hole (or 0 for truncate)
1699  *
1700  * Punch a hole into a file or truncate a file at a given position.  This
1701  * function operates in whole blocks (@offset and @length are rounded
1702  * accordingly); partially filled blocks must be cleared otherwise.
1703  *
1704  * This function works from the bottom up, and from the right to the left. In
1705  * other words, it strips off the highest layer (data) before stripping any of
1706  * the metadata. Doing it this way is best in case the operation is interrupted
1707  * by power failure, etc.  The dinode is rewritten in every transaction to
1708  * guarantee integrity.
1709  */
1710 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1711 {
1712         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1713         u64 maxsize = sdp->sd_heightsize[ip->i_height];
1714         struct metapath mp = {};
1715         struct buffer_head *dibh, *bh;
1716         struct gfs2_holder rd_gh;
1717         unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1718         u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1719         __u16 start_list[GFS2_MAX_META_HEIGHT];
1720         __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1721         unsigned int start_aligned, end_aligned;
1722         unsigned int strip_h = ip->i_height - 1;
1723         u32 btotal = 0;
1724         int ret, state;
1725         int mp_h; /* metapath buffers are read in to this height */
1726         u64 prev_bnr = 0;
1727         __be64 *start, *end;
1728
1729         if (offset >= maxsize) {
1730                 /*
1731                  * The starting point lies beyond the allocated metadata;
1732                  * there are no blocks to deallocate.
1733                  */
1734                 return 0;
1735         }
1736
1737         /*
1738          * The start position of the hole is defined by lblock, start_list, and
1739          * start_aligned.  The end position of the hole is defined by lend,
1740          * end_list, and end_aligned.
1741          *
1742          * start_aligned and end_aligned define down to which height the start
1743          * and end positions are aligned to the metadata tree (i.e., the
1744          * position is a multiple of the metadata granularity at the height
1745          * above).  This determines at which heights additional meta pointers
1746          * needs to be preserved for the remaining data.
1747          */
1748
1749         if (length) {
1750                 u64 end_offset = offset + length;
1751                 u64 lend;
1752
1753                 /*
1754                  * Clip the end at the maximum file size for the given height:
1755                  * that's how far the metadata goes; files bigger than that
1756                  * will have additional layers of indirection.
1757                  */
1758                 if (end_offset > maxsize)
1759                         end_offset = maxsize;
1760                 lend = end_offset >> bsize_shift;
1761
1762                 if (lblock >= lend)
1763                         return 0;
1764
1765                 find_metapath(sdp, lend, &mp, ip->i_height);
1766                 end_list = __end_list;
1767                 memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1768
1769                 for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1770                         if (end_list[mp_h])
1771                                 break;
1772                 }
1773                 end_aligned = mp_h;
1774         }
1775
1776         find_metapath(sdp, lblock, &mp, ip->i_height);
1777         memcpy(start_list, mp.mp_list, sizeof(start_list));
1778
1779         for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1780                 if (start_list[mp_h])
1781                         break;
1782         }
1783         start_aligned = mp_h;
1784
1785         ret = gfs2_meta_inode_buffer(ip, &dibh);
1786         if (ret)
1787                 return ret;
1788
1789         mp.mp_bh[0] = dibh;
1790         ret = lookup_metapath(ip, &mp);
1791         if (ret)
1792                 goto out_metapath;
1793
1794         /* issue read-ahead on metadata */
1795         for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1796                 metapointer_range(&mp, mp_h, start_list, start_aligned,
1797                                   end_list, end_aligned, &start, &end);
1798                 gfs2_metapath_ra(ip->i_gl, start, end);
1799         }
1800
1801         if (mp.mp_aheight == ip->i_height)
1802                 state = DEALLOC_MP_FULL; /* We have a complete metapath */
1803         else
1804                 state = DEALLOC_FILL_MP; /* deal with partial metapath */
1805
1806         ret = gfs2_rindex_update(sdp);
1807         if (ret)
1808                 goto out_metapath;
1809
1810         ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1811         if (ret)
1812                 goto out_metapath;
1813         gfs2_holder_mark_uninitialized(&rd_gh);
1814
1815         mp_h = strip_h;
1816
1817         while (state != DEALLOC_DONE) {
1818                 switch (state) {
1819                 /* Truncate a full metapath at the given strip height.
1820                  * Note that strip_h == mp_h in order to be in this state. */
1821                 case DEALLOC_MP_FULL:
1822                         bh = mp.mp_bh[mp_h];
1823                         gfs2_assert_withdraw(sdp, bh);
1824                         if (gfs2_assert_withdraw(sdp,
1825                                                  prev_bnr != bh->b_blocknr)) {
1826                                 fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u,"
1827                                          "s_h:%u, mp_h:%u\n",
1828                                        (unsigned long long)ip->i_no_addr,
1829                                        prev_bnr, ip->i_height, strip_h, mp_h);
1830                         }
1831                         prev_bnr = bh->b_blocknr;
1832
1833                         if (gfs2_metatype_check(sdp, bh,
1834                                                 (mp_h ? GFS2_METATYPE_IN :
1835                                                         GFS2_METATYPE_DI))) {
1836                                 ret = -EIO;
1837                                 goto out;
1838                         }
1839
1840                         /*
1841                          * Below, passing end_aligned as 0 gives us the
1842                          * metapointer range excluding the end point: the end
1843                          * point is the first metapath we must not deallocate!
1844                          */
1845
1846                         metapointer_range(&mp, mp_h, start_list, start_aligned,
1847                                           end_list, 0 /* end_aligned */,
1848                                           &start, &end);
1849                         ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1850                                                  start, end,
1851                                                  mp_h != ip->i_height - 1,
1852                                                  &btotal);
1853
1854                         /* If we hit an error or just swept dinode buffer,
1855                            just exit. */
1856                         if (ret || !mp_h) {
1857                                 state = DEALLOC_DONE;
1858                                 break;
1859                         }
1860                         state = DEALLOC_MP_LOWER;
1861                         break;
1862
1863                 /* lower the metapath strip height */
1864                 case DEALLOC_MP_LOWER:
1865                         /* We're done with the current buffer, so release it,
1866                            unless it's the dinode buffer. Then back up to the
1867                            previous pointer. */
1868                         if (mp_h) {
1869                                 brelse(mp.mp_bh[mp_h]);
1870                                 mp.mp_bh[mp_h] = NULL;
1871                         }
1872                         /* If we can't get any lower in height, we've stripped
1873                            off all we can. Next step is to back up and start
1874                            stripping the previous level of metadata. */
1875                         if (mp_h == 0) {
1876                                 strip_h--;
1877                                 memcpy(mp.mp_list, start_list, sizeof(start_list));
1878                                 mp_h = strip_h;
1879                                 state = DEALLOC_FILL_MP;
1880                                 break;
1881                         }
1882                         mp.mp_list[mp_h] = 0;
1883                         mp_h--; /* search one metadata height down */
1884                         mp.mp_list[mp_h]++;
1885                         if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1886                                 break;
1887                         /* Here we've found a part of the metapath that is not
1888                          * allocated. We need to search at that height for the
1889                          * next non-null pointer. */
1890                         if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1891                                 state = DEALLOC_FILL_MP;
1892                                 mp_h++;
1893                         }
1894                         /* No more non-null pointers at this height. Back up
1895                            to the previous height and try again. */
1896                         break; /* loop around in the same state */
1897
1898                 /* Fill the metapath with buffers to the given height. */
1899                 case DEALLOC_FILL_MP:
1900                         /* Fill the buffers out to the current height. */
1901                         ret = fillup_metapath(ip, &mp, mp_h);
1902                         if (ret < 0)
1903                                 goto out;
1904
1905                         /* On the first pass, issue read-ahead on metadata. */
1906                         if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1907                                 unsigned int height = mp.mp_aheight - 1;
1908
1909                                 /* No read-ahead for data blocks. */
1910                                 if (mp.mp_aheight - 1 == strip_h)
1911                                         height--;
1912
1913                                 for (; height >= mp.mp_aheight - ret; height--) {
1914                                         metapointer_range(&mp, height,
1915                                                           start_list, start_aligned,
1916                                                           end_list, end_aligned,
1917                                                           &start, &end);
1918                                         gfs2_metapath_ra(ip->i_gl, start, end);
1919                                 }
1920                         }
1921
1922                         /* If buffers found for the entire strip height */
1923                         if (mp.mp_aheight - 1 == strip_h) {
1924                                 state = DEALLOC_MP_FULL;
1925                                 break;
1926                         }
1927                         if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1928                                 mp_h = mp.mp_aheight - 1;
1929
1930                         /* If we find a non-null block pointer, crawl a bit
1931                            higher up in the metapath and try again, otherwise
1932                            we need to look lower for a new starting point. */
1933                         if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1934                                 mp_h++;
1935                         else
1936                                 state = DEALLOC_MP_LOWER;
1937                         break;
1938                 }
1939         }
1940
1941         if (btotal) {
1942                 if (current->journal_info == NULL) {
1943                         ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1944                                                RES_QUOTA, 0);
1945                         if (ret)
1946                                 goto out;
1947                         down_write(&ip->i_rw_mutex);
1948                 }
1949                 gfs2_statfs_change(sdp, 0, +btotal, 0);
1950                 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1951                                   ip->i_inode.i_gid);
1952                 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1953                 gfs2_trans_add_meta(ip->i_gl, dibh);
1954                 gfs2_dinode_out(ip, dibh->b_data);
1955                 up_write(&ip->i_rw_mutex);
1956                 gfs2_trans_end(sdp);
1957         }
1958
1959 out:
1960         if (gfs2_holder_initialized(&rd_gh))
1961                 gfs2_glock_dq_uninit(&rd_gh);
1962         if (current->journal_info) {
1963                 up_write(&ip->i_rw_mutex);
1964                 gfs2_trans_end(sdp);
1965                 cond_resched();
1966         }
1967         gfs2_quota_unhold(ip);
1968 out_metapath:
1969         release_metapath(&mp);
1970         return ret;
1971 }
1972
1973 static int trunc_end(struct gfs2_inode *ip)
1974 {
1975         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1976         struct buffer_head *dibh;
1977         int error;
1978
1979         error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1980         if (error)
1981                 return error;
1982
1983         down_write(&ip->i_rw_mutex);
1984
1985         error = gfs2_meta_inode_buffer(ip, &dibh);
1986         if (error)
1987                 goto out;
1988
1989         if (!i_size_read(&ip->i_inode)) {
1990                 ip->i_height = 0;
1991                 ip->i_goal = ip->i_no_addr;
1992                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1993                 gfs2_ordered_del_inode(ip);
1994         }
1995         inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1996         ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1997
1998         gfs2_trans_add_meta(ip->i_gl, dibh);
1999         gfs2_dinode_out(ip, dibh->b_data);
2000         brelse(dibh);
2001
2002 out:
2003         up_write(&ip->i_rw_mutex);
2004         gfs2_trans_end(sdp);
2005         return error;
2006 }
2007
2008 /**
2009  * do_shrink - make a file smaller
2010  * @inode: the inode
2011  * @newsize: the size to make the file
2012  *
2013  * Called with an exclusive lock on @inode. The @size must
2014  * be equal to or smaller than the current inode size.
2015  *
2016  * Returns: errno
2017  */
2018
2019 static int do_shrink(struct inode *inode, u64 newsize)
2020 {
2021         struct gfs2_inode *ip = GFS2_I(inode);
2022         int error;
2023
2024         error = trunc_start(inode, newsize);
2025         if (error < 0)
2026                 return error;
2027         if (gfs2_is_stuffed(ip))
2028                 return 0;
2029
2030         error = punch_hole(ip, newsize, 0);
2031         if (error == 0)
2032                 error = trunc_end(ip);
2033
2034         return error;
2035 }
2036
2037 /**
2038  * do_grow - Touch and update inode size
2039  * @inode: The inode
2040  * @size: The new size
2041  *
2042  * This function updates the timestamps on the inode and
2043  * may also increase the size of the inode. This function
2044  * must not be called with @size any smaller than the current
2045  * inode size.
2046  *
2047  * Although it is not strictly required to unstuff files here,
2048  * earlier versions of GFS2 have a bug in the stuffed file reading
2049  * code which will result in a buffer overrun if the size is larger
2050  * than the max stuffed file size. In order to prevent this from
2051  * occurring, such files are unstuffed, but in other cases we can
2052  * just update the inode size directly.
2053  *
2054  * Returns: 0 on success, or -ve on error
2055  */
2056
2057 static int do_grow(struct inode *inode, u64 size)
2058 {
2059         struct gfs2_inode *ip = GFS2_I(inode);
2060         struct gfs2_sbd *sdp = GFS2_SB(inode);
2061         struct gfs2_alloc_parms ap = { .target = 1, };
2062         struct buffer_head *dibh;
2063         int error;
2064         int unstuff = 0;
2065
2066         if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2067                 error = gfs2_quota_lock_check(ip, &ap);
2068                 if (error)
2069                         return error;
2070
2071                 error = gfs2_inplace_reserve(ip, &ap);
2072                 if (error)
2073                         goto do_grow_qunlock;
2074                 unstuff = 1;
2075         }
2076
2077         error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2078                                  (unstuff &&
2079                                   gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2080                                  (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2081                                   0 : RES_QUOTA), 0);
2082         if (error)
2083                 goto do_grow_release;
2084
2085         if (unstuff) {
2086                 error = gfs2_unstuff_dinode(ip);
2087                 if (error)
2088                         goto do_end_trans;
2089         }
2090
2091         error = gfs2_meta_inode_buffer(ip, &dibh);
2092         if (error)
2093                 goto do_end_trans;
2094
2095         truncate_setsize(inode, size);
2096         inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
2097         gfs2_trans_add_meta(ip->i_gl, dibh);
2098         gfs2_dinode_out(ip, dibh->b_data);
2099         brelse(dibh);
2100
2101 do_end_trans:
2102         gfs2_trans_end(sdp);
2103 do_grow_release:
2104         if (unstuff) {
2105                 gfs2_inplace_release(ip);
2106 do_grow_qunlock:
2107                 gfs2_quota_unlock(ip);
2108         }
2109         return error;
2110 }
2111
2112 /**
2113  * gfs2_setattr_size - make a file a given size
2114  * @inode: the inode
2115  * @newsize: the size to make the file
2116  *
2117  * The file size can grow, shrink, or stay the same size. This
2118  * is called holding i_rwsem and an exclusive glock on the inode
2119  * in question.
2120  *
2121  * Returns: errno
2122  */
2123
2124 int gfs2_setattr_size(struct inode *inode, u64 newsize)
2125 {
2126         struct gfs2_inode *ip = GFS2_I(inode);
2127         int ret;
2128
2129         BUG_ON(!S_ISREG(inode->i_mode));
2130
2131         ret = inode_newsize_ok(inode, newsize);
2132         if (ret)
2133                 return ret;
2134
2135         inode_dio_wait(inode);
2136
2137         ret = gfs2_qa_get(ip);
2138         if (ret)
2139                 goto out;
2140
2141         if (newsize >= inode->i_size) {
2142                 ret = do_grow(inode, newsize);
2143                 goto out;
2144         }
2145
2146         ret = do_shrink(inode, newsize);
2147 out:
2148         gfs2_rs_delete(ip);
2149         gfs2_qa_put(ip);
2150         return ret;
2151 }
2152
2153 int gfs2_truncatei_resume(struct gfs2_inode *ip)
2154 {
2155         int error;
2156         error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2157         if (!error)
2158                 error = trunc_end(ip);
2159         return error;
2160 }
2161
2162 int gfs2_file_dealloc(struct gfs2_inode *ip)
2163 {
2164         return punch_hole(ip, 0, 0);
2165 }
2166
2167 /**
2168  * gfs2_free_journal_extents - Free cached journal bmap info
2169  * @jd: The journal
2170  *
2171  */
2172
2173 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2174 {
2175         struct gfs2_journal_extent *jext;
2176
2177         while(!list_empty(&jd->extent_list)) {
2178                 jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2179                 list_del(&jext->list);
2180                 kfree(jext);
2181         }
2182 }
2183
2184 /**
2185  * gfs2_add_jextent - Add or merge a new extent to extent cache
2186  * @jd: The journal descriptor
2187  * @lblock: The logical block at start of new extent
2188  * @dblock: The physical block at start of new extent
2189  * @blocks: Size of extent in fs blocks
2190  *
2191  * Returns: 0 on success or -ENOMEM
2192  */
2193
2194 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2195 {
2196         struct gfs2_journal_extent *jext;
2197
2198         if (!list_empty(&jd->extent_list)) {
2199                 jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2200                 if ((jext->dblock + jext->blocks) == dblock) {
2201                         jext->blocks += blocks;
2202                         return 0;
2203                 }
2204         }
2205
2206         jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2207         if (jext == NULL)
2208                 return -ENOMEM;
2209         jext->dblock = dblock;
2210         jext->lblock = lblock;
2211         jext->blocks = blocks;
2212         list_add_tail(&jext->list, &jd->extent_list);
2213         jd->nr_extents++;
2214         return 0;
2215 }
2216
2217 /**
2218  * gfs2_map_journal_extents - Cache journal bmap info
2219  * @sdp: The super block
2220  * @jd: The journal to map
2221  *
2222  * Create a reusable "extent" mapping from all logical
2223  * blocks to all physical blocks for the given journal.  This will save
2224  * us time when writing journal blocks.  Most journals will have only one
2225  * extent that maps all their logical blocks.  That's because gfs2.mkfs
2226  * arranges the journal blocks sequentially to maximize performance.
2227  * So the extent would map the first block for the entire file length.
2228  * However, gfs2_jadd can happen while file activity is happening, so
2229  * those journals may not be sequential.  Less likely is the case where
2230  * the users created their own journals by mounting the metafs and
2231  * laying it out.  But it's still possible.  These journals might have
2232  * several extents.
2233  *
2234  * Returns: 0 on success, or error on failure
2235  */
2236
2237 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2238 {
2239         u64 lblock = 0;
2240         u64 lblock_stop;
2241         struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2242         struct buffer_head bh;
2243         unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2244         u64 size;
2245         int rc;
2246         ktime_t start, end;
2247
2248         start = ktime_get();
2249         lblock_stop = i_size_read(jd->jd_inode) >> shift;
2250         size = (lblock_stop - lblock) << shift;
2251         jd->nr_extents = 0;
2252         WARN_ON(!list_empty(&jd->extent_list));
2253
2254         do {
2255                 bh.b_state = 0;
2256                 bh.b_blocknr = 0;
2257                 bh.b_size = size;
2258                 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2259                 if (rc || !buffer_mapped(&bh))
2260                         goto fail;
2261                 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2262                 if (rc)
2263                         goto fail;
2264                 size -= bh.b_size;
2265                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2266         } while(size > 0);
2267
2268         end = ktime_get();
2269         fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2270                 jd->nr_extents, ktime_ms_delta(end, start));
2271         return 0;
2272
2273 fail:
2274         fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2275                 rc, jd->jd_jid,
2276                 (unsigned long long)(i_size_read(jd->jd_inode) - size),
2277                 jd->nr_extents);
2278         fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2279                 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2280                 bh.b_state, (unsigned long long)bh.b_size);
2281         gfs2_free_journal_extents(jd);
2282         return rc;
2283 }
2284
2285 /**
2286  * gfs2_write_alloc_required - figure out if a write will require an allocation
2287  * @ip: the file being written to
2288  * @offset: the offset to write to
2289  * @len: the number of bytes being written
2290  *
2291  * Returns: 1 if an alloc is required, 0 otherwise
2292  */
2293
2294 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2295                               unsigned int len)
2296 {
2297         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2298         struct buffer_head bh;
2299         unsigned int shift;
2300         u64 lblock, lblock_stop, size;
2301         u64 end_of_file;
2302
2303         if (!len)
2304                 return 0;
2305
2306         if (gfs2_is_stuffed(ip)) {
2307                 if (offset + len > gfs2_max_stuffed_size(ip))
2308                         return 1;
2309                 return 0;
2310         }
2311
2312         shift = sdp->sd_sb.sb_bsize_shift;
2313         BUG_ON(gfs2_is_dir(ip));
2314         end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2315         lblock = offset >> shift;
2316         lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2317         if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2318                 return 1;
2319
2320         size = (lblock_stop - lblock) << shift;
2321         do {
2322                 bh.b_state = 0;
2323                 bh.b_size = size;
2324                 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2325                 if (!buffer_mapped(&bh))
2326                         return 1;
2327                 size -= bh.b_size;
2328                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2329         } while(size > 0);
2330
2331         return 0;
2332 }
2333
2334 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2335 {
2336         struct gfs2_inode *ip = GFS2_I(inode);
2337         struct buffer_head *dibh;
2338         int error;
2339
2340         if (offset >= inode->i_size)
2341                 return 0;
2342         if (offset + length > inode->i_size)
2343                 length = inode->i_size - offset;
2344
2345         error = gfs2_meta_inode_buffer(ip, &dibh);
2346         if (error)
2347                 return error;
2348         gfs2_trans_add_meta(ip->i_gl, dibh);
2349         memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2350                length);
2351         brelse(dibh);
2352         return 0;
2353 }
2354
2355 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2356                                          loff_t length)
2357 {
2358         struct gfs2_sbd *sdp = GFS2_SB(inode);
2359         loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2360         int error;
2361
2362         while (length) {
2363                 struct gfs2_trans *tr;
2364                 loff_t chunk;
2365                 unsigned int offs;
2366
2367                 chunk = length;
2368                 if (chunk > max_chunk)
2369                         chunk = max_chunk;
2370
2371                 offs = offset & ~PAGE_MASK;
2372                 if (offs && chunk > PAGE_SIZE)
2373                         chunk = offs + ((chunk - offs) & PAGE_MASK);
2374
2375                 truncate_pagecache_range(inode, offset, chunk);
2376                 offset += chunk;
2377                 length -= chunk;
2378
2379                 tr = current->journal_info;
2380                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2381                         continue;
2382
2383                 gfs2_trans_end(sdp);
2384                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2385                 if (error)
2386                         return error;
2387         }
2388         return 0;
2389 }
2390
2391 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2392 {
2393         struct inode *inode = file_inode(file);
2394         struct gfs2_inode *ip = GFS2_I(inode);
2395         struct gfs2_sbd *sdp = GFS2_SB(inode);
2396         unsigned int blocksize = i_blocksize(inode);
2397         loff_t start, end;
2398         int error;
2399
2400         if (!gfs2_is_stuffed(ip)) {
2401                 unsigned int start_off, end_len;
2402
2403                 start_off = offset & (blocksize - 1);
2404                 end_len = (offset + length) & (blocksize - 1);
2405                 if (start_off) {
2406                         unsigned int len = length;
2407                         if (length > blocksize - start_off)
2408                                 len = blocksize - start_off;
2409                         error = gfs2_block_zero_range(inode, offset, len);
2410                         if (error)
2411                                 goto out;
2412                         if (start_off + length < blocksize)
2413                                 end_len = 0;
2414                 }
2415                 if (end_len) {
2416                         error = gfs2_block_zero_range(inode,
2417                                 offset + length - end_len, end_len);
2418                         if (error)
2419                                 goto out;
2420                 }
2421         }
2422
2423         start = round_down(offset, blocksize);
2424         end = round_up(offset + length, blocksize) - 1;
2425         error = filemap_write_and_wait_range(inode->i_mapping, start, end);
2426         if (error)
2427                 return error;
2428
2429         if (gfs2_is_jdata(ip))
2430                 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2431                                          GFS2_JTRUNC_REVOKES);
2432         else
2433                 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2434         if (error)
2435                 return error;
2436
2437         if (gfs2_is_stuffed(ip)) {
2438                 error = stuffed_zero_range(inode, offset, length);
2439                 if (error)
2440                         goto out;
2441         }
2442
2443         if (gfs2_is_jdata(ip)) {
2444                 BUG_ON(!current->journal_info);
2445                 gfs2_journaled_truncate_range(inode, offset, length);
2446         } else
2447                 truncate_pagecache_range(inode, offset, offset + length - 1);
2448
2449         file_update_time(file);
2450         mark_inode_dirty(inode);
2451
2452         if (current->journal_info)
2453                 gfs2_trans_end(sdp);
2454
2455         if (!gfs2_is_stuffed(ip))
2456                 error = punch_hole(ip, offset, length);
2457
2458 out:
2459         if (current->journal_info)
2460                 gfs2_trans_end(sdp);
2461         return error;
2462 }
2463
2464 static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
2465                 loff_t offset)
2466 {
2467         int ret;
2468
2469         if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
2470                 return -EIO;
2471
2472         if (offset >= wpc->iomap.offset &&
2473             offset < wpc->iomap.offset + wpc->iomap.length)
2474                 return 0;
2475
2476         memset(&wpc->iomap, 0, sizeof(wpc->iomap));
2477         ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
2478         return ret;
2479 }
2480
2481 const struct iomap_writeback_ops gfs2_writeback_ops = {
2482         .map_blocks             = gfs2_map_blocks,
2483 };