1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
7 #include <linux/spinlock.h>
8 #include <linux/completion.h>
9 #include <linux/buffer_head.h>
10 #include <linux/blkdev.h>
11 #include <linux/gfs2_ondisk.h>
12 #include <linux/crc32.h>
13 #include <linux/iomap.h>
14 #include <linux/ktime.h>
30 #include "trace_gfs2.h"
32 /* This doesn't need to be that large as max 64 bit pointers in a 4k
33 * block is 512, so __u16 is fine for that. It saves stack space to
/* NOTE(review): the "struct metapath {" opener is not visible in this fragment. */
37 struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; /* cached buffer at each tree level */
38 __u16 mp_list[GFS2_MAX_META_HEIGHT]; /* pointer index within each level's block */
39 int mp_fheight; /* find_metapath height */
40 int mp_aheight; /* actual height (lookup height) */
43 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
46 * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio
 * @ip: the inode being unstuffed
48 * @dibh: the dinode buffer
49 * @block: the block number that was allocated
 * @folio: the folio that will hold the former inline data
 *
 * Copies the inline data that follows the on-disk dinode header into
 * @folio, zero-fills the remainder, and marks the folio dirty.  For
 * jdata inodes the folio's buffer is mapped to @block and added to the
 * current transaction; otherwise the inode goes on the ordered-write
 * list.  NOTE(review): intermediate lines are missing from this
 * fragment, so the error paths are not visible here.
 */
54 static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh,
55 u64 block, struct folio *folio)
57 struct inode *inode = &ip->i_inode;
59 if (!folio_test_uptodate(folio)) {
60 void *kaddr = kmap_local_folio(folio, 0);
61 u64 dsize = i_size_read(inode);
 /* Inline data lives immediately after the dinode header. */
63 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
64 memset(kaddr + dsize, 0, folio_size(folio) - dsize);
67 folio_mark_uptodate(folio);
70 if (gfs2_is_jdata(ip)) {
71 struct buffer_head *bh = folio_buffers(folio);
 /* No buffers yet: attach uptodate buffers sized to the fs block. */
74 bh = create_empty_buffers(folio,
75 BIT(inode->i_blkbits), BIT(BH_Uptodate));
77 if (!buffer_mapped(bh))
78 map_bh(bh, inode->i_sb, block);
80 set_buffer_uptodate(bh);
81 gfs2_trans_add_data(ip->i_gl, bh); /* journal the data buffer */
83 folio_mark_dirty(folio);
84 gfs2_ordered_add_inode(ip); /* non-jdata: ordered-write list */
/*
 * __gfs2_unstuff_inode - move a stuffed inode's inline data into a real block
 * @ip: the inode to unstuff
 * @folio: page-cache folio backing block 0 of the file
 *
 * If the file is non-empty, allocates one block, copies the inline data
 * into it (via gfs2_dir_get_new_buffer for directories or
 * gfs2_unstuffer_folio for regular files -- the isdir branch lines are
 * missing from this fragment), then clears the dinode tail, points the
 * first dinode pointer at the new block and sets the tree height to 1.
 */
90 static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio)
92 struct buffer_head *bh, *dibh;
93 struct gfs2_dinode *di;
95 int isdir = gfs2_is_dir(ip);
98 error = gfs2_meta_inode_buffer(ip, &dibh);
102 if (i_size_read(&ip->i_inode)) {
103 /* Get a free block, fill it with the stuffed data,
104 and write it out to disk */
107 error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
111 gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
112 error = gfs2_dir_get_new_buffer(ip, block, &bh);
 /* Copy everything past the dinode header into the new block. */
115 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
116 dibh, sizeof(struct gfs2_dinode));
119 error = gfs2_unstuffer_folio(ip, dibh, block, folio);
125 /* Set up the pointer to the new block */
127 gfs2_trans_add_meta(ip->i_gl, dibh);
128 di = (struct gfs2_dinode *)dibh->b_data;
129 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
131 if (i_size_read(&ip->i_inode)) {
 /* First pointer slot follows the dinode header. */
132 *(__be64 *)(di + 1) = cpu_to_be64(block);
133 gfs2_add_inode_blocks(&ip->i_inode, 1);
134 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
138 di->di_height = cpu_to_be16(1); /* tree now has one level of indirection */
146 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
147 * @ip: The GFS2 inode to unstuff
 *
149 * This routine unstuffs a dinode and returns it to a "normal" state such
150 * that the height can be grown in the traditional way.
 *
 * Takes i_rw_mutex for writing, grabs folio 0 of the inode's mapping and
 * delegates to __gfs2_unstuff_inode.  The folio unlock/put and error
 * checks between the visible lines are missing from this fragment.
 */
155 int gfs2_unstuff_dinode(struct gfs2_inode *ip)
157 struct inode *inode = &ip->i_inode;
161 down_write(&ip->i_rw_mutex);
162 folio = filemap_grab_folio(inode->i_mapping, 0);
163 error = PTR_ERR(folio); /* filemap_grab_folio returns ERR_PTR on failure */
166 error = __gfs2_unstuff_inode(ip, folio);
170 up_write(&ip->i_rw_mutex);
175 * find_metapath - Find path through the metadata tree
176 * @sdp: The superblock
177 * @block: The disk block to look up
178 * @mp: The metapath to return the result in
179 * @height: The pre-calculated height of the metadata tree
 *
181 * This routine returns a struct metapath structure that defines a path
182 * through the metadata of inode "ip" to get to block "block".
 *
185 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
186 * filesystem with a blocksize of 4096.
 *
188 * find_metapath() would return a struct metapath structure set to:
189 * mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 *
191 * That means that in order to get to the block containing the byte at
192 * offset 101342453, we would load the indirect block pointed to by pointer
193 * 0 in the dinode. We would then load the indirect block pointed to by
194 * pointer 48 in that indirect block. We would then load the data block
195 * pointed to by pointer 165 in that indirect block.
 *
 * (The ASCII diagram below is partially garbled by missing lines.)
197 * ----------------------------------------
202 * ----------------------------------------
206 * ----------------------------------------
210 * |0 5 6 7 8 9 0 1 2|
211 * ----------------------------------------
215 * ----------------------------------------
220 * ----------------------------------------
224 * ----------------------------------------
225 * | Data block containing offset |
229 * ----------------------------------------
 */
233 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
234 struct metapath *mp, unsigned int height)
238 mp->mp_fheight = height;
 /* Peel off one pointer index per level, lowest level first. */
239 for (i = height; i--;)
240 mp->mp_list[i] = do_div(block, sdp->sd_inptrs); /* do_div: block /= inptrs, returns remainder */
/*
 * metapath_branch_start - height at which a new branch of the tree starts
 * NOTE(review): the return statements are not visible in this fragment;
 * only the mp_list[0] == 0 test survives.  Confirm against full source.
 */
243 static inline unsigned int metapath_branch_start(const struct metapath *mp)
245 if (mp->mp_list[0] == 0)
251 * metaptr1 - Return the first possible metadata pointer in a metapath buffer
252 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 */
255 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
257 struct buffer_head *bh = mp->mp_bh[height];
 /* NOTE(review): the height test selecting between these two returns
  * (dinode header vs. meta header offset) is missing from this fragment. */
259 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
260 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
264 * metapointer - Return pointer to start of metadata in a buffer
265 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 *
268 * Return a pointer to the block number of the next height of the metadata
269 * tree given a buffer containing the pointer to the current height of the
 * metadata tree (i.e. the slot selected by mp_list[height]).
 */
273 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
275 __be64 *p = metaptr1(height, mp);
276 return p + mp->mp_list[height];
/* metaend - one-past-the-end of the pointers in the buffer at @height. */
279 static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
281 const struct buffer_head *bh = mp->mp_bh[height];
282 return (const __be64 *)(bh->b_data + bh->b_size);
/*
 * clone_metapath - duplicate a metapath, taking a reference on each buffer
 * The struct copy itself is on a missing line; only the get_bh() loop
 * over the allocated heights is visible here.
 */
285 static void clone_metapath(struct metapath *clone, struct metapath *mp)
290 for (hgt = 0; hgt < mp->mp_aheight; hgt++)
291 get_bh(clone->mp_bh[hgt]);
/*
 * gfs2_metapath_ra - issue readahead for a range of indirect pointers
 * @gl: the inode glock (used to get buffers)
 * @start: first pointer to read ahead
 * @end: one past the last pointer
 *
 * Best-effort: buffers that are already locked or uptodate are skipped
 * (the skip/unlock/brelse lines are missing from this fragment).
 */
294 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
298 for (t = start; t < end; t++) {
299 struct buffer_head *rabh;
304 rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
305 if (trylock_buffer(rabh)) {
306 if (!buffer_uptodate(rabh)) {
307 rabh->b_end_io = end_buffer_read_sync;
 /* Async read-ahead: REQ_RAHEAD may be dropped by the block layer. */
308 submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
/*
 * __fillup_metapath - read indirect blocks from height @x down toward @h
 * Follows the on-disk pointer at each level and reads the next indirect
 * block into mp->mp_bh[].  Updates mp_aheight to the deepest level
 * actually reached.  The surrounding loop/termination lines are missing
 * from this fragment.
 */
318 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
319 unsigned int x, unsigned int h)
322 __be64 *ptr = metapointer(x, mp);
323 u64 dblock = be64_to_cpu(*ptr);
328 ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]);
332 mp->mp_aheight = x + 1;
337 * lookup_metapath - Walk the metadata tree to a specific point
 * @ip: The inode
 * @mp: The metapath
 *
341 * Assumes that the inode's buffer has already been looked up and
342 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
343 * by find_metapath().
 *
345 * If this function encounters part of the tree which has not been
346 * allocated, it returns the current height of the tree at the point
347 * at which it found the unallocated block. Blocks which are found are
348 * added to the mp->mp_bh[] list.
 */
353 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
355 return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
359 * fillup_metapath - fill up buffers for the metadata path to a specific height
 * @ip: The inode
 * @mp: The metapath
362 * @h: The height to which it should be mapped
 *
364 * Similar to lookup_metapath, but does lookups for a range of heights
 * starting from the first missing buffer rather than from the dinode.
 *
366 * Returns: error or the number of buffers filled
 */
369 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
375 /* find the first buffer we need to look up. */
376 for (x = h - 1; x > 0; x--) {
 /* (the mp_bh[x] presence test inside this loop is missing here) */
381 ret = __fillup_metapath(ip, mp, x, h);
384 return mp->mp_aheight - x - 1;
/*
 * metapath_to_block - convert a metapath back into a logical block number
 * Inverse of find_metapath for the allocated part of the path: sums
 * mp_list[hgt] * inptrs^(fheight-1-hgt) over the levels below aheight.
 */
387 static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
389 sector_t factor = 1, block = 0;
392 for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
393 if (hgt < mp->mp_aheight)
394 block += mp->mp_list[hgt] * factor;
395 factor *= sdp->sd_inptrs;
/*
 * release_metapath - drop all buffer references held by a metapath
 * Safe on a partially-filled path; stops at the first NULL slot.
 */
400 static void release_metapath(struct metapath *mp)
404 for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
405 if (mp->mp_bh[i] == NULL)
407 brelse(mp->mp_bh[i]);
413 * gfs2_extent_length - Returns length of an extent of blocks
414 * @bh: The metadata block
415 * @ptr: Current position in @bh
416 * @limit: Max extent length to return
417 * @eob: Set to 1 if we hit "end of block"
 *
419 * Returns: The length of the extent (minimum of one block)
 *
 * Scans consecutive pointers for block numbers that increase by one per
 * slot (the per-iteration increment/limit checks are on missing lines).
 */
422 static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
424 const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
425 const __be64 *first = ptr;
426 u64 d = be64_to_cpu(*ptr); /* expected next physical block */
434 } while(be64_to_cpu(*ptr) == d);
/* Result of one gfs2_metadata_walker invocation: stop the walk, descend
 * into an indirect block, or keep scanning at the current level. */
440 enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };
 
443 * gfs2_metadata_walker - walk an indirect block
444 * @mp: Metapath to indirect block
445 * @ptrs: Number of pointers to look at
 *
447 * When returning WALK_FOLLOW, the walker must update @mp to point at the right
448 * indirect block to follow.
 */
450 typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
454 * gfs2_walk_metadata - walk a tree of indirect blocks
 * @inode: The inode
456 * @mp: Starting point of walk
457 * @max_len: Maximum number of blocks to walk
458 * @walker: Called during the walk
 *
460 * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
461 * past the end of metadata, and a negative error code otherwise.
 *
 * NOTE(review): the main loop structure (labels, some break/continue and
 * height-change bookkeeping) is partially missing from this fragment.
 */
464 static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
465 u64 max_len, gfs2_metadata_walker walker)
467 struct gfs2_inode *ip = GFS2_I(inode);
468 struct gfs2_sbd *sdp = GFS2_SB(inode);
 
474 * The walk starts in the lowest allocated indirect block, which may be
475 * before the position indicated by @mp. Adjust @max_len accordingly
476 * to avoid a short walk.
 
478 for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
479 max_len += mp->mp_list[hgt] * factor;
480 mp->mp_list[hgt] = 0;
481 factor *= sdp->sd_inptrs;
 
485 u16 start = mp->mp_list[hgt];
486 enum walker_status status;
490 /* Walk indirect block. */
 /* Level 0 is the dinode (sd_diptrs slots); deeper levels hold sd_inptrs. */
491 ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
494 ptrs = DIV_ROUND_UP_ULL(max_len, factor); /* don't walk past max_len */
495 status = walker(mp, ptrs);
 /* Walker advanced mp_list but must not claim full height here. */
500 BUG_ON(mp->mp_aheight == mp->mp_fheight);
501 ptrs = mp->mp_list[hgt] - start; /* blocks consumed by the walker */
510 if (status == WALK_FOLLOW)
511 goto fill_up_metapath;
 
514 /* Decrease height of metapath. */
515 brelse(mp->mp_bh[hgt]);
516 mp->mp_bh[hgt] = NULL;
517 mp->mp_list[hgt] = 0;
521 factor *= sdp->sd_inptrs;
 
523 /* Advance in metadata tree. */
524 (mp->mp_list[hgt])++;
526 if (mp->mp_list[hgt] >= sdp->sd_inptrs)
529 if (mp->mp_list[hgt] >= sdp->sd_diptrs)
 
534 /* Increase height of metapath. */
535 ret = fillup_metapath(ip, mp, ip->i_height - 1);
540 do_div(factor, sdp->sd_inptrs);
541 mp->mp_aheight = hgt + 1;
/*
 * gfs2_hole_walker - walker callback that stops at the first allocated block
 * Scans the bottom allocated indirect block for a non-zero pointer; when
 * one is found at full height the walk stops (WALK_STOP / WALK_FOLLOW
 * lines are missing from this fragment), otherwise the scan continues.
 */
546 static enum walker_status gfs2_hole_walker(struct metapath *mp,
549 const __be64 *start, *ptr, *end;
552 hgt = mp->mp_aheight - 1;
553 start = metapointer(hgt, mp);
556 for (ptr = start; ptr < end; ptr++) {
558 mp->mp_list[hgt] += ptr - start; /* record where the hole ended */
559 if (mp->mp_aheight == mp->mp_fheight)
564 return WALK_CONTINUE;
568 * gfs2_hole_size - figure out the size of a hole
 * @inode: The inode
570 * @lblock: The logical starting block number
571 * @len: How far to look (in blocks)
572 * @mp: The metapath at lblock
573 * @iomap: The iomap to store the hole size in
 *
575 * This function modifies @mp.
 *
577 * Returns: errno on error
 */
579 static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
580 struct metapath *mp, struct iomap *iomap)
582 struct metapath clone;
 /* Walk a clone so the caller's metapath stays positioned at lblock. */
586 clone_metapath(&clone, mp);
587 ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
592 hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
595 iomap->length = hole_size << inode->i_blkbits;
599 release_metapath(&clone);
/*
 * gfs2_indirect_init - create and initialise a new indirect block
 * @mp: the metapath; mp_bh[i] receives the new buffer
 * @gl: the inode glock
 * @i: height of the new block (>= 1; parent at i - 1)
 * @offset: pointer slot in the parent to link through (see *ptr below;
 *          the "+ offset" term appears to be on a missing line -- confirm)
 * @bn: disk block number for the new indirect block
 */
603 static inline void gfs2_indirect_init(struct metapath *mp,
604 struct gfs2_glock *gl, unsigned int i,
605 unsigned offset, u64 bn)
 /* Parent's first pointer slot: dinode header at height 1, meta header deeper. */
607 __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
608 ((i > 1) ? sizeof(struct gfs2_meta_header) :
609 sizeof(struct gfs2_dinode)));
611 BUG_ON(mp->mp_bh[i] != NULL);
612 mp->mp_bh[i] = gfs2_meta_new(gl, bn);
613 gfs2_trans_add_meta(gl, mp->mp_bh[i]);
614 gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
615 gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
617 *ptr = cpu_to_be64(bn); /* link the new block into its parent */
/* States for the __gfs2_iomap_alloc state machine; the enum opener (and
 * the ALLOC_DATA constant it presumably declares) is on missing lines. */
622 ALLOC_GROW_DEPTH = 1,
623 ALLOC_GROW_HEIGHT = 2,
624 /* ALLOC_UNSTUFF = 3, TBD and rather complicated */
628 * __gfs2_iomap_alloc - Build a metadata tree of the requested height
629 * @inode: The GFS2 inode
630 * @iomap: The iomap structure
631 * @mp: The metapath, with proper height information calculated
 *
633 * In this routine we may have to alloc:
634 * i) Indirect blocks to grow the metadata tree height
635 * ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
638 * This function is called after __gfs2_iomap_get, which works out the
639 * total number of blocks which we need via gfs2_alloc_size.
 *
641 * We then do the actual allocation asking for an extent at a time (if
642 * enough contiguous free blocks are available, there will only be one
643 * allocation request per call) and uses the state machine to initialise
644 * the blocks in order.
 *
646 * Right now, this function will allocate at most one indirect block
647 * worth of data -- with a default block size of 4K, that's slightly
648 * less than 2M. If this limitation is ever removed to allow huge
649 * allocations, we would probably still want to limit the iomap size we
650 * return to avoid stalling other tasks during huge writes; the next
651 * iomap iteration would then find the blocks already allocated.
 *
653 * Returns: errno on error
 *
 * NOTE(review): the switch statement wrapper, several loop bodies and
 * error paths are on missing lines in this fragment.
 */
656 static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
659 struct gfs2_inode *ip = GFS2_I(inode);
660 struct gfs2_sbd *sdp = GFS2_SB(inode);
661 struct buffer_head *dibh = mp->mp_bh[0];
663 unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
664 size_t dblks = iomap->length >> inode->i_blkbits; /* data blocks wanted */
665 const unsigned end_of_metadata = mp->mp_fheight - 1;
667 enum alloc_state state;
671 BUG_ON(mp->mp_aheight < 1);
672 BUG_ON(dibh == NULL);
675 gfs2_trans_add_meta(ip->i_gl, dibh);
677 down_write(&ip->i_rw_mutex);
679 if (mp->mp_fheight == mp->mp_aheight) {
680 /* Bottom indirect block exists */
683 /* Need to allocate indirect blocks */
684 if (mp->mp_fheight == ip->i_height) {
685 /* Writing into existing tree, extend tree down */
686 iblks = mp->mp_fheight - mp->mp_aheight;
687 state = ALLOC_GROW_DEPTH;
689 /* Building up tree height */
690 state = ALLOC_GROW_HEIGHT;
691 iblks = mp->mp_fheight - ip->i_height;
692 branch_start = metapath_branch_start(mp);
693 iblks += (mp->mp_fheight - branch_start);
 
697 /* start of the second part of the function (state machine) */
699 blks = dblks + iblks; /* total blocks to allocate */
703 ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
 /* Metadata (and jdata data) blocks need their revokes cancelled. */
707 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
708 gfs2_trans_remove_revoke(sdp, bn, n);
710 /* Growing height of tree */
711 case ALLOC_GROW_HEIGHT:
713 ptr = (__be64 *)(dibh->b_data +
714 sizeof(struct gfs2_dinode));
717 for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
719 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
720 if (i - 1 == mp->mp_fheight - ip->i_height) {
 /* Top of the new tree: move the old top down into it. */
722 gfs2_buffer_copy_tail(mp->mp_bh[i],
723 sizeof(struct gfs2_meta_header),
724 dibh, sizeof(struct gfs2_dinode));
725 gfs2_buffer_clear_tail(dibh,
726 sizeof(struct gfs2_dinode) +
728 ptr = (__be64 *)(mp->mp_bh[i]->b_data +
729 sizeof(struct gfs2_meta_header));
731 state = ALLOC_GROW_DEPTH;
732 for(i = branch_start; i < mp->mp_fheight; i++) {
733 if (mp->mp_bh[i] == NULL)
735 brelse(mp->mp_bh[i]);
742 fallthrough; /* To branching from existing tree */
743 case ALLOC_GROW_DEPTH:
744 if (i > 1 && i < mp->mp_fheight)
745 gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
746 for (; i < mp->mp_fheight && n > 0; i++, n--)
747 gfs2_indirect_init(mp, ip->i_gl, i,
748 mp->mp_list[i-1], bn++);
749 if (i == mp->mp_fheight)
753 fallthrough; /* To tree complete, adding data blocks */
756 BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
757 gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
759 ptr = metapointer(end_of_metadata, mp);
760 iomap->addr = bn << inode->i_blkbits;
761 iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
763 *ptr++ = cpu_to_be64(bn++); /* fill data pointers in the bottom block */
766 } while (iomap->addr == IOMAP_NULL_ADDR);
 
768 iomap->type = IOMAP_MAPPED;
769 iomap->length = (u64)dblks << inode->i_blkbits;
770 ip->i_height = mp->mp_fheight;
771 gfs2_add_inode_blocks(&ip->i_inode, alloced);
772 gfs2_dinode_out(ip, dibh->b_data);
774 up_write(&ip->i_rw_mutex);
/* Private iomap flag: reading past this extent requires a metadata read. */
778 #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
 
781 * gfs2_alloc_size - Compute the maximum allocation size
 * @inode: The inode
 * @mp: The metapath
784 * @size: Requested size in blocks
 *
786 * Compute the maximum size of the next allocation at @mp.
 *
788 * Returns: size in blocks
 */
790 static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
792 struct gfs2_inode *ip = GFS2_I(inode);
793 struct gfs2_sbd *sdp = GFS2_SB(inode);
794 const __be64 *first, *ptr, *end;
 
797 * For writes to stuffed files, this function is called twice via
798 * __gfs2_iomap_get, before and after unstuffing. The size we return the
799 * first time needs to be large enough to get the reservation and
800 * allocation sizes right. The size we return the second time must
801 * be exact or else __gfs2_iomap_alloc won't do the right thing.
 
804 if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
 /* Limited by pointers remaining in the bottom-level block. */
805 unsigned int maxsize = mp->mp_fheight > 1 ?
806 sdp->sd_inptrs : sdp->sd_diptrs;
807 maxsize -= mp->mp_list[mp->mp_fheight - 1];
 
 /* Fully-allocated path: count unallocated slots at the bottom level. */
813 first = metapointer(ip->i_height - 1, mp);
814 end = metaend(ip->i_height - 1, mp);
815 if (end - first > size)
817 for (ptr = first; ptr < end; ptr++) {
825 * __gfs2_iomap_get - Map blocks from an inode to disk blocks
 * @inode: The inode
827 * @pos: Starting position in bytes
828 * @length: Length to map, in bytes
829 * @flags: iomap flags
830 * @iomap: The iomap structure
 * @mp: The metapath
 *
 * Returns: errno
 *
 * NOTE(review): several goto labels, unlock paths and the unstuffed_
 * size/height edge handling are on missing lines in this fragment.
 */
835 static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
836 unsigned flags, struct iomap *iomap,
839 struct gfs2_inode *ip = GFS2_I(inode);
840 struct gfs2_sbd *sdp = GFS2_SB(inode);
841 loff_t size = i_size_read(inode);
844 sector_t lblock_stop;
848 struct buffer_head *dibh = NULL, *bh;
854 down_read(&ip->i_rw_mutex);
856 ret = gfs2_meta_inode_buffer(ip, &dibh);
 
 /* Stuffed inode: data is inline in the dinode block. */
861 if (gfs2_is_stuffed(ip)) {
862 if (flags & IOMAP_WRITE) {
863 loff_t max_size = gfs2_max_stuffed_size(ip);
865 if (pos + length > max_size)
867 iomap->length = max_size;
870 if (flags & IOMAP_REPORT) {
875 iomap->length = length;
879 iomap->length = size;
 /* Inline data is addressed just past the on-disk dinode header. */
881 iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
882 sizeof(struct gfs2_dinode);
883 iomap->type = IOMAP_INLINE;
884 iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
 
 /* Unstuffed: translate the byte range to logical blocks. */
889 lblock = pos >> inode->i_blkbits;
890 iomap->offset = lblock << inode->i_blkbits;
891 lblock_stop = (pos + length - 1) >> inode->i_blkbits;
892 len = lblock_stop - lblock + 1;
893 iomap->length = len << inode->i_blkbits;
 
 /* Height needed to address lblock; may exceed the current tree. */
895 height = ip->i_height;
896 while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
898 find_metapath(sdp, lblock, mp, height);
899 if (height > ip->i_height || gfs2_is_stuffed(ip))
 
902 ret = lookup_metapath(ip, mp);
906 if (mp->mp_aheight != ip->i_height) /* hit an unallocated block */
 
909 ptr = metapointer(ip->i_height - 1, mp);
913 bh = mp->mp_bh[ip->i_height - 1];
914 len = gfs2_extent_length(bh, ptr, len, &eob);
 
916 iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
917 iomap->length = len << inode->i_blkbits;
918 iomap->type = IOMAP_MAPPED;
919 iomap->flags |= IOMAP_F_MERGED;
921 iomap->flags |= IOMAP_F_GFS2_BOUNDARY; /* extent ends at block boundary */
 
924 iomap->bdev = inode->i_sb->s_bdev;
926 up_read(&ip->i_rw_mutex);
 
 /* Hole handling, reached via missing goto labels above. */
930 if (flags & IOMAP_REPORT) {
933 else if (height == ip->i_height)
934 ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
936 iomap->length = size - iomap->offset;
937 } else if (flags & IOMAP_WRITE) {
940 if (flags & IOMAP_DIRECT)
941 goto out; /* (see gfs2_file_direct_write) */
943 len = gfs2_alloc_size(inode, mp, len);
944 alloc_size = len << inode->i_blkbits;
945 if (alloc_size < iomap->length)
946 iomap->length = alloc_size;
948 if (pos < size && height == ip->i_height)
949 ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
952 iomap->addr = IOMAP_NULL_ADDR;
953 iomap->type = IOMAP_HOLE;
/*
 * gfs2_iomap_get_folio - iomap folio_ops hook: start a transaction, get folio
 * Opens a transaction sized for the blocks this write will touch before
 * taking the folio; the folio-error/trans-end cleanup path is on missing
 * lines in this fragment.
 */
957 static struct folio *
958 gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
960 struct inode *inode = iter->inode;
961 unsigned int blockmask = i_blocksize(inode) - 1;
962 struct gfs2_sbd *sdp = GFS2_SB(inode);
 /* Number of fs blocks spanned by [pos, pos + len). */
967 blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
968 status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
970 return ERR_PTR(status);
972 folio = iomap_get_folio(iter, pos, len);
/*
 * gfs2_iomap_put_folio - iomap folio_ops hook: journal data, end transaction
 * Adds the copied range's buffers to the transaction (jdata), marks the
 * inode dirty if the transaction created buffers, and ends the
 * transaction begun in gfs2_iomap_get_folio.  The folio unlock/put and
 * gfs2_trans_end calls are on missing lines in this fragment.
 */
978 static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
979 unsigned copied, struct folio *folio)
981 struct gfs2_trans *tr = current->journal_info;
982 struct gfs2_inode *ip = GFS2_I(inode);
983 struct gfs2_sbd *sdp = GFS2_SB(inode);
985 if (!gfs2_is_stuffed(ip))
986 gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos),
992 if (tr->tr_num_buf_new)
993 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
/* Folio hooks used by buffered iomap writes on stuffed/jdata inodes. */
998 static const struct iomap_folio_ops gfs2_iomap_folio_ops = {
999 .get_folio = gfs2_iomap_get_folio,
1000 .put_folio = gfs2_iomap_put_folio,
/*
 * gfs2_iomap_begin_write - prepare a write mapping, allocating if needed
 * @inode: the inode being written
 * @pos: byte offset of the write
 * @length: byte length of the write
 * @flags: iomap flags
 * @iomap: mapping produced by __gfs2_iomap_get, updated here
 * @mp: metapath positioned by __gfs2_iomap_get
 *
 * For writes into a hole (or that must unstuff the inode), reserves
 * quota and rgrp space, starts a transaction, unstuffs and/or allocates
 * blocks, then tears the transaction down again (folio ops will start
 * their own).  Error-path labels and several checks sit on missing lines.
 */
1003 static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
1004 loff_t length, unsigned flags,
1005 struct iomap *iomap,
1006 struct metapath *mp)
1008 struct gfs2_inode *ip = GFS2_I(inode);
1009 struct gfs2_sbd *sdp = GFS2_SB(inode);
 /* Writing past the inline capacity forces an unstuff. */
1013 unstuff = gfs2_is_stuffed(ip) &&
1014 pos + length > gfs2_max_stuffed_size(ip);
1016 if (unstuff || iomap->type == IOMAP_HOLE) {
1017 unsigned int data_blocks, ind_blocks;
1018 struct gfs2_alloc_parms ap = {};
1019 unsigned int rblocks;
1020 struct gfs2_trans *tr;
1022 gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
1024 ap.target = data_blocks + ind_blocks;
1025 ret = gfs2_quota_lock_check(ip, &ap);
1029 ret = gfs2_inplace_reserve(ip, &ap);
 /* Size the transaction: dinode + indirect (+ data for jdata). */
1033 rblocks = RES_DINODE + ind_blocks;
1034 if (gfs2_is_jdata(ip))
1035 rblocks += data_blocks;
1036 if (ind_blocks || data_blocks)
1037 rblocks += RES_STATFS + RES_QUOTA;
1038 if (inode == sdp->sd_rindex)
1039 rblocks += 2 * RES_STATFS;
1040 rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
1042 ret = gfs2_trans_begin(sdp, rblocks,
1043 iomap->length >> inode->i_blkbits);
1045 goto out_trans_fail;
1048 ret = gfs2_unstuff_dinode(ip);
 /* Re-run the lookup: unstuffing changed the metadata tree. */
1051 release_metapath(mp);
1052 ret = __gfs2_iomap_get(inode, iomap->offset,
1053 iomap->length, flags, iomap, mp);
1058 if (iomap->type == IOMAP_HOLE) {
1059 ret = __gfs2_iomap_alloc(inode, iomap, mp);
 /* Allocation failed: undo reservation, punch any partial alloc. */
1061 gfs2_trans_end(sdp);
1062 gfs2_inplace_release(ip);
1063 punch_hole(ip, iomap->offset, iomap->length);
1068 tr = current->journal_info;
1069 if (tr->tr_num_buf_new)
1070 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1072 gfs2_trans_end(sdp);
 /* Buffered writes to stuffed/jdata inodes need per-folio transactions. */
1075 if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
1076 iomap->folio_ops = &gfs2_iomap_folio_ops;
1080 gfs2_trans_end(sdp);
1082 gfs2_inplace_release(ip);
1084 gfs2_quota_unlock(ip);
/*
 * gfs2_iomap_begin - iomap_ops->iomap_begin for GFS2
 * Maps [pos, pos + length) and, for write/zero requests that hit a hole
 * or stuffed data, delegates allocation to gfs2_iomap_begin_write.
 * Several case labels and goto targets are on missing lines here.
 */
1088 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
1089 unsigned flags, struct iomap *iomap,
1090 struct iomap *srcmap)
1092 struct gfs2_inode *ip = GFS2_I(inode);
1093 struct metapath mp = { .mp_aheight = 1, };
1096 if (gfs2_is_jdata(ip))
1097 iomap->flags |= IOMAP_F_BUFFER_HEAD; /* jdata path uses buffer heads */
1099 trace_gfs2_iomap_start(ip, pos, length, flags);
1100 ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
1104 switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1106 if (flags & IOMAP_DIRECT) {
 
1108 * Silently fall back to buffered I/O for stuffed files
1109 * or if we've got a hole (see gfs2_file_direct_write).
 
1111 if (iomap->type != IOMAP_MAPPED)
1117 if (iomap->type == IOMAP_HOLE)
1124 ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
1127 release_metapath(&mp);
1128 trace_gfs2_iomap_end(ip, iomap, ret);
/*
 * gfs2_iomap_end - iomap_ops->iomap_end for GFS2
 * Completes a write mapping: releases reservations/quota, trims blocks
 * allocated beyond what was actually written, and dirties the inode.
 * The early-return goto targets for non-write cases are on missing lines.
 */
1132 static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
1133 ssize_t written, unsigned flags, struct iomap *iomap)
1135 struct gfs2_inode *ip = GFS2_I(inode);
1136 struct gfs2_sbd *sdp = GFS2_SB(inode);
1138 switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1140 if (flags & IOMAP_DIRECT)
1144 if (iomap->type == IOMAP_HOLE)
1151 if (!gfs2_is_stuffed(ip))
1152 gfs2_ordered_add_inode(ip);
1154 if (inode == sdp->sd_rindex)
1155 adjust_fs_space(inode); /* rindex writes change fs statistics */
1157 gfs2_inplace_release(ip);
1159 if (ip->i_qadata && ip->i_qadata->qa_qd_num)
1160 gfs2_quota_unlock(ip);
 /* Short write into freshly allocated space: free the excess. */
1162 if (length != written && (iomap->flags & IOMAP_F_NEW)) {
1163 /* Deallocate blocks that were just allocated. */
1164 loff_t hstart = round_up(pos + written, i_blocksize(inode));
1165 loff_t hend = iomap->offset + iomap->length;
1167 if (hstart < hend) {
1168 truncate_pagecache_range(inode, hstart, hend - 1);
1169 punch_hole(ip, hstart, hend - hstart);
1173 if (unlikely(!written))
1176 if (iomap->flags & IOMAP_F_SIZE_CHANGED)
1177 mark_inode_dirty(inode);
1178 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); /* glock must flush on demote */
/* iomap operations used for GFS2 buffered and direct I/O. */
1182 const struct iomap_ops gfs2_iomap_ops = {
1183 .iomap_begin = gfs2_iomap_begin,
1184 .iomap_end = gfs2_iomap_end,
1188 * gfs2_block_map - Map one or more blocks of an inode to a disk block
 * @inode: The inode
1190 * @lblock: The logical block number
1191 * @bh_map: The bh to be mapped
1192 * @create: True if its ok to alloc blocks to satify the request
 *
1194 * The size of the requested mapping is defined in bh_map->b_size.
 *
1196 * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
1197 * when @lblock is not mapped. Sets buffer_mapped(bh_map) and
1198 * bh_map->b_size to indicate the size of the mapping when @lblock and
1199 * successive blocks are mapped, up to the requested size.
 *
1201 * Sets buffer_boundary() if a read of metadata will be required
1202 * before the next block can be mapped. Sets buffer_new() if new
1203 * blocks were allocated.
 *
 * Returns: errno
 */
1208 int gfs2_block_map(struct inode *inode, sector_t lblock,
1209 struct buffer_head *bh_map, int create)
1211 struct gfs2_inode *ip = GFS2_I(inode);
1212 loff_t pos = (loff_t)lblock << inode->i_blkbits;
1213 loff_t length = bh_map->b_size;
1214 struct iomap iomap = { };
1217 clear_buffer_mapped(bh_map);
1218 clear_buffer_new(bh_map);
1219 clear_buffer_boundary(bh_map);
1220 trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
 /* !create -> lookup only; create -> allocate on hole. */
1223 ret = gfs2_iomap_get(inode, pos, length, &iomap);
1225 ret = gfs2_iomap_alloc(inode, pos, length, &iomap);
 /* Trim the iomap to the caller's bh size; a clipped extent no
  * longer ends at a metadata boundary. */
1229 if (iomap.length > bh_map->b_size) {
1230 iomap.length = bh_map->b_size;
1231 iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
1233 if (iomap.addr != IOMAP_NULL_ADDR)
1234 map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
1235 bh_map->b_size = iomap.length;
1236 if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
1237 set_buffer_boundary(bh_map);
1238 if (iomap.flags & IOMAP_F_NEW)
1239 set_buffer_new(bh_map);
1242 trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
/*
 * gfs2_get_extent - look up the disk extent at a logical block (no alloc)
 * @inode: the inode
 * @lblock: logical block to look up
 * @dblock: receives the disk block number (semantics for unmapped blocks
 *          are on missing lines -- confirm against full source)
 * @extlen: in: max extent length to consider; out: extent length found
 */
1246 int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
1247 unsigned int *extlen)
1249 unsigned int blkbits = inode->i_blkbits;
1250 struct iomap iomap = { };
1254 ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits,
1258 if (iomap.type != IOMAP_MAPPED)
1260 *dblock = iomap.addr >> blkbits;
1261 len = iomap.length >> blkbits;
/*
 * gfs2_alloc_extent - like gfs2_get_extent, but allocates missing blocks
 * @inode: the inode
 * @lblock: logical block to map
 * @dblock: receives the disk block number
 * @extlen: in: max extent length; out: extent length mapped
 * @new: set to true when the extent was freshly allocated
 */
1267 int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
1268 unsigned int *extlen, bool *new)
1270 unsigned int blkbits = inode->i_blkbits;
1271 struct iomap iomap = { };
1275 ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits,
1279 if (iomap.type != IOMAP_MAPPED)
1281 *dblock = iomap.addr >> blkbits;
1282 len = iomap.length >> blkbits;
1285 *new = iomap.flags & IOMAP_F_NEW;
1290 * NOTE: Never call gfs2_block_zero_range with an open transaction because it
1291 * uses iomap write to perform its actions, which begin their own transactions
1292 * (iomap_begin, get_folio, etc.)
 */
1294 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
1295 unsigned int length)
 /* Enforce the rule above: no transaction may be open on entry. */
1297 BUG_ON(current->journal_info);
1298 return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
/* Max revokes per truncate transaction; bounds each pagecache chunk. */
1301 #define GFS2_JTRUNC_REVOKES 8192
 
1304 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1305 * @inode: The inode being truncated
1306 * @oldsize: The original (larger) size
1307 * @newsize: The new smaller size
 *
1309 * With jdata files, we have to journal a revoke for each block which is
1310 * truncated. As a result, we need to split this into separate transactions
1311 * if the number of pages being truncated gets too large.
 */
1314 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1316 struct gfs2_sbd *sdp = GFS2_SB(inode);
1317 u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1321 while (oldsize != newsize) {
1322 struct gfs2_trans *tr;
1325 chunk = oldsize - newsize;
1326 if (chunk > max_chunk)
 /* Align each chunk so truncation lands on a page boundary. */
1329 offs = oldsize & ~PAGE_MASK;
1330 if (offs && chunk > PAGE_SIZE)
1331 chunk = offs + ((chunk - offs) & PAGE_MASK);
1333 truncate_pagecache(inode, oldsize - chunk);
 /* Cycle the transaction between chunks to bound journal usage. */
1336 tr = current->journal_info;
1337 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1340 gfs2_trans_end(sdp);
1341 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
/*
 * trunc_start - first phase of a shrinking truncate
 * @inode: the inode
 * @newsize: the new (smaller) size
 *
 * Zeroes the partial tail block, updates i_size/timestamps in the
 * dinode under a transaction, flags TRUNC_IN_PROG, and truncates the
 * page cache (chunked via gfs2_journaled_truncate for jdata).  Error
 * paths and some conditionals sit on missing lines in this fragment.
 */
1349 static int trunc_start(struct inode *inode, u64 newsize)
1351 struct gfs2_inode *ip = GFS2_I(inode);
1352 struct gfs2_sbd *sdp = GFS2_SB(inode);
1353 struct buffer_head *dibh = NULL;
1354 int journaled = gfs2_is_jdata(ip);
1355 u64 oldsize = inode->i_size;
 /* Zero from newsize to the end of its block (done before the
  * transaction: gfs2_block_zero_range starts its own). */
1358 if (!gfs2_is_stuffed(ip)) {
1359 unsigned int blocksize = i_blocksize(inode);
1360 unsigned int offs = newsize & (blocksize - 1);
1362 error = gfs2_block_zero_range(inode, newsize,
 /* jdata needs revoke headroom; ordinary inodes just the dinode. */
1369 error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1371 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1375 error = gfs2_meta_inode_buffer(ip, &dibh);
1379 gfs2_trans_add_meta(ip->i_gl, dibh);
1381 if (gfs2_is_stuffed(ip))
1382 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1384 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; /* resumable after crash */
1386 i_size_write(inode, newsize);
1387 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1388 gfs2_dinode_out(ip, dibh->b_data);
1391 error = gfs2_journaled_truncate(inode, oldsize, newsize);
1393 truncate_pagecache(inode, newsize);
1397 if (current->journal_info)
1398 gfs2_trans_end(sdp);
/*
 * gfs2_iomap_get - public lookup-only wrapper around __gfs2_iomap_get
 * Maps [pos, pos + length) without allocating; releases the metapath.
 */
1402 int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
1403 struct iomap *iomap)
1405 struct metapath mp = { .mp_aheight = 1, };
1408 ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp);
1409 release_metapath(&mp);
/*
 * gfs2_iomap_alloc - public wrapper that maps and allocates on a hole
 * Looks up [pos, pos + length) with IOMAP_WRITE semantics and, when the
 * result is a hole, allocates blocks for it; releases the metapath.
 */
1413 int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
1414 struct iomap *iomap)
1416 struct metapath mp = { .mp_aheight = 1, };
1419 ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1420 if (!ret && iomap->type == IOMAP_HOLE)
1421 ret = __gfs2_iomap_alloc(inode, iomap, &mp);
1422 release_metapath(&mp);
/*
 * NOTE(review): this extract is lossy — numerous lines (braces, error
 * checks like "if (ret) goto ...", labels, loop setup for bstart/blen)
 * are missing.  The comments below describe only what the visible lines
 * establish; confirm against the full file before relying on them.
 */
1427 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1429 * @rd_gh: holder of resource group glock
1430 * @bh: buffer head to sweep
1431 * @start: starting point in bh
1432 * @end: end point in bh
1433 * @meta: true if bh points to metadata (rather than data)
1434 * @btotal: place to keep count of total blocks freed
1436 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1437 * free, and free them all. However, we do it one rgrp at a time. If this
1438 * block has references to multiple rgrps, we break it into individual
1439 * transactions. This allows other processes to use the rgrps while we're
1440 * focused on a single one, for better concurrency / performance.
1441 * At every transaction boundary, we rewrite the inode into the journal.
1442 * That way the bitmaps are kept consistent with the inode and we can recover
1443 * if we're interrupted by power-outages.
1445 * Returns: 0, or return code if an error occurred.
1446 * *btotal has the total number of blocks freed
1448 static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1449 struct buffer_head *bh, __be64 *start, __be64 *end,
1450 bool meta, u32 *btotal)
1452 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1453 struct gfs2_rgrpd *rgd;
1454 struct gfs2_trans *tr;
1456 int blks_outside_rgrp;
1457 u64 bn, bstart, isize_blks;
1458 s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1460 bool buf_in_tr = false; /* buffer was added to transaction */
/* If the caller already holds an rgrp glock, reuse that rgrp. */
1464 if (gfs2_holder_initialized(rd_gh)) {
1465 rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1466 gfs2_assert_withdraw(sdp,
1467 gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1469 blks_outside_rgrp = 0;
/* Walk every big-endian block pointer in [start, end). */
1473 for (p = start; p < end; p++) {
1476 bn = be64_to_cpu(*p);
/* Pointers outside the current rgrp are counted and retried later. */
1479 if (!rgrp_contains_block(rgd, bn)) {
1480 blks_outside_rgrp++;
1484 rgd = gfs2_blk2rgrpd(sdp, bn, true);
1485 if (unlikely(!rgd)) {
1489 ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1490 LM_FLAG_NODE_SCOPE, rd_gh);
1494 /* Must be done with the rgrp glock held: */
1495 if (gfs2_rs_active(&ip->i_res) &&
1496 rgd == ip->i_res.rs_rgd)
1497 gfs2_rs_deltree(&ip->i_res);
1500 /* The size of our transactions will be unknown until we
1501 actually process all the metadata blocks that relate to
1502 the rgrp. So we estimate. We know it can't be more than
1503 the dinode's i_blocks and we don't want to exceed the
1504 journal flush threshold, sd_log_thresh2. */
1505 if (current->journal_info == NULL) {
1506 unsigned int jblocks_rqsted, revokes;
1508 jblocks_rqsted = rgd->rd_length + RES_DINODE +
1510 isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1511 if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1513 atomic_read(&sdp->sd_log_thresh2);
1515 jblocks_rqsted += isize_blks;
1516 revokes = jblocks_rqsted;
1518 revokes += end - start;
1519 else if (ip->i_depth)
1520 revokes += sdp->sd_inptrs;
1521 ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1524 down_write(&ip->i_rw_mutex);
1526 /* check if we will exceed the transaction blocks requested */
1527 tr = current->journal_info;
1528 if (tr->tr_num_buf_new + RES_STATFS +
1529 RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1530 /* We set blks_outside_rgrp to ensure the loop will
1531 be repeated for the same rgrp, but with a new
1533 blks_outside_rgrp++;
1534 /* This next part is tricky. If the buffer was added
1535 to the transaction, we've already set some block
1536 pointers to 0, so we better follow through and free
1537 them, or we will introduce corruption (so break).
1538 This may be impossible, or at least rare, but I
1539 decided to cover the case regardless.
1541 If the buffer was not added to the transaction
1542 (this call), doing so would exceed our transaction
1543 size, so we need to end the transaction and start a
1544 new one (so goto). */
1551 gfs2_trans_add_meta(ip->i_gl, bh);
/* Extent accumulation: contiguous runs are freed in one call. */
1554 if (bstart + blen == bn) {
1559 __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1561 gfs2_add_inode_blocks(&ip->i_inode, -blen);
1567 __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1569 gfs2_add_inode_blocks(&ip->i_inode, -blen);
1572 if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1573 outside the rgrp we just processed,
1574 do it all over again. */
1575 if (current->journal_info) {
1576 struct buffer_head *dibh;
1578 ret = gfs2_meta_inode_buffer(ip, &dibh);
1582 /* Every transaction boundary, we rewrite the dinode
1583 to keep its di_blocks current in case of failure. */
1584 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1585 gfs2_trans_add_meta(ip->i_gl, dibh);
1586 gfs2_dinode_out(ip, dibh->b_data);
1588 up_write(&ip->i_rw_mutex);
1589 gfs2_trans_end(sdp);
1592 gfs2_glock_dq_uninit(rd_gh);
/*
 * mp_eq_to_hgt - compare a metapath's index list against @list up to
 * (but excluding) height @h.  Presumably returns false on mismatch and
 * true otherwise — the return statements are missing from this extract.
 */
1600 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1602 if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
/* NOTE(review): lossy extract — the scan loop body, ptr advance and the
 * return statements are missing from the visible lines. */
1608 * find_nonnull_ptr - find a non-null pointer given a metapath and height
1609 * @sdp: The superblock
1610 * @mp: starting metapath
1611 * @h: desired height to search
1612 * @end_list: See punch_hole().
1613 * @end_aligned: See punch_hole().
1615 * Assumes the metapath is valid (with buffers) out to height h.
1616 * Returns: true if a non-null pointer was found in the metapath buffer
1617 * false if all remaining pointers are NULL in the buffer
1619 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1621 __u16 *end_list, unsigned int end_aligned)
1623 struct buffer_head *bh = mp->mp_bh[h];
1624 __be64 *first, *ptr, *end;
/* Scan from the current index at height h to the end of the buffer
 * (clipped at end_list[h] when this path coincides with the hole end). */
1626 first = metaptr1(h, mp);
1627 ptr = first + mp->mp_list[h];
1628 end = (__be64 *)(bh->b_data + bh->b_size);
1629 if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1630 bool keep_end = h < end_aligned;
1631 end = first + end_list[h] + keep_end;
1635 if (*ptr) { /* if we have a non-null pointer */
/* Record where the non-null pointer was found. */
1636 mp->mp_list[h] = ptr - first;
1638 if (h < GFS2_MAX_META_HEIGHT)
/* States of the punch_hole() deallocation state machine. */
1647 enum dealloc_states {
1648 DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */
1649 DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */
1650 DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. */
1651 DEALLOC_DONE = 3, /* process complete */
/*
 * metapointer_range - compute the [*start, *end) pointer range to sweep in
 * the buffer at @height, clipped to the hole's start/end index lists when
 * the current path coincides with them.  keep_start/keep_end skip one extra
 * pointer at heights that are not aligned (those pointers must be kept for
 * remaining data).  NOTE(review): lossy extract — the default assignment of
 * *start and a start_list NULL check may be among the missing lines.
 */
1655 metapointer_range(struct metapath *mp, int height,
1656 __u16 *start_list, unsigned int start_aligned,
1657 __u16 *end_list, unsigned int end_aligned,
1658 __be64 **start, __be64 **end)
1660 struct buffer_head *bh = mp->mp_bh[height];
1663 first = metaptr1(height, mp);
1665 if (mp_eq_to_hgt(mp, start_list, height)) {
1666 bool keep_start = height < start_aligned;
1667 *start = first + start_list[height] + keep_start;
1669 *end = (__be64 *)(bh->b_data + bh->b_size);
1670 if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1671 bool keep_end = height < end_aligned;
1672 *end = first + end_list[height] + keep_end;
/*
 * walk_done - has the metapath index at @height walked past the last
 * pointer to process?  The limit is end_list[height] (+1 if this height is
 * below end_aligned) when the path matches the hole end, otherwise the
 * pointer count for this height (sd_inptrs for indirect blocks, sd_diptrs
 * for the dinode at height 0).  NOTE(review): lossy extract — the end_list
 * NULL check and the declaration of "end" appear to be missing.
 */
1676 static inline bool walk_done(struct gfs2_sbd *sdp,
1677 struct metapath *mp, int height,
1678 __u16 *end_list, unsigned int end_aligned)
1683 bool keep_end = height < end_aligned;
1684 if (!mp_eq_to_hgt(mp, end_list, height))
1686 end = end_list[height] + keep_end;
1688 end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1689 return mp->mp_list[height] >= end;
/*
 * NOTE(review): lossy extract — many lines of this function (braces,
 * "if (ret) goto out;" checks, switch/case scaffolding, label lines,
 * variable declarations such as btotal/prev_bnr/state) are missing.
 * Comments below only annotate what the visible lines establish.
 */
1693 * punch_hole - deallocate blocks in a file
1694 * @ip: inode to truncate
1695 * @offset: the start of the hole
1696 * @length: the size of the hole (or 0 for truncate)
1698 * Punch a hole into a file or truncate a file at a given position. This
1699 * function operates in whole blocks (@offset and @length are rounded
1700 * accordingly); partially filled blocks must be cleared otherwise.
1702 * This function works from the bottom up, and from the right to the left. In
1703 * other words, it strips off the highest layer (data) before stripping any of
1704 * the metadata. Doing it this way is best in case the operation is interrupted
1705 * by power failure, etc. The dinode is rewritten in every transaction to
1706 * guarantee integrity.
1708 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1710 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1711 u64 maxsize = sdp->sd_heightsize[ip->i_height];
1712 struct metapath mp = {};
1713 struct buffer_head *dibh, *bh;
1714 struct gfs2_holder rd_gh;
1715 unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
/* First block fully inside the hole (round offset up to a block). */
1716 u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1717 __u16 start_list[GFS2_MAX_META_HEIGHT];
1718 __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1719 unsigned int start_aligned, end_aligned;
1720 unsigned int strip_h = ip->i_height - 1;
1723 int mp_h; /* metapath buffers are read in to this height */
1725 __be64 *start, *end;
1727 if (offset >= maxsize) {
1729 * The starting point lies beyond the allocated metadata;
1730 * there are no blocks to deallocate.
1736 * The start position of the hole is defined by lblock, start_list, and
1737 * start_aligned. The end position of the hole is defined by lend,
1738 * end_list, and end_aligned.
1740 * start_aligned and end_aligned define down to which height the start
1741 * and end positions are aligned to the metadata tree (i.e., the
1742 * position is a multiple of the metadata granularity at the height
1743 * above). This determines at which heights additional meta pointers
1744 * needs to be preserved for the remaining data.
/* length != 0: compute the end-of-hole metapath (truncate has no end). */
1748 u64 end_offset = offset + length;
1752 * Clip the end at the maximum file size for the given height:
1753 * that's how far the metadata goes; files bigger than that
1754 * will have additional layers of indirection.
1756 if (end_offset > maxsize)
1757 end_offset = maxsize;
1758 lend = end_offset >> bsize_shift;
1763 find_metapath(sdp, lend, &mp, ip->i_height);
1764 end_list = __end_list;
1765 memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
/* Determine end_aligned: deepest height with trailing zero indices. */
1767 for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
/* Build the start-of-hole metapath and start_aligned likewise. */
1774 find_metapath(sdp, lblock, &mp, ip->i_height);
1775 memcpy(start_list, mp.mp_list, sizeof(start_list));
1777 for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1778 if (start_list[mp_h])
1781 start_aligned = mp_h;
1783 ret = gfs2_meta_inode_buffer(ip, &dibh);
1788 ret = lookup_metapath(ip, &mp);
1792 /* issue read-ahead on metadata */
1793 for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1794 metapointer_range(&mp, mp_h, start_list, start_aligned,
1795 end_list, end_aligned, &start, &end);
1796 gfs2_metapath_ra(ip->i_gl, start, end);
1799 if (mp.mp_aheight == ip->i_height)
1800 state = DEALLOC_MP_FULL; /* We have a complete metapath */
1802 state = DEALLOC_FILL_MP; /* deal with partial metapath */
1804 ret = gfs2_rindex_update(sdp);
1808 ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1811 gfs2_holder_mark_uninitialized(&rd_gh);
/* Main state machine: alternate between stripping full metapaths,
 * lowering the strip height, and filling in partial metapaths. */
1815 while (state != DEALLOC_DONE) {
1817 /* Truncate a full metapath at the given strip height.
1818 * Note that strip_h == mp_h in order to be in this state. */
1819 case DEALLOC_MP_FULL:
1820 bh = mp.mp_bh[mp_h];
1821 gfs2_assert_withdraw(sdp, bh);
/* Guard against processing the same buffer twice in a row. */
1822 if (gfs2_assert_withdraw(sdp,
1823 prev_bnr != bh->b_blocknr)) {
1824 fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u,"
1825 "s_h:%u, mp_h:%u\n",
1826 (unsigned long long)ip->i_no_addr,
1827 prev_bnr, ip->i_height, strip_h, mp_h);
1829 prev_bnr = bh->b_blocknr;
1831 if (gfs2_metatype_check(sdp, bh,
1832 (mp_h ? GFS2_METATYPE_IN :
1833 GFS2_METATYPE_DI))) {
1839 * Below, passing end_aligned as 0 gives us the
1840 * metapointer range excluding the end point: the end
1841 * point is the first metapath we must not deallocate!
1844 metapointer_range(&mp, mp_h, start_list, start_aligned,
1845 end_list, 0 /* end_aligned */,
1847 ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1849 mp_h != ip->i_height - 1,
1852 /* If we hit an error or just swept dinode buffer,
1855 state = DEALLOC_DONE;
1858 state = DEALLOC_MP_LOWER;
1861 /* lower the metapath strip height */
1862 case DEALLOC_MP_LOWER:
1863 /* We're done with the current buffer, so release it,
1864 unless it's the dinode buffer. Then back up to the
1865 previous pointer. */
1867 brelse(mp.mp_bh[mp_h]);
1868 mp.mp_bh[mp_h] = NULL;
1870 /* If we can't get any lower in height, we've stripped
1871 off all we can. Next step is to back up and start
1872 stripping the previous level of metadata. */
1875 memcpy(mp.mp_list, start_list, sizeof(start_list));
1877 state = DEALLOC_FILL_MP;
1880 mp.mp_list[mp_h] = 0;
1881 mp_h--; /* search one metadata height down */
1883 if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1885 /* Here we've found a part of the metapath that is not
1886 * allocated. We need to search at that height for the
1887 * next non-null pointer. */
1888 if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1889 state = DEALLOC_FILL_MP;
1892 /* No more non-null pointers at this height. Back up
1893 to the previous height and try again. */
1894 break; /* loop around in the same state */
1896 /* Fill the metapath with buffers to the given height. */
1897 case DEALLOC_FILL_MP:
1898 /* Fill the buffers out to the current height. */
1899 ret = fillup_metapath(ip, &mp, mp_h);
1903 /* On the first pass, issue read-ahead on metadata. */
1904 if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1905 unsigned int height = mp.mp_aheight - 1;
1907 /* No read-ahead for data blocks. */
1908 if (mp.mp_aheight - 1 == strip_h)
1911 for (; height >= mp.mp_aheight - ret; height--) {
1912 metapointer_range(&mp, height,
1913 start_list, start_aligned,
1914 end_list, end_aligned,
1916 gfs2_metapath_ra(ip->i_gl, start, end);
1920 /* If buffers found for the entire strip height */
1921 if (mp.mp_aheight - 1 == strip_h) {
1922 state = DEALLOC_MP_FULL;
1925 if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1926 mp_h = mp.mp_aheight - 1;
1928 /* If we find a non-null block pointer, crawl a bit
1929 higher up in the metapath and try again, otherwise
1930 we need to look lower for a new starting point. */
1931 if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1934 state = DEALLOC_MP_LOWER;
/* Final accounting: update statfs/quota and rewrite the dinode once
 * more inside a closing transaction. */
1940 if (current->journal_info == NULL) {
1941 ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1945 down_write(&ip->i_rw_mutex);
1947 gfs2_statfs_change(sdp, 0, +btotal, 0);
1948 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1950 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1951 gfs2_trans_add_meta(ip->i_gl, dibh);
1952 gfs2_dinode_out(ip, dibh->b_data);
1953 up_write(&ip->i_rw_mutex);
1954 gfs2_trans_end(sdp);
/* Cleanup: drop rgrp glock, close any open transaction, release
 * quota hold and the metapath buffers. */
1958 if (gfs2_holder_initialized(&rd_gh))
1959 gfs2_glock_dq_uninit(&rd_gh);
1960 if (current->journal_info) {
1961 up_write(&ip->i_rw_mutex);
1962 gfs2_trans_end(sdp);
1965 gfs2_quota_unhold(ip);
1967 release_metapath(&mp);
/*
 * trunc_end - finish a truncate: clear TRUNC_IN_PROG, update timestamps,
 * and (for a now-empty file) reset height/goal and clear the dinode tail.
 * Runs under i_rw_mutex inside a RES_DINODE transaction.
 * NOTE(review): lossy extract — error checks, braces and the return are
 * missing from the visible lines.
 */
1971 static int trunc_end(struct gfs2_inode *ip)
1973 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1974 struct buffer_head *dibh;
1977 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1981 down_write(&ip->i_rw_mutex);
1983 error = gfs2_meta_inode_buffer(ip, &dibh);
1987 if (!i_size_read(&ip->i_inode)) {
/* Empty file: point the allocation goal back at the dinode itself. */
1989 ip->i_goal = ip->i_no_addr;
1990 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1991 gfs2_ordered_del_inode(ip);
1993 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1994 ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1996 gfs2_trans_add_meta(ip->i_gl, dibh);
1997 gfs2_dinode_out(ip, dibh->b_data);
2001 up_write(&ip->i_rw_mutex);
2002 gfs2_trans_end(sdp);
/* NOTE(review): lossy extract — @inode kernel-doc line, error checks and
 * return are missing from the visible lines. */
2007 * do_shrink - make a file smaller
2009 * @newsize: the size to make the file
2011 * Called with an exclusive lock on @inode. The @size must
2012 * be equal to or smaller than the current inode size.
2017 static int do_shrink(struct inode *inode, u64 newsize)
2019 struct gfs2_inode *ip = GFS2_I(inode);
/* Three phases: trim the tail block (trunc_start), deallocate whole
 * blocks past newsize (punch_hole with length 0 == truncate), finalize
 * (trunc_end).  Stuffed inodes skip the deallocation phase. */
2022 error = trunc_start(inode, newsize);
2025 if (gfs2_is_stuffed(ip))
2028 error = punch_hole(ip, newsize, 0);
2030 error = trunc_end(ip);
/* NOTE(review): lossy extract — the "unstuff" flag variable, error checks,
 * braces and labels between the visible goto targets are missing. */
2036 * do_grow - Touch and update inode size
2038 * @size: The new size
2040 * This function updates the timestamps on the inode and
2041 * may also increase the size of the inode. This function
2042 * must not be called with @size any smaller than the current
2045 * Although it is not strictly required to unstuff files here,
2046 * earlier versions of GFS2 have a bug in the stuffed file reading
2047 * code which will result in a buffer overrun if the size is larger
2048 * than the max stuffed file size. In order to prevent this from
2049 * occurring, such files are unstuffed, but in other cases we can
2050 * just update the inode size directly.
2052 * Returns: 0 on success, or -ve on error
2055 static int do_grow(struct inode *inode, u64 size)
2057 struct gfs2_inode *ip = GFS2_I(inode);
2058 struct gfs2_sbd *sdp = GFS2_SB(inode);
2059 struct gfs2_alloc_parms ap = { .target = 1, };
2060 struct buffer_head *dibh;
/* Growing past the stuffed limit requires a quota check and a one-block
 * reservation before the inode can be unstuffed. */
2064 if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2065 error = gfs2_quota_lock_check(ip, &ap);
2069 error = gfs2_inplace_reserve(ip, &ap);
2071 goto do_grow_qunlock;
2075 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2077 gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2078 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2081 goto do_grow_release;
2084 error = gfs2_unstuff_dinode(ip);
2089 error = gfs2_meta_inode_buffer(ip, &dibh);
2093 truncate_setsize(inode, size);
2094 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
2095 gfs2_trans_add_meta(ip->i_gl, dibh);
2096 gfs2_dinode_out(ip, dibh->b_data);
2100 gfs2_trans_end(sdp);
/* Unwind targets for the reservation/quota taken above. */
2103 gfs2_inplace_release(ip);
2105 gfs2_quota_unlock(ip);
/* NOTE(review): lossy extract — error checks, qa_put cleanup and return
 * are missing from the visible lines. */
2111 * gfs2_setattr_size - make a file a given size
2113 * @newsize: the size to make the file
2115 * The file size can grow, shrink, or stay the same size. This
2116 * is called holding i_rwsem and an exclusive glock on the inode
2122 int gfs2_setattr_size(struct inode *inode, u64 newsize)
2124 struct gfs2_inode *ip = GFS2_I(inode);
2127 BUG_ON(!S_ISREG(inode->i_mode));
2129 ret = inode_newsize_ok(inode, newsize);
/* Wait for in-flight direct I/O before changing the size. */
2133 inode_dio_wait(inode);
2135 ret = gfs2_qa_get(ip);
2139 if (newsize >= inode->i_size) {
2140 ret = do_grow(inode, newsize);
2144 ret = do_shrink(inode, newsize);
/*
 * gfs2_truncatei_resume - resume an interrupted truncate (TRUNC_IN_PROG):
 * deallocate everything past the current i_size, then run trunc_end.
 * NOTE(review): lossy extract — braces/error check/return are missing.
 */
2151 int gfs2_truncatei_resume(struct gfs2_inode *ip)
2154 error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2156 error = trunc_end(ip);
/*
 * gfs2_file_dealloc - free all of an inode's data/metadata blocks by
 * punching a hole covering the whole file (offset 0, length 0 == truncate).
 */
2160 int gfs2_file_dealloc(struct gfs2_inode *ip)
2162 return punch_hole(ip, 0, 0);
/* NOTE(review): lossy extract — @jd kernel-doc line, kfree(jext) and
 * braces are missing from the visible lines. */
2166 * gfs2_free_journal_extents - Free cached journal bmap info
2171 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2173 struct gfs2_journal_extent *jext;
/* Pop and free every cached extent on the journal's extent list. */
2175 while(!list_empty(&jd->extent_list)) {
2176 jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2177 list_del(&jext->list);
/* NOTE(review): lossy extract — the early "return 0" after the merge, the
 * NULL check after kzalloc and final return are missing. */
2183 * gfs2_add_jextent - Add or merge a new extent to extent cache
2184 * @jd: The journal descriptor
2185 * @lblock: The logical block at start of new extent
2186 * @dblock: The physical block at start of new extent
2187 * @blocks: Size of extent in fs blocks
2189 * Returns: 0 on success or -ENOMEM
2192 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2194 struct gfs2_journal_extent *jext;
/* If the new extent is physically contiguous with the last cached one,
 * just extend it instead of allocating a new entry. */
2196 if (!list_empty(&jd->extent_list)) {
2197 jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2198 if ((jext->dblock + jext->blocks) == dblock) {
2199 jext->blocks += blocks;
2204 jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2207 jext->dblock = dblock;
2208 jext->lblock = lblock;
2209 jext->blocks = blocks;
2210 list_add_tail(&jext->list, &jd->extent_list);
/* NOTE(review): lossy extract — loop scaffolding (do/while over size),
 * declarations of lblock/size/rc/start/end, the fail label and return are
 * missing from the visible lines. */
2216 * gfs2_map_journal_extents - Cache journal bmap info
2217 * @sdp: The super block
2218 * @jd: The journal to map
2220 * Create a reusable "extent" mapping from all logical
2221 * blocks to all physical blocks for the given journal. This will save
2222 * us time when writing journal blocks. Most journals will have only one
2223 * extent that maps all their logical blocks. That's because gfs2.mkfs
2224 * arranges the journal blocks sequentially to maximize performance.
2225 * So the extent would map the first block for the entire file length.
2226 * However, gfs2_jadd can happen while file activity is happening, so
2227 * those journals may not be sequential. Less likely is the case where
2228 * the users created their own journals by mounting the metafs and
2229 * laying it out. But it's still possible. These journals might have
2232 * Returns: 0 on success, or error on failure
2235 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2239 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2240 struct buffer_head bh;
2241 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
/* Time the mapping pass so it can be reported below. */
2246 start = ktime_get();
2247 lblock_stop = i_size_read(jd->jd_inode) >> shift;
2248 size = (lblock_stop - lblock) << shift;
2250 WARN_ON(!list_empty(&jd->extent_list));
/* Map each logical range via gfs2_block_map and cache it as an extent;
 * bh.b_size on return covers the whole contiguous mapping. */
2256 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2257 if (rc || !buffer_mapped(&bh))
2259 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2263 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2267 fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2268 jd->nr_extents, ktime_ms_delta(end, start));
/* Failure path: report where mapping stopped and drop partial cache. */
2272 fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2274 (unsigned long long)(i_size_read(jd->jd_inode) - size),
2276 fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2277 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2278 bh.b_state, (unsigned long long)bh.b_size);
2279 gfs2_free_journal_extents(jd);
/* NOTE(review): lossy extract — the len==0 early return, the do/while loop
 * around the gfs2_block_map calls, and the final return are missing. */
2284 * gfs2_write_alloc_required - figure out if a write will require an allocation
2285 * @ip: the file being written to
2286 * @offset: the offset to write to
2287 * @len: the number of bytes being written
2289 * Returns: 1 if an alloc is required, 0 otherwise
2292 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2295 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2296 struct buffer_head bh;
2298 u64 lblock, lblock_stop, size;
/* Stuffed inode: allocation is needed only if the write would exceed
 * the maximum stuffed size. */
2304 if (gfs2_is_stuffed(ip)) {
2305 if (offset + len > gfs2_max_stuffed_size(ip))
2310 shift = sdp->sd_sb.sb_bsize_shift;
2311 BUG_ON(gfs2_is_dir(ip));
2312 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2313 lblock = offset >> shift;
2314 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
/* Writes past EOF always allocate, except for the rindex inode. */
2315 if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2318 size = (lblock_stop - lblock) << shift;
/* Probe each range with a no-create block map; an unmapped buffer
 * means allocation is required. */
2322 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2323 if (!buffer_mapped(&bh))
2326 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
/*
 * stuffed_zero_range - zero a byte range of a stuffed (inline) inode by
 * clearing it directly in the dinode buffer, within the current transaction.
 * The range is clipped to i_size; offsets at/after i_size are a no-op.
 * NOTE(review): lossy extract — error checks, brelse and return missing.
 */
2332 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2334 struct gfs2_inode *ip = GFS2_I(inode);
2335 struct buffer_head *dibh;
2338 if (offset >= inode->i_size)
2340 if (offset + length > inode->i_size)
2341 length = inode->i_size - offset;
2343 error = gfs2_meta_inode_buffer(ip, &dibh);
2346 gfs2_trans_add_meta(ip->i_gl, dibh);
/* Inline data lives right after the on-disk dinode header. */
2347 memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
/*
 * gfs2_journaled_truncate_range - truncate the page cache over a range of a
 * jdata file in chunks bounded by GFS2_JTRUNC_REVOKES, restarting the
 * transaction between chunks so revoke space is never exhausted.
 * NOTE(review): lossy extract — the outer while loop, chunk computation,
 * offset/length advance and return are missing from the visible lines.
 */
2353 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2356 struct gfs2_sbd *sdp = GFS2_SB(inode);
2357 loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2361 struct gfs2_trans *tr;
2366 if (chunk > max_chunk)
/* Align each chunk to page boundaries after the first partial page. */
2369 offs = offset & ~PAGE_MASK;
2370 if (offs && chunk > PAGE_SIZE)
2371 chunk = offs + ((chunk - offs) & PAGE_MASK);
2373 truncate_pagecache_range(inode, offset, chunk);
/* Only restart the transaction when it actually touched something. */
2377 tr = current->journal_info;
2378 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2381 gfs2_trans_end(sdp);
2382 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
/*
 * __gfs2_punch_hole - fallocate(PUNCH_HOLE) implementation: zero the
 * partial blocks at each end, drop the page cache over the range, then
 * deallocate the whole blocks in between via punch_hole().
 * NOTE(review): lossy extract — error checks, "out" label, the start/end
 * declarations and the final return are missing from the visible lines.
 */
2389 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2391 struct inode *inode = file_inode(file);
2392 struct gfs2_inode *ip = GFS2_I(inode);
2393 struct gfs2_sbd *sdp = GFS2_SB(inode);
2394 unsigned int blocksize = i_blocksize(inode);
/* Unstuffed files: zero the sub-block head and tail of the hole. */
2398 if (!gfs2_is_stuffed(ip)) {
2399 unsigned int start_off, end_len;
2401 start_off = offset & (blocksize - 1);
2402 end_len = (offset + length) & (blocksize - 1);
2404 unsigned int len = length;
2405 if (length > blocksize - start_off)
2406 len = blocksize - start_off;
2407 error = gfs2_block_zero_range(inode, offset, len);
2410 if (start_off + length < blocksize)
2414 error = gfs2_block_zero_range(inode,
2415 offset + length - end_len, end_len);
/* Flush dirty pages over the (block-rounded) range before truncating. */
2421 start = round_down(offset, blocksize);
2422 end = round_up(offset + length, blocksize) - 1;
2423 error = filemap_write_and_wait_range(inode->i_mapping, start, end);
2427 if (gfs2_is_jdata(ip))
2428 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2429 GFS2_JTRUNC_REVOKES);
2431 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2435 if (gfs2_is_stuffed(ip)) {
2436 error = stuffed_zero_range(inode, offset, length);
2441 if (gfs2_is_jdata(ip)) {
2442 BUG_ON(!current->journal_info);
2443 gfs2_journaled_truncate_range(inode, offset, length);
2445 truncate_pagecache_range(inode, offset, offset + length - 1);
2447 file_update_time(file);
2448 mark_inode_dirty(inode);
2450 if (current->journal_info)
2451 gfs2_trans_end(sdp);
2453 if (!gfs2_is_stuffed(ip))
2454 error = punch_hole(ip, offset, length);
2457 if (current->journal_info)
2458 gfs2_trans_end(sdp);
/*
 * gfs2_map_blocks - iomap writeback ->map_blocks hook.  Reuses the cached
 * mapping in wpc->iomap when @offset still falls inside it; otherwise
 * resets the cache and looks up a new mapping via gfs2_iomap_get.
 * Stuffed inodes should never reach writeback (WARN + error).
 * NOTE(review): lossy extract — the ret declaration and return missing.
 */
2462 static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
2467 if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
2470 if (offset >= wpc->iomap.offset &&
2471 offset < wpc->iomap.offset + wpc->iomap.length)
2474 memset(&wpc->iomap, 0, sizeof(wpc->iomap));
2475 ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
2479 const struct iomap_writeback_ops gfs2_writeback_ops = {
2480 .map_blocks = gfs2_map_blocks,