fs/gfs2/bmap.c

   1 /*
   2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
   3  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
   4  *
   5  * This copyrighted material is made available to anyone wishing to use,
   6  * modify, copy, or redistribute it subject to the terms and conditions
   7  * of the GNU General Public License version 2.
   8  */
   9
  10 #include <linux/spinlock.h>
  11 #include <linux/completion.h>
  12 #include <linux/buffer_head.h>
  13 #include <linux/blkdev.h>
  14 #include <linux/gfs2_ondisk.h>
  15 #include <linux/crc32.h>
  16 #include <linux/iomap.h>
  17
  18 #include "gfs2.h"
  19 #include "incore.h"
  20 #include "bmap.h"
  21 #include "glock.h"
  22 #include "inode.h"
  23 #include "meta_io.h"
  24 #include "quota.h"
  25 #include "rgrp.h"
  26 #include "log.h"
  27 #include "super.h"
  28 #include "trans.h"
  29 #include "dir.h"
  30 #include "util.h"
  31 #include "trace_gfs2.h"
  32
/* This doesn't need to be that large as max 64 bit pointers in a 4k
 * block is 512, so __u16 is fine for that. It saves stack space to
 * keep it small.
 */
struct metapath {
	/* Buffers along the path, indexed by height; mp_bh[0] is the dinode */
	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
	/* Pointer index to follow at each height, set up by find_metapath() */
	__u16 mp_list[GFS2_MAX_META_HEIGHT];
	int mp_fheight; /* find_metapath height */
	int mp_aheight; /* actual height (lookup height) */
};
  43
  44 /**
  45  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
  46  * @ip: the inode
  47  * @dibh: the dinode buffer
  48  * @block: the block number that was allocated
  49  * @page: The (optional) page. This is looked up if @page is NULL
  50  *
  51  * Returns: errno
  52  */
  53
  54 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
  55                                u64 block, struct page *page)
  56 {
  57         struct inode *inode = &ip->i_inode;
  58         struct buffer_head *bh;
  59         int release = 0;
  60
  61         if (!page || page->index) {
  62                 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
  63                 if (!page)
  64                         return -ENOMEM;
  65                 release = 1;
  66         }
  67
  68         if (!PageUptodate(page)) {
  69                 void *kaddr = kmap(page);
  70                 u64 dsize = i_size_read(inode);
  71
  72                 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
  73                         dsize = dibh->b_size - sizeof(struct gfs2_dinode);
  74
  75                 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
  76                 memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
  77                 kunmap(page);
  78
  79                 SetPageUptodate(page);
  80         }
  81
  82         if (!page_has_buffers(page))
  83                 create_empty_buffers(page, BIT(inode->i_blkbits),
  84                                      BIT(BH_Uptodate));
  85
  86         bh = page_buffers(page);
  87
  88         if (!buffer_mapped(bh))
  89                 map_bh(bh, inode->i_sb, block);
  90
  91         set_buffer_uptodate(bh);
  92         if (!gfs2_is_jdata(ip))
  93                 mark_buffer_dirty(bh);
  94         if (!gfs2_is_writeback(ip))
  95                 gfs2_trans_add_data(ip->i_gl, bh);
  96
  97         if (release) {
  98                 unlock_page(page);
  99                 put_page(page);
 100         }
 101
 102         return 0;
 103 }
 104
 105 /**
 106  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 107  * @ip: The GFS2 inode to unstuff
 108  * @page: The (optional) page. This is looked up if the @page is NULL
 109  *
 110  * This routine unstuffs a dinode and returns it to a "normal" state such
 111  * that the height can be grown in the traditional way.
 112  *
 113  * Returns: errno
 114  */
 115
 116 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 117 {
 118         struct buffer_head *bh, *dibh;
 119         struct gfs2_dinode *di;
 120         u64 block = 0;
 121         int isdir = gfs2_is_dir(ip);
 122         int error;
 123
 124         down_write(&ip->i_rw_mutex);
 125
 126         error = gfs2_meta_inode_buffer(ip, &dibh);
 127         if (error)
 128                 goto out;
 129
 130         if (i_size_read(&ip->i_inode)) {
 131                 /* Get a free block, fill it with the stuffed data,
 132                    and write it out to disk */
 133
 134                 unsigned int n = 1;
 135                 error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
 136                 if (error)
 137                         goto out_brelse;
 138                 if (isdir) {
 139                         gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
 140                         error = gfs2_dir_get_new_buffer(ip, block, &bh);
 141                         if (error)
 142                                 goto out_brelse;
 143                         gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
 144                                               dibh, sizeof(struct gfs2_dinode));
 145                         brelse(bh);
 146                 } else {
 147                         error = gfs2_unstuffer_page(ip, dibh, block, page);
 148                         if (error)
 149                                 goto out_brelse;
 150                 }
 151         }
 152
 153         /*  Set up the pointer to the new block  */
 154
 155         gfs2_trans_add_meta(ip->i_gl, dibh);
 156         di = (struct gfs2_dinode *)dibh->b_data;
 157         gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 158
 159         if (i_size_read(&ip->i_inode)) {
 160                 *(__be64 *)(di + 1) = cpu_to_be64(block);
 161                 gfs2_add_inode_blocks(&ip->i_inode, 1);
 162                 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 163         }
 164
 165         ip->i_height = 1;
 166         di->di_height = cpu_to_be16(1);
 167
 168 out_brelse:
 169         brelse(dibh);
 170 out:
 171         up_write(&ip->i_rw_mutex);
 172         return error;
 173 }
 174
 175
 176 /**
 177  * find_metapath - Find path through the metadata tree
 * @sdp: The superblock
 * @block: The logical block to look up
 * @mp: The metapath to return the result in
 * @height: The pre-calculated height of the metadata tree
 182  *
 183  *   This routine returns a struct metapath structure that defines a path
 184  *   through the metadata of inode "ip" to get to block "block".
 185  *
 186  *   Example:
 187  *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
 188  *   filesystem with a blocksize of 4096.
 189  *
 190  *   find_metapath() would return a struct metapath structure set to:
 *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and
 *   mp_list[2] = 165.
 193  *
 194  *   That means that in order to get to the block containing the byte at
 195  *   offset 101342453, we would load the indirect block pointed to by pointer
 196  *   0 in the dinode.  We would then load the indirect block pointed to by
 197  *   pointer 48 in that indirect block.  We would then load the data block
 198  *   pointed to by pointer 165 in that indirect block.
 199  *
 200  *             ----------------------------------------
 201  *             | Dinode |                             |
 202  *             |        |                            4|
 203  *             |        |0 1 2 3 4 5                 9|
 204  *             |        |                            6|
 205  *             ----------------------------------------
 206  *                       |
 207  *                       |
 208  *                       V
 209  *             ----------------------------------------
 210  *             | Indirect Block                       |
 211  *             |                                     5|
 212  *             |            4 4 4 4 4 5 5            1|
 213  *             |0           5 6 7 8 9 0 1            2|
 214  *             ----------------------------------------
 215  *                                |
 216  *                                |
 217  *                                V
 218  *             ----------------------------------------
 219  *             | Indirect Block                       |
 220  *             |                         1 1 1 1 1   5|
 221  *             |                         6 6 6 6 6   1|
 222  *             |0                        3 4 5 6 7   2|
 223  *             ----------------------------------------
 224  *                                           |
 225  *                                           |
 226  *                                           V
 227  *             ----------------------------------------
 228  *             | Data block containing offset         |
 229  *             |            101342453                 |
 230  *             |                                      |
 231  *             |                                      |
 232  *             ----------------------------------------
 233  *
 234  */
 235
 236 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
 237                           struct metapath *mp, unsigned int height)
 238 {
 239         unsigned int i;
 240
 241         mp->mp_fheight = height;
 242         for (i = height; i--;)
 243                 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
 244 }
 245
 246 static inline unsigned int metapath_branch_start(const struct metapath *mp)
 247 {
 248         if (mp->mp_list[0] == 0)
 249                 return 2;
 250         return 1;
 251 }
 252
 253 /**
 254  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 255  * @height: The metadata height (0 = dinode)
 256  * @mp: The metapath
 257  */
 258 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
 259 {
 260         struct buffer_head *bh = mp->mp_bh[height];
 261         if (height == 0)
 262                 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
 263         return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
 264 }
 265
 266 /**
 267  * metapointer - Return pointer to start of metadata in a buffer
 268  * @height: The metadata height (0 = dinode)
 269  * @mp: The metapath
 270  *
 271  * Return a pointer to the block number of the next height of the metadata
 272  * tree given a buffer containing the pointer to the current height of the
 273  * metadata tree.
 274  */
 275
 276 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 277 {
 278         __be64 *p = metaptr1(height, mp);
 279         return p + mp->mp_list[height];
 280 }
 281
 282 static void gfs2_metapath_ra(struct gfs2_glock *gl, struct metapath *mp,
 283                              unsigned int height)
 284 {
 285         struct buffer_head *bh = mp->mp_bh[height];
 286         const __be64 *pos = metapointer(height, mp);
 287         const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
 288         const __be64 *t;
 289
 290         for (t = pos; t < endp; t++) {
 291                 struct buffer_head *rabh;
 292
 293                 if (!*t)
 294                         continue;
 295
 296                 rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
 297                 if (trylock_buffer(rabh)) {
 298                         if (!buffer_uptodate(rabh)) {
 299                                 rabh->b_end_io = end_buffer_read_sync;
 300                                 submit_bh(REQ_OP_READ,
 301                                           REQ_RAHEAD | REQ_META | REQ_PRIO,
 302                                           rabh);
 303                                 continue;
 304                         }
 305                         unlock_buffer(rabh);
 306                 }
 307                 brelse(rabh);
 308         }
 309 }
 310
 311 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
 312                              unsigned int x, unsigned int h)
 313 {
 314         for (; x < h; x++) {
 315                 __be64 *ptr = metapointer(x, mp);
 316                 u64 dblock = be64_to_cpu(*ptr);
 317                 int ret;
 318
 319                 if (!dblock)
 320                         break;
 321                 ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]);
 322                 if (ret)
 323                         return ret;
 324         }
 325         mp->mp_aheight = x + 1;
 326         return 0;
 327 }
 328
 329 /**
 330  * lookup_metapath - Walk the metadata tree to a specific point
 331  * @ip: The inode
 332  * @mp: The metapath
 333  *
 334  * Assumes that the inode's buffer has already been looked up and
 335  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 336  * by find_metapath().
 337  *
 338  * If this function encounters part of the tree which has not been
 339  * allocated, it returns the current height of the tree at the point
 340  * at which it found the unallocated block. Blocks which are found are
 341  * added to the mp->mp_bh[] list.
 342  *
 343  * Returns: error
 344  */
 345
 346 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 347 {
 348         return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
 349 }
 350
 351 /**
 352  * fillup_metapath - fill up buffers for the metadata path to a specific height
 353  * @ip: The inode
 354  * @mp: The metapath
 355  * @h: The height to which it should be mapped
 356  *
 357  * Similar to lookup_metapath, but does lookups for a range of heights
 358  *
 359  * Returns: error or the number of buffers filled
 360  */
 361
 362 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 363 {
 364         unsigned int x = 0;
 365         int ret;
 366
 367         if (h) {
 368                 /* find the first buffer we need to look up. */
 369                 for (x = h - 1; x > 0; x--) {
 370                         if (mp->mp_bh[x])
 371                                 break;
 372                 }
 373         }
 374         ret = __fillup_metapath(ip, mp, x, h);
 375         if (ret)
 376                 return ret;
 377         return mp->mp_aheight - x - 1;
 378 }
 379
 380 static inline void release_metapath(struct metapath *mp)
 381 {
 382         int i;
 383
 384         for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
 385                 if (mp->mp_bh[i] == NULL)
 386                         break;
 387                 brelse(mp->mp_bh[i]);
 388         }
 389 }
 390
 391 /**
 392  * gfs2_extent_length - Returns length of an extent of blocks
 393  * @start: Start of the buffer
 394  * @len: Length of the buffer in bytes
 395  * @ptr: Current position in the buffer
 396  * @limit: Max extent length to return (0 = unlimited)
 397  * @eob: Set to 1 if we hit "end of block"
 398  *
 399  * If the first block is zero (unallocated) it will return the number of
 400  * unallocated blocks in the extent, otherwise it will return the number
 401  * of contiguous blocks in the extent.
 402  *
 403  * Returns: The length of the extent (minimum of one block)
 404  */
 405
 406 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
 407 {
 408         const __be64 *end = (start + len);
 409         const __be64 *first = ptr;
 410         u64 d = be64_to_cpu(*ptr);
 411
 412         *eob = 0;
 413         do {
 414                 ptr++;
 415                 if (ptr >= end)
 416                         break;
 417                 if (limit && --limit == 0)
 418                         break;
 419                 if (d)
 420                         d++;
 421         } while(be64_to_cpu(*ptr) == d);
 422         if (ptr >= end)
 423                 *eob = 1;
 424         return (ptr - first);
 425 }
 426
 427 static inline void bmap_lock(struct gfs2_inode *ip, int create)
 428 {
 429         if (create)
 430                 down_write(&ip->i_rw_mutex);
 431         else
 432                 down_read(&ip->i_rw_mutex);
 433 }
 434
 435 static inline void bmap_unlock(struct gfs2_inode *ip, int create)
 436 {
 437         if (create)
 438                 up_write(&ip->i_rw_mutex);
 439         else
 440                 up_read(&ip->i_rw_mutex);
 441 }
 442
 443 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
 444                                          struct gfs2_glock *gl, unsigned int i,
 445                                          unsigned offset, u64 bn)
 446 {
 447         __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
 448                        ((i > 1) ? sizeof(struct gfs2_meta_header) :
 449                                  sizeof(struct gfs2_dinode)));
 450         BUG_ON(i < 1);
 451         BUG_ON(mp->mp_bh[i] != NULL);
 452         mp->mp_bh[i] = gfs2_meta_new(gl, bn);
 453         gfs2_trans_add_meta(gl, mp->mp_bh[i]);
 454         gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 455         gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
 456         ptr += offset;
 457         *ptr = cpu_to_be64(bn);
 458         return ptr;
 459 }
 460
/* States of the allocation state machine in gfs2_iomap_alloc() */
enum alloc_state {
	ALLOC_DATA = 0,		/* tree complete, adding data blocks */
	ALLOC_GROW_DEPTH = 1,	/* branching from the existing tree */
	ALLOC_GROW_HEIGHT = 2,	/* growing the height of the tree */
	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};
 467
 468 static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
 469 {
 470         if (hgt)
 471                 return sdp->sd_inptrs;
 472         return sdp->sd_diptrs;
 473 }
 474
/**
 * gfs2_iomap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
 * @iomap: The iomap. On entry iomap->length bounds the number of data
 *         blocks to allocate and iomap->addr is expected to still be
 *         IOMAP_NULL_ADDR; on return iomap->addr, iomap->length and
 *         iomap->flags describe the newly allocated extent
 * @flags: iomap flags (IOMAP_ZERO asks for the new data blocks to be zeroed)
 * @mp: The metapath, with proper height information calculated
 *
 * In this routine we may have to alloc:
 *   i) Indirect blocks to grow the metadata tree height
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
 * The function is in two parts. The first part works out the total
 * number of blocks which we need. The second part does the actual
 * allocation asking for an extent at a time (if enough contiguous free
 * blocks are available, there will only be one request per call)
 * and uses the state machine to initialise the blocks in order.
 *
 * Returns: errno on error
 */

static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
			    unsigned flags, struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct super_block *sb = sdp->sd_vfs;
	struct buffer_head *dibh = mp->mp_bh[0];
	u64 bn;
	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
	unsigned dblks = 0;
	unsigned ptrs_per_blk;
	/* Height of the buffer that holds the pointers to the data blocks */
	const unsigned end_of_metadata = mp->mp_fheight - 1;
	int ret;
	enum alloc_state state;
	__be64 *ptr;
	__be64 zero_bn = 0;
	/* Upper bound on data blocks, taken from the requested length */
	size_t maxlen = iomap->length >> inode->i_blkbits;

	BUG_ON(mp->mp_aheight < 1);
	BUG_ON(dibh == NULL);

	gfs2_trans_add_meta(ip->i_gl, dibh);

	/* First part: work out how many data and indirect blocks we need */
	if (mp->mp_fheight == mp->mp_aheight) {
		struct buffer_head *bh;
		int eob;

		/* Bottom indirect block exists, find unalloced extent size */
		ptr = metapointer(end_of_metadata, mp);
		bh = mp->mp_bh[end_of_metadata];
		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr,
					   maxlen, &eob);
		BUG_ON(dblks < 1);
		state = ALLOC_DATA;
	} else {
		/* Need to allocate indirect blocks */
		ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs :
			sdp->sd_diptrs;
		/* Data blocks are limited to the rest of the bottom block */
		dblks = min(maxlen, (size_t)(ptrs_per_blk -
					     mp->mp_list[end_of_metadata]));
		if (mp->mp_fheight == ip->i_height) {
			/* Writing into existing tree, extend tree down */
			iblks = mp->mp_fheight - mp->mp_aheight;
			state = ALLOC_GROW_DEPTH;
		} else {
			/* Building up tree height */
			state = ALLOC_GROW_HEIGHT;
			iblks = mp->mp_fheight - ip->i_height;
			branch_start = metapath_branch_start(mp);
			iblks += (mp->mp_fheight - branch_start);
		}
	}

	/* start of the second part of the function (state machine) */

	blks = dblks + iblks;
	i = mp->mp_aheight;
	/*
	 * Each pass allocates one extent of up to (blks - alloced) blocks and
	 * feeds it through the states in order.  The loop ends once the
	 * ALLOC_DATA state has stored the extent address in iomap->addr.
	 */
	do {
		int error;
		n = blks - alloced;
		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
		if (error)
			return error;
		alloced += n;
		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
			gfs2_trans_add_unrevoke(sdp, bn, n);
		switch (state) {
		/* Growing height of tree */
		case ALLOC_GROW_HEIGHT:
			if (i == 1) {
				/*
				 * Save the dinode's first pointer: the
				 * gfs2_indirect_init() call for height 1
				 * below overwrites that slot.
				 */
				ptr = (__be64 *)(dibh->b_data +
						 sizeof(struct gfs2_dinode));
				zero_bn = *ptr;
			}
			for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
			     i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
			if (i - 1 == mp->mp_fheight - ip->i_height) {
				/*
				 * All new top levels exist: move the old
				 * dinode pointers into the lowest new block,
				 * restore its first slot from the saved
				 * value and leave only the first pointer in
				 * the dinode.
				 */
				i--;
				gfs2_buffer_copy_tail(mp->mp_bh[i],
						sizeof(struct gfs2_meta_header),
						dibh, sizeof(struct gfs2_dinode));
				gfs2_buffer_clear_tail(dibh,
						sizeof(struct gfs2_dinode) +
						sizeof(__be64));
				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
					sizeof(struct gfs2_meta_header));
				*ptr = zero_bn;
				state = ALLOC_GROW_DEPTH;
				/* Drop the buffers from branch_start down */
				for(i = branch_start; i < mp->mp_fheight; i++) {
					if (mp->mp_bh[i] == NULL)
						break;
					brelse(mp->mp_bh[i]);
					mp->mp_bh[i] = NULL;
				}
				i = branch_start;
			}
			if (n == 0)
				break;
		/* fall through - blocks left over, keep going in next state */
		/* Branching from existing tree */
		case ALLOC_GROW_DEPTH:
			if (i > 1 && i < mp->mp_fheight)
				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
			for (; i < mp->mp_fheight && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i,
						   mp->mp_list[i-1], bn++);
			if (i == mp->mp_fheight)
				state = ALLOC_DATA;
			if (n == 0)
				break;
		/* fall through - blocks left over, keep going in next state */
		/* Tree complete, adding data blocks */
		case ALLOC_DATA:
			BUG_ON(n > dblks);
			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			/* Only the n blocks of this extent get mapped */
			dblks = n;
			ptr = metapointer(end_of_metadata, mp);
			iomap->addr = bn << inode->i_blkbits;
			iomap->flags |= IOMAP_F_NEW;
			while (n-- > 0)
				*ptr++ = cpu_to_be64(bn++);
			if (flags & IOMAP_ZERO) {
				ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits,
						       dblks, GFP_NOFS);
				if (ret) {
					/* Logged only; the allocation still succeeds */
					fs_err(sdp,
					       "Failed to zero data buffers\n");
					flags &= ~IOMAP_ZERO;
				}
			}
			break;
		}
	} while (iomap->addr == IOMAP_NULL_ADDR);

	/* Report the extent and write the updated inode fields to the dinode */
	iomap->length = (u64)dblks << inode->i_blkbits;
	ip->i_height = mp->mp_fheight;
	gfs2_add_inode_blocks(&ip->i_inode, alloced);
	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
	return 0;
}
 640
 641 /**
 642  * hole_size - figure out the size of a hole
 643  * @inode: The inode
 644  * @lblock: The logical starting block number
 645  * @mp: The metapath
 646  *
 647  * Returns: The hole size in bytes
 648  *
 649  */
 650 static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp)
 651 {
 652         struct gfs2_inode *ip = GFS2_I(inode);
 653         struct gfs2_sbd *sdp = GFS2_SB(inode);
 654         struct metapath mp_eof;
 655         u64 factor = 1;
 656         int hgt;
 657         u64 holesz = 0;
 658         const __be64 *first, *end, *ptr;
 659         const struct buffer_head *bh;
 660         u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits;
 661         int zeroptrs;
 662         bool done = false;
 663
 664         /* Get another metapath, to the very last byte */
 665         find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height);
 666         for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) {
 667                 bh = mp->mp_bh[hgt];
 668                 if (bh) {
 669                         zeroptrs = 0;
 670                         first = metapointer(hgt, mp);
 671                         end = (const __be64 *)(bh->b_data + bh->b_size);
 672
 673                         for (ptr = first; ptr < end; ptr++) {
 674                                 if (*ptr) {
 675                                         done = true;
 676                                         break;
 677                                 } else {
 678                                         zeroptrs++;
 679                                 }
 680                         }
 681                 } else {
 682                         zeroptrs = sdp->sd_inptrs;
 683                 }
 684                 if (factor * zeroptrs >= lblock_stop - lblock + 1) {
 685                         holesz = lblock_stop - lblock + 1;
 686                         break;
 687                 }
 688                 holesz += factor * zeroptrs;
 689
 690                 factor *= sdp->sd_inptrs;
 691                 if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1]))
 692                         (mp->mp_list[hgt - 1])++;
 693         }
 694         return holesz << inode->i_blkbits;
 695 }
 696
 697 static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
 698 {
 699         struct gfs2_inode *ip = GFS2_I(inode);
 700
 701         iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
 702                       sizeof(struct gfs2_dinode);
 703         iomap->offset = 0;
 704         iomap->length = i_size_read(inode);
 705         iomap->type = IOMAP_MAPPED;
 706         iomap->flags = IOMAP_F_DATA_INLINE;
 707 }
 708
 709 /**
 710  * gfs2_iomap_begin - Map blocks from an inode to disk blocks
 711  * @inode: The inode
 712  * @pos: Starting position in bytes
 713  * @length: Length to map, in bytes
 714  * @flags: iomap flags
 715  * @iomap: The iomap structure
 716  *
 717  * Returns: errno
 718  */
 719 int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 720                      unsigned flags, struct iomap *iomap)
 721 {
 722         struct gfs2_inode *ip = GFS2_I(inode);
 723         struct gfs2_sbd *sdp = GFS2_SB(inode);
 724         struct metapath mp = { .mp_aheight = 1, };
 725         unsigned int factor = sdp->sd_sb.sb_bsize;
 726         const u64 *arr = sdp->sd_heightsize;
 727         __be64 *ptr;
 728         sector_t lblock;
 729         sector_t lend;
 730         int ret;
 731         int eob;
 732         unsigned int len;
 733         struct buffer_head *bh;
 734         u8 height;
 735
 736         trace_gfs2_iomap_start(ip, pos, length, flags);
 737         if (!length) {
 738                 ret = -EINVAL;
 739                 goto out;
 740         }
 741
 742         if ((flags & IOMAP_REPORT) && gfs2_is_stuffed(ip)) {
 743                 gfs2_stuffed_iomap(inode, iomap);
 744                 if (pos >= iomap->length)
 745                         return -ENOENT;
 746                 ret = 0;
 747                 goto out;
 748         }
 749
 750         lblock = pos >> inode->i_blkbits;
 751         lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits;
 752
 753         iomap->offset = lblock << inode->i_blkbits;
 754         iomap->addr = IOMAP_NULL_ADDR;
 755         iomap->type = IOMAP_HOLE;
 756         iomap->length = (u64)(lend - lblock) << inode->i_blkbits;
 757         iomap->flags = IOMAP_F_MERGED;
 758         bmap_lock(ip, 0);
 759
 760         /*
 761          * Directory data blocks have a struct gfs2_meta_header header, so the
 762          * remaining size is smaller than the filesystem block size.  Logical
 763          * block numbers for directories are in units of this remaining size!
 764          */
 765         if (gfs2_is_dir(ip)) {
 766                 factor = sdp->sd_jbsize;
 767                 arr = sdp->sd_jheightsize;
 768         }
 769
 770         ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
 771         if (ret)
 772                 goto out_release;
 773
 774         height = ip->i_height;
 775         while ((lblock + 1) * factor > arr[height])
 776                 height++;
 777         find_metapath(sdp, lblock, &mp, height);
 778         if (height > ip->i_height || gfs2_is_stuffed(ip))
 779                 goto do_alloc;
 780
 781         ret = lookup_metapath(ip, &mp);
 782         if (ret)
 783                 goto out_release;
 784
 785         if (mp.mp_aheight != ip->i_height)
 786                 goto do_alloc;
 787
 788         ptr = metapointer(ip->i_height - 1, &mp);
 789         if (*ptr == 0)
 790                 goto do_alloc;
 791
 792         iomap->type = IOMAP_MAPPED;
 793         iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
 794
 795         bh = mp.mp_bh[ip->i_height - 1];
 796         len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
 797         if (eob)
 798                 iomap->flags |= IOMAP_F_BOUNDARY;
 799         iomap->length = (u64)len << inode->i_blkbits;
 800
 801         ret = 0;
 802
 803 out_release:
 804         release_metapath(&mp);
 805         bmap_unlock(ip, 0);
 806 out:
 807         trace_gfs2_iomap_end(ip, iomap, ret);
 808         return ret;
 809
 810 do_alloc:
 811         if (!(flags & IOMAP_WRITE)) {
 812                 if (pos >= i_size_read(inode)) {
 813                         ret = -ENOENT;
 814                         goto out_release;
 815                 }
 816                 ret = 0;
 817                 iomap->length = hole_size(inode, lblock, &mp);
 818                 goto out_release;
 819         }
 820
 821         ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
 822         goto out_release;
 823 }
 824
 825 /**
 826  * gfs2_block_map - Map a block from an inode to a disk block
 827  * @inode: The inode
 828  * @lblock: The logical block number
 829  * @bh_map: The bh to be mapped
 830  * @create: True if its ok to alloc blocks to satify the request
 831  *
 832  * Sets buffer_mapped() if successful, sets buffer_boundary() if a
 833  * read of metadata will be required before the next block can be
 834  * mapped. Sets buffer_new() if new blocks were allocated.
 835  *
 836  * Returns: errno
 837  */
 838
 839 int gfs2_block_map(struct inode *inode, sector_t lblock,
 840                    struct buffer_head *bh_map, int create)
 841 {
 842         struct gfs2_inode *ip = GFS2_I(inode);
 843         struct iomap iomap;
 844         int ret, flags = 0;
 845
 846         clear_buffer_mapped(bh_map);
 847         clear_buffer_new(bh_map);
 848         clear_buffer_boundary(bh_map);
 849         trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
 850
 851         if (create)
 852                 flags |= IOMAP_WRITE;
 853         if (buffer_zeronew(bh_map))
 854                 flags |= IOMAP_ZERO;
 855         ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits,
 856                                bh_map->b_size, flags, &iomap);
 857         if (ret) {
 858                 if (!create && ret == -ENOENT) {
 859                         /* Return unmapped buffer beyond the end of file.  */
 860                         ret = 0;
 861                 }
 862                 goto out;
 863         }
 864
 865         if (iomap.length > bh_map->b_size) {
 866                 iomap.length = bh_map->b_size;
 867                 iomap.flags &= ~IOMAP_F_BOUNDARY;
 868         }
 869         if (iomap.addr != IOMAP_NULL_ADDR)
 870                 map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
 871         bh_map->b_size = iomap.length;
 872         if (iomap.flags & IOMAP_F_BOUNDARY)
 873                 set_buffer_boundary(bh_map);
 874         if (iomap.flags & IOMAP_F_NEW)
 875                 set_buffer_new(bh_map);
 876
 877 out:
 878         trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
 879         return ret;
 880 }
 881
 882 /*
 883  * Deprecated: do not use in new code
 884  */
 885 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
 886 {
 887         struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
 888         int ret;
 889         int create = *new;
 890
 891         BUG_ON(!extlen);
 892         BUG_ON(!dblock);
 893         BUG_ON(!new);
 894
 895         bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
 896         ret = gfs2_block_map(inode, lblock, &bh, create);
 897         *extlen = bh.b_size >> inode->i_blkbits;
 898         *dblock = bh.b_blocknr;
 899         if (buffer_new(&bh))
 900                 *new = 1;
 901         else
 902                 *new = 0;
 903         return ret;
 904 }
 905
 906 /**
 907  * gfs2_block_truncate_page - Deal with zeroing out data for truncate
 908  *
 909  * This is partly borrowed from ext3.
 910  */
 911 static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
 912 {
 913         struct inode *inode = mapping->host;
 914         struct gfs2_inode *ip = GFS2_I(inode);
 915         unsigned long index = from >> PAGE_SHIFT;
 916         unsigned offset = from & (PAGE_SIZE-1);
 917         unsigned blocksize, iblock, length, pos;
 918         struct buffer_head *bh;
 919         struct page *page;
 920         int err;
 921
 922         page = find_or_create_page(mapping, index, GFP_NOFS);
 923         if (!page)
 924                 return 0;
 925
 926         blocksize = inode->i_sb->s_blocksize;
 927         length = blocksize - (offset & (blocksize - 1));
 928         iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
 929
 930         if (!page_has_buffers(page))
 931                 create_empty_buffers(page, blocksize, 0);
 932
 933         /* Find the buffer that contains "offset" */
 934         bh = page_buffers(page);
 935         pos = blocksize;
 936         while (offset >= pos) {
 937                 bh = bh->b_this_page;
 938                 iblock++;
 939                 pos += blocksize;
 940         }
 941
 942         err = 0;
 943
 944         if (!buffer_mapped(bh)) {
 945                 gfs2_block_map(inode, iblock, bh, 0);
 946                 /* unmapped? It's a hole - nothing to do */
 947                 if (!buffer_mapped(bh))
 948                         goto unlock;
 949         }
 950
 951         /* Ok, it's mapped. Make sure it's up-to-date */
 952         if (PageUptodate(page))
 953                 set_buffer_uptodate(bh);
 954
 955         if (!buffer_uptodate(bh)) {
 956                 err = -EIO;
 957                 ll_rw_block(REQ_OP_READ, 0, 1, &bh);
 958                 wait_on_buffer(bh);
 959                 /* Uhhuh. Read error. Complain and punt. */
 960                 if (!buffer_uptodate(bh))
 961                         goto unlock;
 962                 err = 0;
 963         }
 964
 965         if (!gfs2_is_writeback(ip))
 966                 gfs2_trans_add_data(ip->i_gl, bh);
 967
 968         zero_user(page, offset, length);
 969         mark_buffer_dirty(bh);
 970 unlock:
 971         unlock_page(page);
 972         put_page(page);
 973         return err;
 974 }
 975
 976 #define GFS2_JTRUNC_REVOKES 8192
 977
 978 /**
 979  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
 980  * @inode: The inode being truncated
 981  * @oldsize: The original (larger) size
 982  * @newsize: The new smaller size
 983  *
 984  * With jdata files, we have to journal a revoke for each block which is
 985  * truncated. As a result, we need to split this into separate transactions
 986  * if the number of pages being truncated gets too large.
 987  */
 988
 989 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
 990 {
 991         struct gfs2_sbd *sdp = GFS2_SB(inode);
 992         u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
 993         u64 chunk;
 994         int error;
 995
 996         while (oldsize != newsize) {
 997                 struct gfs2_trans *tr;
 998                 unsigned int offs;
 999
1000                 chunk = oldsize - newsize;
1001                 if (chunk > max_chunk)
1002                         chunk = max_chunk;
1003
1004                 offs = oldsize & ~PAGE_MASK;
1005                 if (offs && chunk > PAGE_SIZE)
1006                         chunk = offs + ((chunk - offs) & PAGE_MASK);
1007
1008                 truncate_pagecache(inode, oldsize - chunk);
1009                 oldsize -= chunk;
1010
1011                 tr = current->journal_info;
1012                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1013                         continue;
1014
1015                 gfs2_trans_end(sdp);
1016                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1017                 if (error)
1018                         return error;
1019         }
1020
1021         return 0;
1022 }
1023
1024 static int trunc_start(struct inode *inode, u64 newsize)
1025 {
1026         struct gfs2_inode *ip = GFS2_I(inode);
1027         struct gfs2_sbd *sdp = GFS2_SB(inode);
1028         struct address_space *mapping = inode->i_mapping;
1029         struct buffer_head *dibh = NULL;
1030         int journaled = gfs2_is_jdata(ip);
1031         u64 oldsize = inode->i_size;
1032         int error;
1033
1034         if (journaled)
1035                 error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1036         else
1037                 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1038         if (error)
1039                 return error;
1040
1041         error = gfs2_meta_inode_buffer(ip, &dibh);
1042         if (error)
1043                 goto out;
1044
1045         gfs2_trans_add_meta(ip->i_gl, dibh);
1046
1047         if (gfs2_is_stuffed(ip)) {
1048                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1049         } else {
1050                 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
1051                         error = gfs2_block_truncate_page(mapping, newsize);
1052                         if (error)
1053                                 goto out;
1054                 }
1055                 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1056         }
1057
1058         i_size_write(inode, newsize);
1059         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1060         gfs2_dinode_out(ip, dibh->b_data);
1061
1062         if (journaled)
1063                 error = gfs2_journaled_truncate(inode, oldsize, newsize);
1064         else
1065                 truncate_pagecache(inode, newsize);
1066
1067 out:
1068         brelse(dibh);
1069         if (current->journal_info)
1070                 gfs2_trans_end(sdp);
1071         return error;
1072 }
1073
1074 /**
1075  * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1076  * @ip: inode
1077  * @rg_gh: holder of resource group glock
1078  * @mp: current metapath fully populated with buffers
1079  * @btotal: place to keep count of total blocks freed
1080  * @hgt: height we're processing
1081  * @first: true if this is the first call to this function for this height
1082  *
1083  * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1084  * free, and free them all. However, we do it one rgrp at a time. If this
1085  * block has references to multiple rgrps, we break it into individual
1086  * transactions. This allows other processes to use the rgrps while we're
1087  * focused on a single one, for better concurrency / performance.
1088  * At every transaction boundary, we rewrite the inode into the journal.
1089  * That way the bitmaps are kept consistent with the inode and we can recover
1090  * if we're interrupted by power-outages.
1091  *
1092  * Returns: 0, or return code if an error occurred.
1093  *          *btotal has the total number of blocks freed
1094  */
1095 static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1096                               const struct metapath *mp, u32 *btotal, int hgt,
1097                               bool preserve1)
1098 {
1099         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1100         struct gfs2_rgrpd *rgd;
1101         struct gfs2_trans *tr;
1102         struct buffer_head *bh = mp->mp_bh[hgt];
1103         __be64 *top, *bottom, *p;
1104         int blks_outside_rgrp;
1105         u64 bn, bstart, isize_blks;
1106         s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1107         int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
1108         int ret = 0;
1109         bool buf_in_tr = false; /* buffer was added to transaction */
1110
1111         if (gfs2_metatype_check(sdp, bh,
1112                                 (hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
1113                 return -EIO;
1114
1115 more_rgrps:
1116         blks_outside_rgrp = 0;
1117         bstart = 0;
1118         blen = 0;
1119         top = metapointer(hgt, mp); /* first ptr from metapath */
1120         /* If we're keeping some data at the truncation point, we've got to
1121            preserve the metadata tree by adding 1 to the starting metapath. */
1122         if (preserve1)
1123                 top++;
1124
1125         bottom = (__be64 *)(bh->b_data + bh->b_size);
1126
1127         for (p = top; p < bottom; p++) {
1128                 if (!*p)
1129                         continue;
1130                 bn = be64_to_cpu(*p);
1131                 if (gfs2_holder_initialized(rd_gh)) {
1132                         rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1133                         gfs2_assert_withdraw(sdp,
1134                                      gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1135                 } else {
1136                         rgd = gfs2_blk2rgrpd(sdp, bn, true);
1137                         ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1138                                                  0, rd_gh);
1139                         if (ret)
1140                                 goto out;
1141
1142                         /* Must be done with the rgrp glock held: */
1143                         if (gfs2_rs_active(&ip->i_res) &&
1144                             rgd == ip->i_res.rs_rbm.rgd)
1145                                 gfs2_rs_deltree(&ip->i_res);
1146                 }
1147
1148                 if (!rgrp_contains_block(rgd, bn)) {
1149                         blks_outside_rgrp++;
1150                         continue;
1151                 }
1152
1153                 /* The size of our transactions will be unknown until we
1154                    actually process all the metadata blocks that relate to
1155                    the rgrp. So we estimate. We know it can't be more than
1156                    the dinode's i_blocks and we don't want to exceed the
1157                    journal flush threshold, sd_log_thresh2. */
1158                 if (current->journal_info == NULL) {
1159                         unsigned int jblocks_rqsted, revokes;
1160
1161                         jblocks_rqsted = rgd->rd_length + RES_DINODE +
1162                                 RES_INDIRECT;
1163                         isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1164                         if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1165                                 jblocks_rqsted +=
1166                                         atomic_read(&sdp->sd_log_thresh2);
1167                         else
1168                                 jblocks_rqsted += isize_blks;
1169                         revokes = jblocks_rqsted;
1170                         if (meta)
1171                                 revokes += hptrs(sdp, hgt);
1172                         else if (ip->i_depth)
1173                                 revokes += sdp->sd_inptrs;
1174                         ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1175                         if (ret)
1176                                 goto out_unlock;
1177                         down_write(&ip->i_rw_mutex);
1178                 }
1179                 /* check if we will exceed the transaction blocks requested */
1180                 tr = current->journal_info;
1181                 if (tr->tr_num_buf_new + RES_STATFS +
1182                     RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1183                         /* We set blks_outside_rgrp to ensure the loop will
1184                            be repeated for the same rgrp, but with a new
1185                            transaction. */
1186                         blks_outside_rgrp++;
1187                         /* This next part is tricky. If the buffer was added
1188                            to the transaction, we've already set some block
1189                            pointers to 0, so we better follow through and free
1190                            them, or we will introduce corruption (so break).
1191                            This may be impossible, or at least rare, but I
1192                            decided to cover the case regardless.
1193
1194                            If the buffer was not added to the transaction
1195                            (this call), doing so would exceed our transaction
1196                            size, so we need to end the transaction and start a
1197                            new one (so goto). */
1198
1199                         if (buf_in_tr)
1200                                 break;
1201                         goto out_unlock;
1202                 }
1203
1204                 gfs2_trans_add_meta(ip->i_gl, bh);
1205                 buf_in_tr = true;
1206                 *p = 0;
1207                 if (bstart + blen == bn) {
1208                         blen++;
1209                         continue;
1210                 }
1211                 if (bstart) {
1212                         __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1213                         (*btotal) += blen;
1214                         gfs2_add_inode_blocks(&ip->i_inode, -blen);
1215                 }
1216                 bstart = bn;
1217                 blen = 1;
1218         }
1219         if (bstart) {
1220                 __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1221                 (*btotal) += blen;
1222                 gfs2_add_inode_blocks(&ip->i_inode, -blen);
1223         }
1224 out_unlock:
1225         if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1226                                             outside the rgrp we just processed,
1227                                             do it all over again. */
1228                 if (current->journal_info) {
1229                         struct buffer_head *dibh = mp->mp_bh[0];
1230
1231                         /* Every transaction boundary, we rewrite the dinode
1232                            to keep its di_blocks current in case of failure. */
1233                         ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1234                                 current_time(&ip->i_inode);
1235                         gfs2_trans_add_meta(ip->i_gl, dibh);
1236                         gfs2_dinode_out(ip, dibh->b_data);
1237                         up_write(&ip->i_rw_mutex);
1238                         gfs2_trans_end(sdp);
1239                 }
1240                 gfs2_glock_dq_uninit(rd_gh);
1241                 cond_resched();
1242                 goto more_rgrps;
1243         }
1244 out:
1245         return ret;
1246 }
1247
1248 /**
1249  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1250  * assumes the metapath is valid (with buffers) out to height h
1251  * @mp: starting metapath
1252  * @h: desired height to search
1253  *
1254  * Returns: true if a non-null pointer was found in the metapath buffer
1255  *          false if all remaining pointers are NULL in the buffer
1256  */
1257 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1258                              unsigned int h)
1259 {
1260         __be64 *ptr;
1261         unsigned int ptrs = hptrs(sdp, h) - 1;
1262
1263         while (true) {
1264                 ptr = metapointer(h, mp);
1265                 if (*ptr) { /* if we have a non-null pointer */
1266                         /* Now zero the metapath after the current height. */
1267                         h++;
1268                         if (h < GFS2_MAX_META_HEIGHT)
1269                                 memset(&mp->mp_list[h], 0,
1270                                        (GFS2_MAX_META_HEIGHT - h) *
1271                                        sizeof(mp->mp_list[0]));
1272                         return true;
1273                 }
1274
1275                 if (mp->mp_list[h] < ptrs)
1276                         mp->mp_list[h]++;
1277                 else
1278                         return false; /* no more pointers in this buffer */
1279         }
1280 }
1281
/* States of the trunc_dealloc() state machine, numbered from 0. */
enum dealloc_states {
	DEALLOC_MP_FULL,	/* Strip a metapath with all buffers read in */
	DEALLOC_MP_LOWER,	/* lower the metapath strip height */
	DEALLOC_FILL_MP,	/* Fill in the metapath to the given height. */
	DEALLOC_DONE,		/* process complete */
};
1288
1289 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *nbof, unsigned int h)
1290 {
1291         if (memcmp(mp->mp_list, nbof, h * sizeof(mp->mp_list[0])))
1292                 return false;
1293         return true;
1294 }
1295
1296 /**
1297  * trunc_dealloc - truncate a file down to a desired size
1298  * @ip: inode to truncate
1299  * @newsize: The desired size of the file
1300  *
1301  * This function truncates a file to newsize. It works from the
1302  * bottom up, and from the right to the left. In other words, it strips off
1303  * the highest layer (data) before stripping any of the metadata. Doing it
1304  * this way is best in case the operation is interrupted by power failure, etc.
1305  * The dinode is rewritten in every transaction to guarantee integrity.
1306  */
1307 static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
1308 {
1309         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1310         struct metapath mp;
1311         struct buffer_head *dibh, *bh;
1312         struct gfs2_holder rd_gh;
1313         u64 lblock;
1314         __u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
1315         unsigned int strip_h = ip->i_height - 1;
1316         u32 btotal = 0;
1317         int ret, state;
1318         int mp_h; /* metapath buffers are read in to this height */
1319         u64 prev_bnr = 0;
1320         bool preserve1; /* need to preserve the first meta pointer? */
1321
1322         if (!newsize)
1323                 lblock = 0;
1324         else
1325                 lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
1326
1327         memset(&mp, 0, sizeof(mp));
1328         find_metapath(sdp, lblock, &mp, ip->i_height);
1329
1330         memcpy(&nbof, &mp.mp_list, sizeof(nbof));
1331
1332         ret = gfs2_meta_inode_buffer(ip, &dibh);
1333         if (ret)
1334                 return ret;
1335
1336         mp.mp_bh[0] = dibh;
1337         ret = lookup_metapath(ip, &mp);
1338         if (ret)
1339                 goto out_metapath;
1340
1341         /* issue read-ahead on metadata */
1342         for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++)
1343                 gfs2_metapath_ra(ip->i_gl, &mp, mp_h);
1344
1345         if (mp.mp_aheight == ip->i_height)
1346                 state = DEALLOC_MP_FULL; /* We have a complete metapath */
1347         else
1348                 state = DEALLOC_FILL_MP; /* deal with partial metapath */
1349
1350         ret = gfs2_rindex_update(sdp);
1351         if (ret)
1352                 goto out_metapath;
1353
1354         ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1355         if (ret)
1356                 goto out_metapath;
1357         gfs2_holder_mark_uninitialized(&rd_gh);
1358
1359         mp_h = strip_h;
1360
1361         while (state != DEALLOC_DONE) {
1362                 switch (state) {
1363                 /* Truncate a full metapath at the given strip height.
1364                  * Note that strip_h == mp_h in order to be in this state. */
1365                 case DEALLOC_MP_FULL:
1366                         /* If we're truncating to a non-zero size and the mp is
1367                            at the beginning of file for the strip height, we
1368                            need to preserve the first metadata pointer. */
1369                         preserve1 = (newsize && mp_eq_to_hgt(&mp, nbof, mp_h));
1370                         bh = mp.mp_bh[mp_h];
1371                         gfs2_assert_withdraw(sdp, bh);
1372                         if (gfs2_assert_withdraw(sdp,
1373                                                  prev_bnr != bh->b_blocknr)) {
1374                                 printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
1375                                        "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
1376                                        sdp->sd_fsname,
1377                                        (unsigned long long)ip->i_no_addr,
1378                                        prev_bnr, ip->i_height, strip_h, mp_h);
1379                         }
1380                         prev_bnr = bh->b_blocknr;
1381                         ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
1382                                                  mp_h, preserve1);
1383                         /* If we hit an error or just swept dinode buffer,
1384                            just exit. */
1385                         if (ret || !mp_h) {
1386                                 state = DEALLOC_DONE;
1387                                 break;
1388                         }
1389                         state = DEALLOC_MP_LOWER;
1390                         break;
1391
1392                 /* lower the metapath strip height */
1393                 case DEALLOC_MP_LOWER:
1394                         /* We're done with the current buffer, so release it,
1395                            unless it's the dinode buffer. Then back up to the
1396                            previous pointer. */
1397                         if (mp_h) {
1398                                 brelse(mp.mp_bh[mp_h]);
1399                                 mp.mp_bh[mp_h] = NULL;
1400                         }
1401                         /* If we can't get any lower in height, we've stripped
1402                            off all we can. Next step is to back up and start
1403                            stripping the previous level of metadata. */
1404                         if (mp_h == 0) {
1405                                 strip_h--;
1406                                 memcpy(&mp.mp_list, &nbof, sizeof(nbof));
1407                                 mp_h = strip_h;
1408                                 state = DEALLOC_FILL_MP;
1409                                 break;
1410                         }
1411                         mp.mp_list[mp_h] = 0;
1412                         mp_h--; /* search one metadata height down */
1413                         if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
1414                                 break; /* loop around in the same state */
1415                         mp.mp_list[mp_h]++;
1416                         /* Here we've found a part of the metapath that is not
1417                          * allocated. We need to search at that height for the
1418                          * next non-null pointer. */
1419                         if (find_nonnull_ptr(sdp, &mp, mp_h)) {
1420                                 state = DEALLOC_FILL_MP;
1421                                 mp_h++;
1422                         }
1423                         /* No more non-null pointers at this height. Back up
1424                            to the previous height and try again. */
1425                         break; /* loop around in the same state */
1426
1427                 /* Fill the metapath with buffers to the given height. */
1428                 case DEALLOC_FILL_MP:
1429                         /* Fill the buffers out to the current height. */
1430                         ret = fillup_metapath(ip, &mp, mp_h);
1431                         if (ret < 0)
1432                                 goto out;
1433
1434                         /* issue read-ahead on metadata */
1435                         if (mp.mp_aheight > 1) {
1436                                 for (; ret > 1; ret--)
1437                                         gfs2_metapath_ra(ip->i_gl, &mp,
1438                                                 mp.mp_aheight - ret);
1439                         }
1440
1441                         /* If buffers found for the entire strip height */
1442                         if (mp.mp_aheight - 1 == strip_h) {
1443                                 state = DEALLOC_MP_FULL;
1444                                 break;
1445                         }
1446                         if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1447                                 mp_h = mp.mp_aheight - 1;
1448
1449                         /* If we find a non-null block pointer, crawl a bit
1450                            higher up in the metapath and try again, otherwise
1451                            we need to look lower for a new starting point. */
1452                         if (find_nonnull_ptr(sdp, &mp, mp_h))
1453                                 mp_h++;
1454                         else
1455                                 state = DEALLOC_MP_LOWER;
1456                         break;
1457                 }
1458         }
1459
1460         if (btotal) {
1461                 if (current->journal_info == NULL) {
1462                         ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1463                                                RES_QUOTA, 0);
1464                         if (ret)
1465                                 goto out;
1466                         down_write(&ip->i_rw_mutex);
1467                 }
1468                 gfs2_statfs_change(sdp, 0, +btotal, 0);
1469                 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1470                                   ip->i_inode.i_gid);
1471                 ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1472                 gfs2_trans_add_meta(ip->i_gl, dibh);
1473                 gfs2_dinode_out(ip, dibh->b_data);
1474                 up_write(&ip->i_rw_mutex);
1475                 gfs2_trans_end(sdp);
1476         }
1477
1478 out:
1479         if (gfs2_holder_initialized(&rd_gh))
1480                 gfs2_glock_dq_uninit(&rd_gh);
1481         if (current->journal_info) {
1482                 up_write(&ip->i_rw_mutex);
1483                 gfs2_trans_end(sdp);
1484                 cond_resched();
1485         }
1486         gfs2_quota_unhold(ip);
1487 out_metapath:
1488         release_metapath(&mp);
1489         return ret;
1490 }
1491
1492 static int trunc_end(struct gfs2_inode *ip)
1493 {
1494         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1495         struct buffer_head *dibh;
1496         int error;
1497
1498         error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1499         if (error)
1500                 return error;
1501
1502         down_write(&ip->i_rw_mutex);
1503
1504         error = gfs2_meta_inode_buffer(ip, &dibh);
1505         if (error)
1506                 goto out;
1507
1508         if (!i_size_read(&ip->i_inode)) {
1509                 ip->i_height = 0;
1510                 ip->i_goal = ip->i_no_addr;
1511                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1512                 gfs2_ordered_del_inode(ip);
1513         }
1514         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1515         ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1516
1517         gfs2_trans_add_meta(ip->i_gl, dibh);
1518         gfs2_dinode_out(ip, dibh->b_data);
1519         brelse(dibh);
1520
1521 out:
1522         up_write(&ip->i_rw_mutex);
1523         gfs2_trans_end(sdp);
1524         return error;
1525 }
1526
1527 /**
1528  * do_shrink - make a file smaller
1529  * @inode: the inode
1530  * @newsize: the size to make the file
1531  *
1532  * Called with an exclusive lock on @inode. The @size must
1533  * be equal to or smaller than the current inode size.
1534  *
1535  * Returns: errno
1536  */
1537
1538 static int do_shrink(struct inode *inode, u64 newsize)
1539 {
1540         struct gfs2_inode *ip = GFS2_I(inode);
1541         int error;
1542
1543         error = trunc_start(inode, newsize);
1544         if (error < 0)
1545                 return error;
1546         if (gfs2_is_stuffed(ip))
1547                 return 0;
1548
1549         error = trunc_dealloc(ip, newsize);
1550         if (error == 0)
1551                 error = trunc_end(ip);
1552
1553         return error;
1554 }
1555
1556 void gfs2_trim_blocks(struct inode *inode)
1557 {
1558         int ret;
1559
1560         ret = do_shrink(inode, inode->i_size);
1561         WARN_ON(ret != 0);
1562 }
1563
1564 /**
1565  * do_grow - Touch and update inode size
1566  * @inode: The inode
1567  * @size: The new size
1568  *
1569  * This function updates the timestamps on the inode and
1570  * may also increase the size of the inode. This function
1571  * must not be called with @size any smaller than the current
1572  * inode size.
1573  *
1574  * Although it is not strictly required to unstuff files here,
1575  * earlier versions of GFS2 have a bug in the stuffed file reading
1576  * code which will result in a buffer overrun if the size is larger
1577  * than the max stuffed file size. In order to prevent this from
1578  * occurring, such files are unstuffed, but in other cases we can
1579  * just update the inode size directly.
1580  *
1581  * Returns: 0 on success, or -ve on error
1582  */
1583
1584 static int do_grow(struct inode *inode, u64 size)
1585 {
1586         struct gfs2_inode *ip = GFS2_I(inode);
1587         struct gfs2_sbd *sdp = GFS2_SB(inode);
1588         struct gfs2_alloc_parms ap = { .target = 1, };
1589         struct buffer_head *dibh;
1590         int error;
1591         int unstuff = 0;
1592
1593         if (gfs2_is_stuffed(ip) &&
1594             (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1595                 error = gfs2_quota_lock_check(ip, &ap);
1596                 if (error)
1597                         return error;
1598
1599                 error = gfs2_inplace_reserve(ip, &ap);
1600                 if (error)
1601                         goto do_grow_qunlock;
1602                 unstuff = 1;
1603         }
1604
1605         error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1606                                  (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1607                                   0 : RES_QUOTA), 0);
1608         if (error)
1609                 goto do_grow_release;
1610
1611         if (unstuff) {
1612                 error = gfs2_unstuff_dinode(ip, NULL);
1613                 if (error)
1614                         goto do_end_trans;
1615         }
1616
1617         error = gfs2_meta_inode_buffer(ip, &dibh);
1618         if (error)
1619                 goto do_end_trans;
1620
1621         i_size_write(inode, size);
1622         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1623         gfs2_trans_add_meta(ip->i_gl, dibh);
1624         gfs2_dinode_out(ip, dibh->b_data);
1625         brelse(dibh);
1626
1627 do_end_trans:
1628         gfs2_trans_end(sdp);
1629 do_grow_release:
1630         if (unstuff) {
1631                 gfs2_inplace_release(ip);
1632 do_grow_qunlock:
1633                 gfs2_quota_unlock(ip);
1634         }
1635         return error;
1636 }
1637
1638 /**
1639  * gfs2_setattr_size - make a file a given size
1640  * @inode: the inode
1641  * @newsize: the size to make the file
1642  *
1643  * The file size can grow, shrink, or stay the same size. This
1644  * is called holding i_mutex and an exclusive glock on the inode
1645  * in question.
1646  *
1647  * Returns: errno
1648  */
1649
1650 int gfs2_setattr_size(struct inode *inode, u64 newsize)
1651 {
1652         struct gfs2_inode *ip = GFS2_I(inode);
1653         int ret;
1654
1655         BUG_ON(!S_ISREG(inode->i_mode));
1656
1657         ret = inode_newsize_ok(inode, newsize);
1658         if (ret)
1659                 return ret;
1660
1661         inode_dio_wait(inode);
1662
1663         ret = gfs2_rsqa_alloc(ip);
1664         if (ret)
1665                 goto out;
1666
1667         if (newsize >= inode->i_size) {
1668                 ret = do_grow(inode, newsize);
1669                 goto out;
1670         }
1671
1672         ret = do_shrink(inode, newsize);
1673 out:
1674         gfs2_rsqa_delete(ip, NULL);
1675         return ret;
1676 }
1677
1678 int gfs2_truncatei_resume(struct gfs2_inode *ip)
1679 {
1680         int error;
1681         error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1682         if (!error)
1683                 error = trunc_end(ip);
1684         return error;
1685 }
1686
1687 int gfs2_file_dealloc(struct gfs2_inode *ip)
1688 {
1689         return trunc_dealloc(ip, 0);
1690 }
1691
1692 /**
1693  * gfs2_free_journal_extents - Free cached journal bmap info
1694  * @jd: The journal
1695  *
1696  */
1697
1698 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1699 {
1700         struct gfs2_journal_extent *jext;
1701
1702         while(!list_empty(&jd->extent_list)) {
1703                 jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1704                 list_del(&jext->list);
1705                 kfree(jext);
1706         }
1707 }
1708
1709 /**
1710  * gfs2_add_jextent - Add or merge a new extent to extent cache
1711  * @jd: The journal descriptor
1712  * @lblock: The logical block at start of new extent
1713  * @dblock: The physical block at start of new extent
1714  * @blocks: Size of extent in fs blocks
1715  *
1716  * Returns: 0 on success or -ENOMEM
1717  */
1718
1719 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1720 {
1721         struct gfs2_journal_extent *jext;
1722
1723         if (!list_empty(&jd->extent_list)) {
1724                 jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
1725                 if ((jext->dblock + jext->blocks) == dblock) {
1726                         jext->blocks += blocks;
1727                         return 0;
1728                 }
1729         }
1730
1731         jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1732         if (jext == NULL)
1733                 return -ENOMEM;
1734         jext->dblock = dblock;
1735         jext->lblock = lblock;
1736         jext->blocks = blocks;
1737         list_add_tail(&jext->list, &jd->extent_list);
1738         jd->nr_extents++;
1739         return 0;
1740 }
1741
1742 /**
1743  * gfs2_map_journal_extents - Cache journal bmap info
1744  * @sdp: The super block
1745  * @jd: The journal to map
1746  *
1747  * Create a reusable "extent" mapping from all logical
1748  * blocks to all physical blocks for the given journal.  This will save
1749  * us time when writing journal blocks.  Most journals will have only one
1750  * extent that maps all their logical blocks.  That's because gfs2.mkfs
1751  * arranges the journal blocks sequentially to maximize performance.
1752  * So the extent would map the first block for the entire file length.
1753  * However, gfs2_jadd can happen while file activity is happening, so
1754  * those journals may not be sequential.  Less likely is the case where
1755  * the users created their own journals by mounting the metafs and
1756  * laying it out.  But it's still possible.  These journals might have
1757  * several extents.
1758  *
1759  * Returns: 0 on success, or error on failure
1760  */
1761
1762 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
1763 {
1764         u64 lblock = 0;
1765         u64 lblock_stop;
1766         struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
1767         struct buffer_head bh;
1768         unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1769         u64 size;
1770         int rc;
1771
1772         lblock_stop = i_size_read(jd->jd_inode) >> shift;
1773         size = (lblock_stop - lblock) << shift;
1774         jd->nr_extents = 0;
1775         WARN_ON(!list_empty(&jd->extent_list));
1776
1777         do {
1778                 bh.b_state = 0;
1779                 bh.b_blocknr = 0;
1780                 bh.b_size = size;
1781                 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
1782                 if (rc || !buffer_mapped(&bh))
1783                         goto fail;
1784                 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
1785                 if (rc)
1786                         goto fail;
1787                 size -= bh.b_size;
1788                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1789         } while(size > 0);
1790
1791         fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
1792                 jd->nr_extents);
1793         return 0;
1794
1795 fail:
1796         fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
1797                 rc, jd->jd_jid,
1798                 (unsigned long long)(i_size_read(jd->jd_inode) - size),
1799                 jd->nr_extents);
1800         fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
1801                 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
1802                 bh.b_state, (unsigned long long)bh.b_size);
1803         gfs2_free_journal_extents(jd);
1804         return rc;
1805 }
1806
1807 /**
1808  * gfs2_write_alloc_required - figure out if a write will require an allocation
1809  * @ip: the file being written to
1810  * @offset: the offset to write to
1811  * @len: the number of bytes being written
1812  *
1813  * Returns: 1 if an alloc is required, 0 otherwise
1814  */
1815
1816 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1817                               unsigned int len)
1818 {
1819         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1820         struct buffer_head bh;
1821         unsigned int shift;
1822         u64 lblock, lblock_stop, size;
1823         u64 end_of_file;
1824
1825         if (!len)
1826                 return 0;
1827
1828         if (gfs2_is_stuffed(ip)) {
1829                 if (offset + len >
1830                     sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1831                         return 1;
1832                 return 0;
1833         }
1834
1835         shift = sdp->sd_sb.sb_bsize_shift;
1836         BUG_ON(gfs2_is_dir(ip));
1837         end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1838         lblock = offset >> shift;
1839         lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1840         if (lblock_stop > end_of_file)
1841                 return 1;
1842
1843         size = (lblock_stop - lblock) << shift;
1844         do {
1845                 bh.b_state = 0;
1846                 bh.b_size = size;
1847                 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1848                 if (!buffer_mapped(&bh))
1849                         return 1;
1850                 size -= bh.b_size;
1851                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1852         } while(size > 0);
1853
1854         return 0;
1855 }
1856