fs/gfs2/bmap.c

   1 /*
   2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
   3  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
   4  *
   5  * This copyrighted material is made available to anyone wishing to use,
   6  * modify, copy, or redistribute it subject to the terms and conditions
   7  * of the GNU General Public License version 2.
   8  */
   9
  10 #include <linux/spinlock.h>
  11 #include <linux/completion.h>
  12 #include <linux/buffer_head.h>
  13 #include <linux/blkdev.h>
  14 #include <linux/gfs2_ondisk.h>
  15 #include <linux/crc32.h>
  16 #include <linux/iomap.h>
  17
  18 #include "gfs2.h"
  19 #include "incore.h"
  20 #include "bmap.h"
  21 #include "glock.h"
  22 #include "inode.h"
  23 #include "meta_io.h"
  24 #include "quota.h"
  25 #include "rgrp.h"
  26 #include "log.h"
  27 #include "super.h"
  28 #include "trans.h"
  29 #include "dir.h"
  30 #include "util.h"
  31 #include "trace_gfs2.h"
  32
  33 /* This doesn't need to be that large as max 64 bit pointers in a 4k
  34  * block is 512, so __u16 is fine for that. It saves stack space to
  35  * keep it small.
  36  */
  37 struct metapath {
  38         struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
  39         __u16 mp_list[GFS2_MAX_META_HEIGHT];
  40         int mp_fheight; /* find_metapath height */
  41         int mp_aheight; /* actual height (lookup height) */
  42 };
  43
  44 /**
  45  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
  46  * @ip: the inode
  47  * @dibh: the dinode buffer
  48  * @block: the block number that was allocated
  49  * @page: The (optional) page. This is looked up if @page is NULL
  50  *
  51  * Returns: errno
  52  */
  53
  54 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
  55                                u64 block, struct page *page)
  56 {
  57         struct inode *inode = &ip->i_inode;
  58         struct buffer_head *bh;
  59         int release = 0;
  60
  61         if (!page || page->index) {
  62                 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
  63                 if (!page)
  64                         return -ENOMEM;
  65                 release = 1;
  66         }
  67
  68         if (!PageUptodate(page)) {
  69                 void *kaddr = kmap(page);
  70                 u64 dsize = i_size_read(inode);
  71
  72                 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
  73                         dsize = dibh->b_size - sizeof(struct gfs2_dinode);
  74
  75                 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
  76                 memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
  77                 kunmap(page);
  78
  79                 SetPageUptodate(page);
  80         }
  81
  82         if (!page_has_buffers(page))
  83                 create_empty_buffers(page, BIT(inode->i_blkbits),
  84                                      BIT(BH_Uptodate));
  85
  86         bh = page_buffers(page);
  87
  88         if (!buffer_mapped(bh))
  89                 map_bh(bh, inode->i_sb, block);
  90
  91         set_buffer_uptodate(bh);
  92         if (!gfs2_is_jdata(ip))
  93                 mark_buffer_dirty(bh);
  94         if (!gfs2_is_writeback(ip))
  95                 gfs2_trans_add_data(ip->i_gl, bh);
  96
  97         if (release) {
  98                 unlock_page(page);
  99                 put_page(page);
 100         }
 101
 102         return 0;
 103 }
 104
 105 /**
 106  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 107  * @ip: The GFS2 inode to unstuff
 108  * @page: The (optional) page. This is looked up if the @page is NULL
 109  *
 110  * This routine unstuffs a dinode and returns it to a "normal" state such
 111  * that the height can be grown in the traditional way.
 112  *
 113  * Returns: errno
 114  */
 115
 116 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 117 {
 118         struct buffer_head *bh, *dibh;
 119         struct gfs2_dinode *di;
 120         u64 block = 0;
 121         int isdir = gfs2_is_dir(ip);
 122         int error;
 123
 124         down_write(&ip->i_rw_mutex);
 125
 126         error = gfs2_meta_inode_buffer(ip, &dibh);
 127         if (error)
 128                 goto out;
 129
 130         if (i_size_read(&ip->i_inode)) {
 131                 /* Get a free block, fill it with the stuffed data,
 132                    and write it out to disk */
 133
 134                 unsigned int n = 1;
 135                 error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
 136                 if (error)
 137                         goto out_brelse;
 138                 if (isdir) {
 139                         gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
 140                         error = gfs2_dir_get_new_buffer(ip, block, &bh);
 141                         if (error)
 142                                 goto out_brelse;
 143                         gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
 144                                               dibh, sizeof(struct gfs2_dinode));
 145                         brelse(bh);
 146                 } else {
 147                         error = gfs2_unstuffer_page(ip, dibh, block, page);
 148                         if (error)
 149                                 goto out_brelse;
 150                 }
 151         }
 152
 153         /*  Set up the pointer to the new block  */
 154
 155         gfs2_trans_add_meta(ip->i_gl, dibh);
 156         di = (struct gfs2_dinode *)dibh->b_data;
 157         gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 158
 159         if (i_size_read(&ip->i_inode)) {
 160                 *(__be64 *)(di + 1) = cpu_to_be64(block);
 161                 gfs2_add_inode_blocks(&ip->i_inode, 1);
 162                 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 163         }
 164
 165         ip->i_height = 1;
 166         di->di_height = cpu_to_be16(1);
 167
 168 out_brelse:
 169         brelse(dibh);
 170 out:
 171         up_write(&ip->i_rw_mutex);
 172         return error;
 173 }
 174
 175
 176 /**
 177  * find_metapath - Find path through the metadata tree
 178  * @sdp: The superblock
 179  * @mp: The metapath to return the result in
 180  * @block: The disk block to look up
 181  * @height: The pre-calculated height of the metadata tree
 182  *
 183  *   This routine returns a struct metapath structure that defines a path
 184  *   through the metadata of inode "ip" to get to block "block".
 185  *
 186  *   Example:
 187  *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
 188  *   filesystem with a blocksize of 4096.
 189  *
 190  *   find_metapath() would return a struct metapath structure set to:
 191  *   mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
 192  *   and mp_list[2] = 165.
 193  *
 194  *   That means that in order to get to the block containing the byte at
 195  *   offset 101342453, we would load the indirect block pointed to by pointer
 196  *   0 in the dinode.  We would then load the indirect block pointed to by
 197  *   pointer 48 in that indirect block.  We would then load the data block
 198  *   pointed to by pointer 165 in that indirect block.
 199  *
 200  *             ----------------------------------------
 201  *             | Dinode |                             |
 202  *             |        |                            4|
 203  *             |        |0 1 2 3 4 5                 9|
 204  *             |        |                            6|
 205  *             ----------------------------------------
 206  *                       |
 207  *                       |
 208  *                       V
 209  *             ----------------------------------------
 210  *             | Indirect Block                       |
 211  *             |                                     5|
 212  *             |            4 4 4 4 4 5 5            1|
 213  *             |0           5 6 7 8 9 0 1            2|
 214  *             ----------------------------------------
 215  *                                |
 216  *                                |
 217  *                                V
 218  *             ----------------------------------------
 219  *             | Indirect Block                       |
 220  *             |                         1 1 1 1 1   5|
 221  *             |                         6 6 6 6 6   1|
 222  *             |0                        3 4 5 6 7   2|
 223  *             ----------------------------------------
 224  *                                           |
 225  *                                           |
 226  *                                           V
 227  *             ----------------------------------------
 228  *             | Data block containing offset         |
 229  *             |            101342453                 |
 230  *             |                                      |
 231  *             |                                      |
 232  *             ----------------------------------------
 233  *
 234  */
 235
 236 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
 237                           struct metapath *mp, unsigned int height)
 238 {
 239         unsigned int i;
 240
 241         mp->mp_fheight = height;
 242         for (i = height; i--;)
 243                 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
 244 }
 245
 246 static inline unsigned int metapath_branch_start(const struct metapath *mp)
 247 {
 248         if (mp->mp_list[0] == 0)
 249                 return 2;
 250         return 1;
 251 }
 252
 253 /**
 254  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 255  * @height: The metadata height (0 = dinode)
 256  * @mp: The metapath
 257  */
 258 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
 259 {
 260         struct buffer_head *bh = mp->mp_bh[height];
 261         if (height == 0)
 262                 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
 263         return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
 264 }
 265
 266 /**
 267  * metapointer - Return pointer to start of metadata in a buffer
 268  * @height: The metadata height (0 = dinode)
 269  * @mp: The metapath
 270  *
 271  * Return a pointer to the block number of the next height of the metadata
 272  * tree given a buffer containing the pointer to the current height of the
 273  * metadata tree.
 274  */
 275
 276 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 277 {
 278         __be64 *p = metaptr1(height, mp);
 279         return p + mp->mp_list[height];
 280 }
 281
 282 static void gfs2_metapath_ra(struct gfs2_glock *gl, struct metapath *mp,
 283                              unsigned int height)
 284 {
 285         struct buffer_head *bh = mp->mp_bh[height];
 286         const __be64 *pos = metapointer(height, mp);
 287         const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
 288         const __be64 *t;
 289
 290         for (t = pos; t < endp; t++) {
 291                 struct buffer_head *rabh;
 292
 293                 if (!*t)
 294                         continue;
 295
 296                 rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
 297                 if (trylock_buffer(rabh)) {
 298                         if (!buffer_uptodate(rabh)) {
 299                                 rabh->b_end_io = end_buffer_read_sync;
 300                                 submit_bh(REQ_OP_READ,
 301                                           REQ_RAHEAD | REQ_META | REQ_PRIO,
 302                                           rabh);
 303                                 continue;
 304                         }
 305                         unlock_buffer(rabh);
 306                 }
 307                 brelse(rabh);
 308         }
 309 }
 310
 311 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
 312                              unsigned int x, unsigned int h)
 313 {
 314         for (; x < h; x++) {
 315                 __be64 *ptr = metapointer(x, mp);
 316                 u64 dblock = be64_to_cpu(*ptr);
 317                 int ret;
 318
 319                 if (!dblock)
 320                         break;
 321                 ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]);
 322                 if (ret)
 323                         return ret;
 324         }
 325         mp->mp_aheight = x + 1;
 326         return 0;
 327 }
 328
 329 /**
 330  * lookup_metapath - Walk the metadata tree to a specific point
 331  * @ip: The inode
 332  * @mp: The metapath
 333  *
 334  * Assumes that the inode's buffer has already been looked up and
 335  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 336  * by find_metapath().
 337  *
 338  * If this function encounters part of the tree which has not been
 339  * allocated, it returns the current height of the tree at the point
 340  * at which it found the unallocated block. Blocks which are found are
 341  * added to the mp->mp_bh[] list.
 342  *
 343  * Returns: error
 344  */
 345
 346 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 347 {
 348         return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
 349 }
 350
 351 /**
 352  * fillup_metapath - fill up buffers for the metadata path to a specific height
 353  * @ip: The inode
 354  * @mp: The metapath
 355  * @h: The height to which it should be mapped
 356  *
 357  * Similar to lookup_metapath, but does lookups for a range of heights
 358  *
 359  * Returns: error or the number of buffers filled
 360  */
 361
 362 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 363 {
 364         unsigned int x = 0;
 365         int ret;
 366
 367         if (h) {
 368                 /* find the first buffer we need to look up. */
 369                 for (x = h - 1; x > 0; x--) {
 370                         if (mp->mp_bh[x])
 371                                 break;
 372                 }
 373         }
 374         ret = __fillup_metapath(ip, mp, x, h);
 375         if (ret)
 376                 return ret;
 377         return mp->mp_aheight - x - 1;
 378 }
 379
 380 static inline void release_metapath(struct metapath *mp)
 381 {
 382         int i;
 383
 384         for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
 385                 if (mp->mp_bh[i] == NULL)
 386                         break;
 387                 brelse(mp->mp_bh[i]);
 388         }
 389 }
 390
 391 /**
 392  * gfs2_extent_length - Returns length of an extent of blocks
 393  * @start: Start of the buffer
 394  * @len: Length of the buffer in bytes
 395  * @ptr: Current position in the buffer
 396  * @limit: Max extent length to return (0 = unlimited)
 397  * @eob: Set to 1 if we hit "end of block"
 398  *
 399  * If the first block is zero (unallocated) it will return the number of
 400  * unallocated blocks in the extent, otherwise it will return the number
 401  * of contiguous blocks in the extent.
 402  *
 403  * Returns: The length of the extent (minimum of one block)
 404  */
 405
 406 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
 407 {
 408         const __be64 *end = (start + len);
 409         const __be64 *first = ptr;
 410         u64 d = be64_to_cpu(*ptr);
 411
 412         *eob = 0;
 413         do {
 414                 ptr++;
 415                 if (ptr >= end)
 416                         break;
 417                 if (limit && --limit == 0)
 418                         break;
 419                 if (d)
 420                         d++;
 421         } while(be64_to_cpu(*ptr) == d);
 422         if (ptr >= end)
 423                 *eob = 1;
 424         return (ptr - first);
 425 }
 426
 427 static inline void bmap_lock(struct gfs2_inode *ip, int create)
 428 {
 429         if (create)
 430                 down_write(&ip->i_rw_mutex);
 431         else
 432                 down_read(&ip->i_rw_mutex);
 433 }
 434
 435 static inline void bmap_unlock(struct gfs2_inode *ip, int create)
 436 {
 437         if (create)
 438                 up_write(&ip->i_rw_mutex);
 439         else
 440                 up_read(&ip->i_rw_mutex);
 441 }
 442
 443 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
 444                                          struct gfs2_glock *gl, unsigned int i,
 445                                          unsigned offset, u64 bn)
 446 {
 447         __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
 448                        ((i > 1) ? sizeof(struct gfs2_meta_header) :
 449                                  sizeof(struct gfs2_dinode)));
 450         BUG_ON(i < 1);
 451         BUG_ON(mp->mp_bh[i] != NULL);
 452         mp->mp_bh[i] = gfs2_meta_new(gl, bn);
 453         gfs2_trans_add_meta(gl, mp->mp_bh[i]);
 454         gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 455         gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
 456         ptr += offset;
 457         *ptr = cpu_to_be64(bn);
 458         return ptr;
 459 }
 460
 461 enum alloc_state {
 462         ALLOC_DATA = 0,
 463         ALLOC_GROW_DEPTH = 1,
 464         ALLOC_GROW_HEIGHT = 2,
 465         /* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
 466 };
 467
 468 static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
 469 {
 470         if (hgt)
 471                 return sdp->sd_inptrs;
 472         return sdp->sd_diptrs;
 473 }
 474
 475 /**
 476  * gfs2_bmap_alloc - Build a metadata tree of the requested height
 477  * @inode: The GFS2 inode
 478  * @lblock: The logical starting block of the extent
 479  * @bh_map: This is used to return the mapping details
 480  * @zero_new: True if newly allocated blocks should be zeroed
 481  * @mp: The metapath, with proper height information calculated
 482  * @maxlen: The max number of data blocks to alloc
 483  * @dblock: Pointer to return the resulting new block
 484  * @dblks: Pointer to return the number of blocks allocated
 485  *
 486  * In this routine we may have to alloc:
 487  *   i) Indirect blocks to grow the metadata tree height
 488  *  ii) Indirect blocks to fill in lower part of the metadata tree
 489  * iii) Data blocks
 490  *
 491  * The function is in two parts. The first part works out the total
 492  * number of blocks which we need. The second part does the actual
 493  * allocation asking for an extent at a time (if enough contiguous free
 494  * blocks are available, there will only be one request per bmap call)
 495  * and uses the state machine to initialise the blocks in order.
 496  *
 497  * Returns: errno on error
 498  */
 499
 500 static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 501                             unsigned flags, struct metapath *mp)
 502 {
 503         struct gfs2_inode *ip = GFS2_I(inode);
 504         struct gfs2_sbd *sdp = GFS2_SB(inode);
 505         struct super_block *sb = sdp->sd_vfs;
 506         struct buffer_head *dibh = mp->mp_bh[0];
 507         u64 bn;
 508         unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
 509         unsigned dblks = 0;
 510         unsigned ptrs_per_blk;
 511         const unsigned end_of_metadata = mp->mp_fheight - 1;
 512         int ret;
 513         enum alloc_state state;
 514         __be64 *ptr;
 515         __be64 zero_bn = 0;
 516         size_t maxlen = iomap->length >> inode->i_blkbits;
 517
 518         BUG_ON(mp->mp_aheight < 1);
 519         BUG_ON(dibh == NULL);
 520
 521         gfs2_trans_add_meta(ip->i_gl, dibh);
 522
 523         if (mp->mp_fheight == mp->mp_aheight) {
 524                 struct buffer_head *bh;
 525                 int eob;
 526
 527                 /* Bottom indirect block exists, find unalloced extent size */
 528                 ptr = metapointer(end_of_metadata, mp);
 529                 bh = mp->mp_bh[end_of_metadata];
 530                 dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr,
 531                                            maxlen, &eob);
 532                 BUG_ON(dblks < 1);
 533                 state = ALLOC_DATA;
 534         } else {
 535                 /* Need to allocate indirect blocks */
 536                 ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs :
 537                         sdp->sd_diptrs;
 538                 dblks = min(maxlen, (size_t)(ptrs_per_blk -
 539                                              mp->mp_list[end_of_metadata]));
 540                 if (mp->mp_fheight == ip->i_height) {
 541                         /* Writing into existing tree, extend tree down */
 542                         iblks = mp->mp_fheight - mp->mp_aheight;
 543                         state = ALLOC_GROW_DEPTH;
 544                 } else {
 545                         /* Building up tree height */
 546                         state = ALLOC_GROW_HEIGHT;
 547                         iblks = mp->mp_fheight - ip->i_height;
 548                         branch_start = metapath_branch_start(mp);
 549                         iblks += (mp->mp_fheight - branch_start);
 550                 }
 551         }
 552
 553         /* start of the second part of the function (state machine) */
 554
 555         blks = dblks + iblks;
 556         i = mp->mp_aheight;
 557         do {
 558                 int error;
 559                 n = blks - alloced;
 560                 error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
 561                 if (error)
 562                         return error;
 563                 alloced += n;
 564                 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
 565                         gfs2_trans_add_unrevoke(sdp, bn, n);
 566                 switch (state) {
 567                 /* Growing height of tree */
 568                 case ALLOC_GROW_HEIGHT:
 569                         if (i == 1) {
 570                                 ptr = (__be64 *)(dibh->b_data +
 571                                                  sizeof(struct gfs2_dinode));
 572                                 zero_bn = *ptr;
 573                         }
 574                         for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
 575                              i++, n--)
 576                                 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
 577                         if (i - 1 == mp->mp_fheight - ip->i_height) {
 578                                 i--;
 579                                 gfs2_buffer_copy_tail(mp->mp_bh[i],
 580                                                 sizeof(struct gfs2_meta_header),
 581                                                 dibh, sizeof(struct gfs2_dinode));
 582                                 gfs2_buffer_clear_tail(dibh,
 583                                                 sizeof(struct gfs2_dinode) +
 584                                                 sizeof(__be64));
 585                                 ptr = (__be64 *)(mp->mp_bh[i]->b_data +
 586                                         sizeof(struct gfs2_meta_header));
 587                                 *ptr = zero_bn;
 588                                 state = ALLOC_GROW_DEPTH;
 589                                 for(i = branch_start; i < mp->mp_fheight; i++) {
 590                                         if (mp->mp_bh[i] == NULL)
 591                                                 break;
 592                                         brelse(mp->mp_bh[i]);
 593                                         mp->mp_bh[i] = NULL;
 594                                 }
 595                                 i = branch_start;
 596                         }
 597                         if (n == 0)
 598                                 break;
 599                 /* Branching from existing tree */
 600                 case ALLOC_GROW_DEPTH:
 601                         if (i > 1 && i < mp->mp_fheight)
 602                                 gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
 603                         for (; i < mp->mp_fheight && n > 0; i++, n--)
 604                                 gfs2_indirect_init(mp, ip->i_gl, i,
 605                                                    mp->mp_list[i-1], bn++);
 606                         if (i == mp->mp_fheight)
 607                                 state = ALLOC_DATA;
 608                         if (n == 0)
 609                                 break;
 610                 /* Tree complete, adding data blocks */
 611                 case ALLOC_DATA:
 612                         BUG_ON(n > dblks);
 613                         BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
 614                         gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
 615                         dblks = n;
 616                         ptr = metapointer(end_of_metadata, mp);
 617                         iomap->addr = bn << inode->i_blkbits;
 618                         iomap->flags |= IOMAP_F_NEW;
 619                         while (n-- > 0)
 620                                 *ptr++ = cpu_to_be64(bn++);
 621                         if (flags & IOMAP_ZERO) {
 622                                 ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits,
 623                                                        dblks, GFP_NOFS);
 624                                 if (ret) {
 625                                         fs_err(sdp,
 626                                                "Failed to zero data buffers\n");
 627                                         flags &= ~IOMAP_ZERO;
 628                                 }
 629                         }
 630                         break;
 631                 }
 632         } while (iomap->addr == IOMAP_NULL_ADDR);
 633
 634         iomap->length = (u64)dblks << inode->i_blkbits;
 635         ip->i_height = mp->mp_fheight;
 636         gfs2_add_inode_blocks(&ip->i_inode, alloced);
 637         gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
 638         return 0;
 639 }
 640
 641 /**
 642  * hole_size - figure out the size of a hole
 643  * @inode: The inode
 644  * @lblock: The logical starting block number
 645  * @mp: The metapath
 646  *
 647  * Returns: The hole size in bytes
 648  *
 649  */
 650 static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp)
 651 {
 652         struct gfs2_inode *ip = GFS2_I(inode);
 653         struct gfs2_sbd *sdp = GFS2_SB(inode);
 654         struct metapath mp_eof;
 655         u64 factor = 1;
 656         int hgt;
 657         u64 holesz = 0;
 658         const __be64 *first, *end, *ptr;
 659         const struct buffer_head *bh;
 660         u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits;
 661         int zeroptrs;
 662         bool done = false;
 663
 664         /* Get another metapath, to the very last byte */
 665         find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height);
 666         for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) {
 667                 bh = mp->mp_bh[hgt];
 668                 if (bh) {
 669                         zeroptrs = 0;
 670                         first = metapointer(hgt, mp);
 671                         end = (const __be64 *)(bh->b_data + bh->b_size);
 672
 673                         for (ptr = first; ptr < end; ptr++) {
 674                                 if (*ptr) {
 675                                         done = true;
 676                                         break;
 677                                 } else {
 678                                         zeroptrs++;
 679                                 }
 680                         }
 681                 } else {
 682                         zeroptrs = sdp->sd_inptrs;
 683                 }
 684                 if (factor * zeroptrs >= lblock_stop - lblock + 1) {
 685                         holesz = lblock_stop - lblock + 1;
 686                         break;
 687                 }
 688                 holesz += factor * zeroptrs;
 689
 690                 factor *= sdp->sd_inptrs;
 691                 if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1]))
 692                         (mp->mp_list[hgt - 1])++;
 693         }
 694         return holesz << inode->i_blkbits;
 695 }
 696
 697 static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
 698 {
 699         struct gfs2_inode *ip = GFS2_I(inode);
 700
 701         iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
 702                       sizeof(struct gfs2_dinode);
 703         iomap->offset = 0;
 704         iomap->length = i_size_read(inode);
 705         iomap->type = IOMAP_MAPPED;
 706         iomap->flags = IOMAP_F_DATA_INLINE;
 707 }
 708
 709 /**
 710  * gfs2_iomap_begin - Map blocks from an inode to disk blocks
 711  * @inode: The inode
 712  * @pos: Starting position in bytes
 713  * @length: Length to map, in bytes
 714  * @flags: iomap flags
 715  * @iomap: The iomap structure
 716  *
 717  * Returns: errno
 718  */
 719 int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 720                      unsigned flags, struct iomap *iomap)
 721 {
 722         struct gfs2_inode *ip = GFS2_I(inode);
 723         struct gfs2_sbd *sdp = GFS2_SB(inode);
 724         struct metapath mp = { .mp_aheight = 1, };
 725         unsigned int factor = sdp->sd_sb.sb_bsize;
 726         const u64 *arr = sdp->sd_heightsize;
 727         __be64 *ptr;
 728         sector_t lblock;
 729         sector_t lend;
 730         int ret;
 731         int eob;
 732         unsigned int len;
 733         struct buffer_head *bh;
 734         u8 height;
 735
 736         trace_gfs2_iomap_start(ip, pos, length, flags);
 737         if (!length) {
 738                 ret = -EINVAL;
 739                 goto out;
 740         }
 741
 742         if ((flags & IOMAP_REPORT) && gfs2_is_stuffed(ip)) {
 743                 gfs2_stuffed_iomap(inode, iomap);
 744                 if (pos >= iomap->length)
 745                         return -ENOENT;
 746                 ret = 0;
 747                 goto out;
 748         }
 749
 750         lblock = pos >> inode->i_blkbits;
 751         lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits;
 752
 753         iomap->offset = lblock << inode->i_blkbits;
 754         iomap->addr = IOMAP_NULL_ADDR;
 755         iomap->type = IOMAP_HOLE;
 756         iomap->length = (u64)(lend - lblock) << inode->i_blkbits;
 757         iomap->flags = IOMAP_F_MERGED;
 758         bmap_lock(ip, 0);
 759
 760         /*
 761          * Directory data blocks have a struct gfs2_meta_header header, so the
 762          * remaining size is smaller than the filesystem block size.  Logical
 763          * block numbers for directories are in units of this remaining size!
 764          */
 765         if (gfs2_is_dir(ip)) {
 766                 factor = sdp->sd_jbsize;
 767                 arr = sdp->sd_jheightsize;
 768         }
 769
 770         ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
 771         if (ret)
 772                 goto out_release;
 773
 774         height = ip->i_height;
 775         while ((lblock + 1) * factor > arr[height])
 776                 height++;
 777         find_metapath(sdp, lblock, &mp, height);
 778         if (height > ip->i_height || gfs2_is_stuffed(ip))
 779                 goto do_alloc;
 780
 781         ret = lookup_metapath(ip, &mp);
 782         if (ret)
 783                 goto out_release;
 784
 785         if (mp.mp_aheight != ip->i_height)
 786                 goto do_alloc;
 787
 788         ptr = metapointer(ip->i_height - 1, &mp);
 789         if (*ptr == 0)
 790                 goto do_alloc;
 791
 792         iomap->type = IOMAP_MAPPED;
 793         iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
 794
 795         bh = mp.mp_bh[ip->i_height - 1];
 796         len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
 797         if (eob)
 798                 iomap->flags |= IOMAP_F_BOUNDARY;
 799         iomap->length = (u64)len << inode->i_blkbits;
 800
 801         ret = 0;
 802
 803 out_release:
 804         release_metapath(&mp);
 805         bmap_unlock(ip, 0);
 806 out:
 807         trace_gfs2_iomap_end(ip, iomap, ret);
 808         return ret;
 809
 810 do_alloc:
 811         if (!(flags & IOMAP_WRITE)) {
 812                 if (pos >= i_size_read(inode)) {
 813                         ret = -ENOENT;
 814                         goto out_release;
 815                 }
 816                 ret = 0;
 817                 iomap->length = hole_size(inode, lblock, &mp);
 818                 goto out_release;
 819         }
 820
 821         ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
 822         goto out_release;
 823 }
 824
 825 /**
 826  * gfs2_block_map - Map a block from an inode to a disk block
 827  * @inode: The inode
 828  * @lblock: The logical block number
 829  * @bh_map: The bh to be mapped
 830  * @create: True if its ok to alloc blocks to satify the request
 831  *
 832  * Sets buffer_mapped() if successful, sets buffer_boundary() if a
 833  * read of metadata will be required before the next block can be
 834  * mapped. Sets buffer_new() if new blocks were allocated.
 835  *
 836  * Returns: errno
 837  */
 838
 839 int gfs2_block_map(struct inode *inode, sector_t lblock,
 840                    struct buffer_head *bh_map, int create)
 841 {
 842         struct gfs2_inode *ip = GFS2_I(inode);
 843         struct iomap iomap;
 844         int ret, flags = 0;
 845
 846         clear_buffer_mapped(bh_map);
 847         clear_buffer_new(bh_map);
 848         clear_buffer_boundary(bh_map);
 849         trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
 850
 851         if (create)
 852                 flags |= IOMAP_WRITE;
 853         if (buffer_zeronew(bh_map))
 854                 flags |= IOMAP_ZERO;
 855         ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits,
 856                                bh_map->b_size, flags, &iomap);
 857         if (ret) {
 858                 if (!create && ret == -ENOENT) {
 859                         /* Return unmapped buffer beyond the end of file.  */
 860                         ret = 0;
 861                 }
 862                 goto out;
 863         }
 864
 865         if (iomap.length > bh_map->b_size) {
 866                 iomap.length = bh_map->b_size;
 867                 iomap.flags &= ~IOMAP_F_BOUNDARY;
 868         }
 869         if (iomap.addr != IOMAP_NULL_ADDR)
 870                 map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
 871         bh_map->b_size = iomap.length;
 872         if (iomap.flags & IOMAP_F_BOUNDARY)
 873                 set_buffer_boundary(bh_map);
 874         if (iomap.flags & IOMAP_F_NEW)
 875                 set_buffer_new(bh_map);
 876
 877 out:
 878         trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
 879         return ret;
 880 }
 881
 882 /*
 883  * Deprecated: do not use in new code
 884  */
 885 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
 886 {
 887         struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
 888         int ret;
 889         int create = *new;
 890
 891         BUG_ON(!extlen);
 892         BUG_ON(!dblock);
 893         BUG_ON(!new);
 894
 895         bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
 896         ret = gfs2_block_map(inode, lblock, &bh, create);
 897         *extlen = bh.b_size >> inode->i_blkbits;
 898         *dblock = bh.b_blocknr;
 899         if (buffer_new(&bh))
 900                 *new = 1;
 901         else
 902                 *new = 0;
 903         return ret;
 904 }
 905
 906 /**
 907  * gfs2_block_zero_range - Deal with zeroing out data
 908  *
 909  * This is partly borrowed from ext3.
 910  */
 911 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
 912                                  unsigned int length)
 913 {
 914         struct address_space *mapping = inode->i_mapping;
 915         struct gfs2_inode *ip = GFS2_I(inode);
 916         unsigned long index = from >> PAGE_SHIFT;
 917         unsigned offset = from & (PAGE_SIZE-1);
 918         unsigned blocksize, iblock, pos;
 919         struct buffer_head *bh;
 920         struct page *page;
 921         int err;
 922
 923         page = find_or_create_page(mapping, index, GFP_NOFS);
 924         if (!page)
 925                 return 0;
 926
 927         blocksize = inode->i_sb->s_blocksize;
 928         iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
 929
 930         if (!page_has_buffers(page))
 931                 create_empty_buffers(page, blocksize, 0);
 932
 933         /* Find the buffer that contains "offset" */
 934         bh = page_buffers(page);
 935         pos = blocksize;
 936         while (offset >= pos) {
 937                 bh = bh->b_this_page;
 938                 iblock++;
 939                 pos += blocksize;
 940         }
 941
 942         err = 0;
 943
 944         if (!buffer_mapped(bh)) {
 945                 gfs2_block_map(inode, iblock, bh, 0);
 946                 /* unmapped? It's a hole - nothing to do */
 947                 if (!buffer_mapped(bh))
 948                         goto unlock;
 949         }
 950
 951         /* Ok, it's mapped. Make sure it's up-to-date */
 952         if (PageUptodate(page))
 953                 set_buffer_uptodate(bh);
 954
 955         if (!buffer_uptodate(bh)) {
 956                 err = -EIO;
 957                 ll_rw_block(REQ_OP_READ, 0, 1, &bh);
 958                 wait_on_buffer(bh);
 959                 /* Uhhuh. Read error. Complain and punt. */
 960                 if (!buffer_uptodate(bh))
 961                         goto unlock;
 962                 err = 0;
 963         }
 964
 965         if (!gfs2_is_writeback(ip))
 966                 gfs2_trans_add_data(ip->i_gl, bh);
 967
 968         zero_user(page, offset, length);
 969         mark_buffer_dirty(bh);
 970 unlock:
 971         unlock_page(page);
 972         put_page(page);
 973         return err;
 974 }
 975
 976 #define GFS2_JTRUNC_REVOKES 8192
 977
 978 /**
 979  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
 980  * @inode: The inode being truncated
 981  * @oldsize: The original (larger) size
 982  * @newsize: The new smaller size
 983  *
 984  * With jdata files, we have to journal a revoke for each block which is
 985  * truncated. As a result, we need to split this into separate transactions
 986  * if the number of pages being truncated gets too large.
 987  */
 988
 989 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
 990 {
 991         struct gfs2_sbd *sdp = GFS2_SB(inode);
 992         u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
 993         u64 chunk;
 994         int error;
 995
 996         while (oldsize != newsize) {
 997                 struct gfs2_trans *tr;
 998                 unsigned int offs;
 999
1000                 chunk = oldsize - newsize;
1001                 if (chunk > max_chunk)
1002                         chunk = max_chunk;
1003
1004                 offs = oldsize & ~PAGE_MASK;
1005                 if (offs && chunk > PAGE_SIZE)
1006                         chunk = offs + ((chunk - offs) & PAGE_MASK);
1007
1008                 truncate_pagecache(inode, oldsize - chunk);
1009                 oldsize -= chunk;
1010
1011                 tr = current->journal_info;
1012                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1013                         continue;
1014
1015                 gfs2_trans_end(sdp);
1016                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1017                 if (error)
1018                         return error;
1019         }
1020
1021         return 0;
1022 }
1023
1024 static int trunc_start(struct inode *inode, u64 newsize)
1025 {
1026         struct gfs2_inode *ip = GFS2_I(inode);
1027         struct gfs2_sbd *sdp = GFS2_SB(inode);
1028         struct buffer_head *dibh = NULL;
1029         int journaled = gfs2_is_jdata(ip);
1030         u64 oldsize = inode->i_size;
1031         int error;
1032
1033         if (journaled)
1034                 error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1035         else
1036                 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1037         if (error)
1038                 return error;
1039
1040         error = gfs2_meta_inode_buffer(ip, &dibh);
1041         if (error)
1042                 goto out;
1043
1044         gfs2_trans_add_meta(ip->i_gl, dibh);
1045
1046         if (gfs2_is_stuffed(ip)) {
1047                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1048         } else {
1049                 unsigned int blocksize = i_blocksize(inode);
1050                 unsigned int offs = newsize & (blocksize - 1);
1051                 if (offs) {
1052                         error = gfs2_block_zero_range(inode, newsize,
1053                                                       blocksize - offs);
1054                         if (error)
1055                                 goto out;
1056                 }
1057                 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1058         }
1059
1060         i_size_write(inode, newsize);
1061         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1062         gfs2_dinode_out(ip, dibh->b_data);
1063
1064         if (journaled)
1065                 error = gfs2_journaled_truncate(inode, oldsize, newsize);
1066         else
1067                 truncate_pagecache(inode, newsize);
1068
1069 out:
1070         brelse(dibh);
1071         if (current->journal_info)
1072                 gfs2_trans_end(sdp);
1073         return error;
1074 }
1075
1076 /**
1077  * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1078  * @ip: inode
1079  * @rg_gh: holder of resource group glock
1080  * @mp: current metapath fully populated with buffers
1081  * @btotal: place to keep count of total blocks freed
1082  * @hgt: height we're processing
1083  * @keep_start: preserve the first meta pointer
1084  *
1085  * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1086  * free, and free them all. However, we do it one rgrp at a time. If this
1087  * block has references to multiple rgrps, we break it into individual
1088  * transactions. This allows other processes to use the rgrps while we're
1089  * focused on a single one, for better concurrency / performance.
1090  * At every transaction boundary, we rewrite the inode into the journal.
1091  * That way the bitmaps are kept consistent with the inode and we can recover
1092  * if we're interrupted by power-outages.
1093  *
1094  * Returns: 0, or return code if an error occurred.
1095  *          *btotal has the total number of blocks freed
1096  */
1097 static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1098                               const struct metapath *mp, u32 *btotal, int hgt,
1099                               bool keep_start)
1100 {
1101         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1102         struct gfs2_rgrpd *rgd;
1103         struct gfs2_trans *tr;
1104         struct buffer_head *bh = mp->mp_bh[hgt];
1105         __be64 *top, *bottom, *p;
1106         int blks_outside_rgrp;
1107         u64 bn, bstart, isize_blks;
1108         s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1109         int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
1110         int ret = 0;
1111         bool buf_in_tr = false; /* buffer was added to transaction */
1112
1113         if (gfs2_metatype_check(sdp, bh,
1114                                 (hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
1115                 return -EIO;
1116
1117 more_rgrps:
1118         blks_outside_rgrp = 0;
1119         bstart = 0;
1120         blen = 0;
1121         top = metapointer(hgt, mp); /* first ptr from metapath */
1122         /* If we're keeping some data at the truncation point, we've got to
1123            preserve the metadata tree by adding 1 to the starting metapath. */
1124         if (keep_start)
1125                 top++;
1126
1127         bottom = (__be64 *)(bh->b_data + bh->b_size);
1128
1129         for (p = top; p < bottom; p++) {
1130                 if (!*p)
1131                         continue;
1132                 bn = be64_to_cpu(*p);
1133                 if (gfs2_holder_initialized(rd_gh)) {
1134                         rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1135                         gfs2_assert_withdraw(sdp,
1136                                      gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1137                 } else {
1138                         rgd = gfs2_blk2rgrpd(sdp, bn, true);
1139                         ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1140                                                  0, rd_gh);
1141                         if (ret)
1142                                 goto out;
1143
1144                         /* Must be done with the rgrp glock held: */
1145                         if (gfs2_rs_active(&ip->i_res) &&
1146                             rgd == ip->i_res.rs_rbm.rgd)
1147                                 gfs2_rs_deltree(&ip->i_res);
1148                 }
1149
1150                 if (!rgrp_contains_block(rgd, bn)) {
1151                         blks_outside_rgrp++;
1152                         continue;
1153                 }
1154
1155                 /* The size of our transactions will be unknown until we
1156                    actually process all the metadata blocks that relate to
1157                    the rgrp. So we estimate. We know it can't be more than
1158                    the dinode's i_blocks and we don't want to exceed the
1159                    journal flush threshold, sd_log_thresh2. */
1160                 if (current->journal_info == NULL) {
1161                         unsigned int jblocks_rqsted, revokes;
1162
1163                         jblocks_rqsted = rgd->rd_length + RES_DINODE +
1164                                 RES_INDIRECT;
1165                         isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1166                         if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1167                                 jblocks_rqsted +=
1168                                         atomic_read(&sdp->sd_log_thresh2);
1169                         else
1170                                 jblocks_rqsted += isize_blks;
1171                         revokes = jblocks_rqsted;
1172                         if (meta)
1173                                 revokes += hptrs(sdp, hgt);
1174                         else if (ip->i_depth)
1175                                 revokes += sdp->sd_inptrs;
1176                         ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1177                         if (ret)
1178                                 goto out_unlock;
1179                         down_write(&ip->i_rw_mutex);
1180                 }
1181                 /* check if we will exceed the transaction blocks requested */
1182                 tr = current->journal_info;
1183                 if (tr->tr_num_buf_new + RES_STATFS +
1184                     RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1185                         /* We set blks_outside_rgrp to ensure the loop will
1186                            be repeated for the same rgrp, but with a new
1187                            transaction. */
1188                         blks_outside_rgrp++;
1189                         /* This next part is tricky. If the buffer was added
1190                            to the transaction, we've already set some block
1191                            pointers to 0, so we better follow through and free
1192                            them, or we will introduce corruption (so break).
1193                            This may be impossible, or at least rare, but I
1194                            decided to cover the case regardless.
1195
1196                            If the buffer was not added to the transaction
1197                            (this call), doing so would exceed our transaction
1198                            size, so we need to end the transaction and start a
1199                            new one (so goto). */
1200
1201                         if (buf_in_tr)
1202                                 break;
1203                         goto out_unlock;
1204                 }
1205
1206                 gfs2_trans_add_meta(ip->i_gl, bh);
1207                 buf_in_tr = true;
1208                 *p = 0;
1209                 if (bstart + blen == bn) {
1210                         blen++;
1211                         continue;
1212                 }
1213                 if (bstart) {
1214                         __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1215                         (*btotal) += blen;
1216                         gfs2_add_inode_blocks(&ip->i_inode, -blen);
1217                 }
1218                 bstart = bn;
1219                 blen = 1;
1220         }
1221         if (bstart) {
1222                 __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1223                 (*btotal) += blen;
1224                 gfs2_add_inode_blocks(&ip->i_inode, -blen);
1225         }
1226 out_unlock:
1227         if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1228                                             outside the rgrp we just processed,
1229                                             do it all over again. */
1230                 if (current->journal_info) {
1231                         struct buffer_head *dibh = mp->mp_bh[0];
1232
1233                         /* Every transaction boundary, we rewrite the dinode
1234                            to keep its di_blocks current in case of failure. */
1235                         ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1236                                 current_time(&ip->i_inode);
1237                         gfs2_trans_add_meta(ip->i_gl, dibh);
1238                         gfs2_dinode_out(ip, dibh->b_data);
1239                         up_write(&ip->i_rw_mutex);
1240                         gfs2_trans_end(sdp);
1241                 }
1242                 gfs2_glock_dq_uninit(rd_gh);
1243                 cond_resched();
1244                 goto more_rgrps;
1245         }
1246 out:
1247         return ret;
1248 }
1249
1250 /**
1251  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1252  * assumes the metapath is valid (with buffers) out to height h
1253  * @mp: starting metapath
1254  * @h: desired height to search
1255  *
1256  * Returns: true if a non-null pointer was found in the metapath buffer
1257  *          false if all remaining pointers are NULL in the buffer
1258  */
1259 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1260                              unsigned int h)
1261 {
1262         __be64 *ptr;
1263         unsigned int ptrs = hptrs(sdp, h) - 1;
1264
1265         while (true) {
1266                 ptr = metapointer(h, mp);
1267                 if (*ptr) { /* if we have a non-null pointer */
1268                         /* Now zero the metapath after the current height. */
1269                         h++;
1270                         if (h < GFS2_MAX_META_HEIGHT)
1271                                 memset(&mp->mp_list[h], 0,
1272                                        (GFS2_MAX_META_HEIGHT - h) *
1273                                        sizeof(mp->mp_list[0]));
1274                         return true;
1275                 }
1276
1277                 if (mp->mp_list[h] < ptrs)
1278                         mp->mp_list[h]++;
1279                 else
1280                         return false; /* no more pointers in this buffer */
1281         }
1282 }
1283
/* States of the state machine that drives the main loop of trunc_dealloc() */
enum dealloc_states {
        DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
        DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
        DEALLOC_FILL_MP = 2,    /* Fill in the metapath to the given height. */
        DEALLOC_DONE = 3,       /* process complete */
};
1290
1291 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1292 {
1293         if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1294                 return false;
1295         return true;
1296 }
1297
1298 /**
1299  * trunc_dealloc - truncate a file down to a desired size
1300  * @ip: inode to truncate
1301  * @newsize: The desired size of the file
1302  *
1303  * This function truncates a file to newsize. It works from the
1304  * bottom up, and from the right to the left. In other words, it strips off
1305  * the highest layer (data) before stripping any of the metadata. Doing it
1306  * this way is best in case the operation is interrupted by power failure, etc.
1307  * The dinode is rewritten in every transaction to guarantee integrity.
1308  */
1309 static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
1310 {
1311         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1312         struct metapath mp;
1313         struct buffer_head *dibh, *bh;
1314         struct gfs2_holder rd_gh;
1315         unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1316         u64 lblock = (newsize + (1 << bsize_shift) - 1) >> bsize_shift;
1317         __u16 start_list[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
1318         unsigned int start_aligned;
1319         unsigned int strip_h = ip->i_height - 1;
1320         u32 btotal = 0;
1321         int ret, state;
1322         int mp_h; /* metapath buffers are read in to this height */
1323         u64 prev_bnr = 0;
1324         bool keep_start; /* need to preserve the first meta pointer? */
1325
1326         memset(&mp, 0, sizeof(mp));
1327         find_metapath(sdp, lblock, &mp, ip->i_height);
1328
1329         memcpy(start_list, mp.mp_list, sizeof(start_list));
1330
1331         /*
1332          * Set start_aligned to the metadata height up to which the truncate
1333          * point is aligned to the metadata tree (i.e., the truncate point is a
1334          * multiple of the granularity at the height above).  This determines
1335          * at which heights an additional meta pointer needs to be preserved:
1336          * an additional meta pointer is needed at a given height if
1337          * height < start_aligned.
1338          */
1339         for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1340                 if (start_list[mp_h])
1341                         break;
1342         }
1343         start_aligned = mp_h;
1344
1345         ret = gfs2_meta_inode_buffer(ip, &dibh);
1346         if (ret)
1347                 return ret;
1348
1349         mp.mp_bh[0] = dibh;
1350         ret = lookup_metapath(ip, &mp);
1351         if (ret)
1352                 goto out_metapath;
1353
1354         /* issue read-ahead on metadata */
1355         for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++)
1356                 gfs2_metapath_ra(ip->i_gl, &mp, mp_h);
1357
1358         if (mp.mp_aheight == ip->i_height)
1359                 state = DEALLOC_MP_FULL; /* We have a complete metapath */
1360         else
1361                 state = DEALLOC_FILL_MP; /* deal with partial metapath */
1362
1363         ret = gfs2_rindex_update(sdp);
1364         if (ret)
1365                 goto out_metapath;
1366
1367         ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1368         if (ret)
1369                 goto out_metapath;
1370         gfs2_holder_mark_uninitialized(&rd_gh);
1371
1372         mp_h = strip_h;
1373
1374         while (state != DEALLOC_DONE) {
1375                 switch (state) {
1376                 /* Truncate a full metapath at the given strip height.
1377                  * Note that strip_h == mp_h in order to be in this state. */
1378                 case DEALLOC_MP_FULL:
1379                         bh = mp.mp_bh[mp_h];
1380                         gfs2_assert_withdraw(sdp, bh);
1381                         if (gfs2_assert_withdraw(sdp,
1382                                                  prev_bnr != bh->b_blocknr)) {
1383                                 printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
1384                                        "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
1385                                        sdp->sd_fsname,
1386                                        (unsigned long long)ip->i_no_addr,
1387                                        prev_bnr, ip->i_height, strip_h, mp_h);
1388                         }
1389                         prev_bnr = bh->b_blocknr;
1390
1391                         keep_start = mp_h < start_aligned &&
1392                                      mp_eq_to_hgt(&mp, start_list, mp_h);
1393
1394                         ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
1395                                                  mp_h, keep_start);
1396                         /* If we hit an error or just swept dinode buffer,
1397                            just exit. */
1398                         if (ret || !mp_h) {
1399                                 state = DEALLOC_DONE;
1400                                 break;
1401                         }
1402                         state = DEALLOC_MP_LOWER;
1403                         break;
1404
1405                 /* lower the metapath strip height */
1406                 case DEALLOC_MP_LOWER:
1407                         /* We're done with the current buffer, so release it,
1408                            unless it's the dinode buffer. Then back up to the
1409                            previous pointer. */
1410                         if (mp_h) {
1411                                 brelse(mp.mp_bh[mp_h]);
1412                                 mp.mp_bh[mp_h] = NULL;
1413                         }
1414                         /* If we can't get any lower in height, we've stripped
1415                            off all we can. Next step is to back up and start
1416                            stripping the previous level of metadata. */
1417                         if (mp_h == 0) {
1418                                 strip_h--;
1419                                 memcpy(mp.mp_list, start_list, sizeof(start_list));
1420                                 mp_h = strip_h;
1421                                 state = DEALLOC_FILL_MP;
1422                                 break;
1423                         }
1424                         mp.mp_list[mp_h] = 0;
1425                         mp_h--; /* search one metadata height down */
1426                         if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
1427                                 break; /* loop around in the same state */
1428                         mp.mp_list[mp_h]++;
1429                         /* Here we've found a part of the metapath that is not
1430                          * allocated. We need to search at that height for the
1431                          * next non-null pointer. */
1432                         if (find_nonnull_ptr(sdp, &mp, mp_h)) {
1433                                 state = DEALLOC_FILL_MP;
1434                                 mp_h++;
1435                         }
1436                         /* No more non-null pointers at this height. Back up
1437                            to the previous height and try again. */
1438                         break; /* loop around in the same state */
1439
1440                 /* Fill the metapath with buffers to the given height. */
1441                 case DEALLOC_FILL_MP:
1442                         /* Fill the buffers out to the current height. */
1443                         ret = fillup_metapath(ip, &mp, mp_h);
1444                         if (ret < 0)
1445                                 goto out;
1446
1447                         /* issue read-ahead on metadata */
1448                         if (mp.mp_aheight > 1) {
1449                                 for (; ret > 1; ret--)
1450                                         gfs2_metapath_ra(ip->i_gl, &mp,
1451                                                 mp.mp_aheight - ret);
1452                         }
1453
1454                         /* If buffers found for the entire strip height */
1455                         if (mp.mp_aheight - 1 == strip_h) {
1456                                 state = DEALLOC_MP_FULL;
1457                                 break;
1458                         }
1459                         if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1460                                 mp_h = mp.mp_aheight - 1;
1461
1462                         /* If we find a non-null block pointer, crawl a bit
1463                            higher up in the metapath and try again, otherwise
1464                            we need to look lower for a new starting point. */
1465                         if (find_nonnull_ptr(sdp, &mp, mp_h))
1466                                 mp_h++;
1467                         else
1468                                 state = DEALLOC_MP_LOWER;
1469                         break;
1470                 }
1471         }
1472
1473         if (btotal) {
1474                 if (current->journal_info == NULL) {
1475                         ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1476                                                RES_QUOTA, 0);
1477                         if (ret)
1478                                 goto out;
1479                         down_write(&ip->i_rw_mutex);
1480                 }
1481                 gfs2_statfs_change(sdp, 0, +btotal, 0);
1482                 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1483                                   ip->i_inode.i_gid);
1484                 ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1485                 gfs2_trans_add_meta(ip->i_gl, dibh);
1486                 gfs2_dinode_out(ip, dibh->b_data);
1487                 up_write(&ip->i_rw_mutex);
1488                 gfs2_trans_end(sdp);
1489         }
1490
1491 out:
1492         if (gfs2_holder_initialized(&rd_gh))
1493                 gfs2_glock_dq_uninit(&rd_gh);
1494         if (current->journal_info) {
1495                 up_write(&ip->i_rw_mutex);
1496                 gfs2_trans_end(sdp);
1497                 cond_resched();
1498         }
1499         gfs2_quota_unhold(ip);
1500 out_metapath:
1501         release_metapath(&mp);
1502         return ret;
1503 }
1504
1505 static int trunc_end(struct gfs2_inode *ip)
1506 {
1507         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1508         struct buffer_head *dibh;
1509         int error;
1510
1511         error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1512         if (error)
1513                 return error;
1514
1515         down_write(&ip->i_rw_mutex);
1516
1517         error = gfs2_meta_inode_buffer(ip, &dibh);
1518         if (error)
1519                 goto out;
1520
1521         if (!i_size_read(&ip->i_inode)) {
1522                 ip->i_height = 0;
1523                 ip->i_goal = ip->i_no_addr;
1524                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1525                 gfs2_ordered_del_inode(ip);
1526         }
1527         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1528         ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1529
1530         gfs2_trans_add_meta(ip->i_gl, dibh);
1531         gfs2_dinode_out(ip, dibh->b_data);
1532         brelse(dibh);
1533
1534 out:
1535         up_write(&ip->i_rw_mutex);
1536         gfs2_trans_end(sdp);
1537         return error;
1538 }
1539
1540 /**
1541  * do_shrink - make a file smaller
1542  * @inode: the inode
1543  * @newsize: the size to make the file
1544  *
1545  * Called with an exclusive lock on @inode. The @size must
1546  * be equal to or smaller than the current inode size.
1547  *
1548  * Returns: errno
1549  */
1550
1551 static int do_shrink(struct inode *inode, u64 newsize)
1552 {
1553         struct gfs2_inode *ip = GFS2_I(inode);
1554         int error;
1555
1556         error = trunc_start(inode, newsize);
1557         if (error < 0)
1558                 return error;
1559         if (gfs2_is_stuffed(ip))
1560                 return 0;
1561
1562         error = trunc_dealloc(ip, newsize);
1563         if (error == 0)
1564                 error = trunc_end(ip);
1565
1566         return error;
1567 }
1568
1569 void gfs2_trim_blocks(struct inode *inode)
1570 {
1571         int ret;
1572
1573         ret = do_shrink(inode, inode->i_size);
1574         WARN_ON(ret != 0);
1575 }
1576
1577 /**
1578  * do_grow - Touch and update inode size
1579  * @inode: The inode
1580  * @size: The new size
1581  *
1582  * This function updates the timestamps on the inode and
1583  * may also increase the size of the inode. This function
1584  * must not be called with @size any smaller than the current
1585  * inode size.
1586  *
1587  * Although it is not strictly required to unstuff files here,
1588  * earlier versions of GFS2 have a bug in the stuffed file reading
1589  * code which will result in a buffer overrun if the size is larger
1590  * than the max stuffed file size. In order to prevent this from
1591  * occurring, such files are unstuffed, but in other cases we can
1592  * just update the inode size directly.
1593  *
1594  * Returns: 0 on success, or -ve on error
1595  */
1596
1597 static int do_grow(struct inode *inode, u64 size)
1598 {
1599         struct gfs2_inode *ip = GFS2_I(inode);
1600         struct gfs2_sbd *sdp = GFS2_SB(inode);
1601         struct gfs2_alloc_parms ap = { .target = 1, };
1602         struct buffer_head *dibh;
1603         int error;
1604         int unstuff = 0;
1605
1606         if (gfs2_is_stuffed(ip) &&
1607             (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1608                 error = gfs2_quota_lock_check(ip, &ap);
1609                 if (error)
1610                         return error;
1611
1612                 error = gfs2_inplace_reserve(ip, &ap);
1613                 if (error)
1614                         goto do_grow_qunlock;
1615                 unstuff = 1;
1616         }
1617
1618         error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1619                                  (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1620                                   0 : RES_QUOTA), 0);
1621         if (error)
1622                 goto do_grow_release;
1623
1624         if (unstuff) {
1625                 error = gfs2_unstuff_dinode(ip, NULL);
1626                 if (error)
1627                         goto do_end_trans;
1628         }
1629
1630         error = gfs2_meta_inode_buffer(ip, &dibh);
1631         if (error)
1632                 goto do_end_trans;
1633
1634         i_size_write(inode, size);
1635         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1636         gfs2_trans_add_meta(ip->i_gl, dibh);
1637         gfs2_dinode_out(ip, dibh->b_data);
1638         brelse(dibh);
1639
1640 do_end_trans:
1641         gfs2_trans_end(sdp);
1642 do_grow_release:
1643         if (unstuff) {
1644                 gfs2_inplace_release(ip);
1645 do_grow_qunlock:
1646                 gfs2_quota_unlock(ip);
1647         }
1648         return error;
1649 }
1650
1651 /**
1652  * gfs2_setattr_size - make a file a given size
1653  * @inode: the inode
1654  * @newsize: the size to make the file
1655  *
1656  * The file size can grow, shrink, or stay the same size. This
1657  * is called holding i_mutex and an exclusive glock on the inode
1658  * in question.
1659  *
1660  * Returns: errno
1661  */
1662
1663 int gfs2_setattr_size(struct inode *inode, u64 newsize)
1664 {
1665         struct gfs2_inode *ip = GFS2_I(inode);
1666         int ret;
1667
1668         BUG_ON(!S_ISREG(inode->i_mode));
1669
1670         ret = inode_newsize_ok(inode, newsize);
1671         if (ret)
1672                 return ret;
1673
1674         inode_dio_wait(inode);
1675
1676         ret = gfs2_rsqa_alloc(ip);
1677         if (ret)
1678                 goto out;
1679
1680         if (newsize >= inode->i_size) {
1681                 ret = do_grow(inode, newsize);
1682                 goto out;
1683         }
1684
1685         ret = do_shrink(inode, newsize);
1686 out:
1687         gfs2_rsqa_delete(ip, NULL);
1688         return ret;
1689 }
1690
1691 int gfs2_truncatei_resume(struct gfs2_inode *ip)
1692 {
1693         int error;
1694         error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1695         if (!error)
1696                 error = trunc_end(ip);
1697         return error;
1698 }
1699
/**
 * gfs2_file_dealloc - run trunc_dealloc() with a target size of zero
 * @ip: the inode
 *
 * Returns: errno
 */

int gfs2_file_dealloc(struct gfs2_inode *ip)
{
        int error = trunc_dealloc(ip, 0);

        return error;
}
1704
1705 /**
1706  * gfs2_free_journal_extents - Free cached journal bmap info
1707  * @jd: The journal
1708  *
1709  */
1710
1711 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1712 {
1713         struct gfs2_journal_extent *jext;
1714
1715         while(!list_empty(&jd->extent_list)) {
1716                 jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1717                 list_del(&jext->list);
1718                 kfree(jext);
1719         }
1720 }
1721
1722 /**
1723  * gfs2_add_jextent - Add or merge a new extent to extent cache
1724  * @jd: The journal descriptor
1725  * @lblock: The logical block at start of new extent
1726  * @dblock: The physical block at start of new extent
1727  * @blocks: Size of extent in fs blocks
1728  *
1729  * Returns: 0 on success or -ENOMEM
1730  */
1731
1732 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1733 {
1734         struct gfs2_journal_extent *jext;
1735
1736         if (!list_empty(&jd->extent_list)) {
1737                 jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
1738                 if ((jext->dblock + jext->blocks) == dblock) {
1739                         jext->blocks += blocks;
1740                         return 0;
1741                 }
1742         }
1743
1744         jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1745         if (jext == NULL)
1746                 return -ENOMEM;
1747         jext->dblock = dblock;
1748         jext->lblock = lblock;
1749         jext->blocks = blocks;
1750         list_add_tail(&jext->list, &jd->extent_list);
1751         jd->nr_extents++;
1752         return 0;
1753 }
1754
1755 /**
1756  * gfs2_map_journal_extents - Cache journal bmap info
1757  * @sdp: The super block
1758  * @jd: The journal to map
1759  *
1760  * Create a reusable "extent" mapping from all logical
1761  * blocks to all physical blocks for the given journal.  This will save
1762  * us time when writing journal blocks.  Most journals will have only one
1763  * extent that maps all their logical blocks.  That's because gfs2.mkfs
1764  * arranges the journal blocks sequentially to maximize performance.
1765  * So the extent would map the first block for the entire file length.
1766  * However, gfs2_jadd can happen while file activity is happening, so
1767  * those journals may not be sequential.  Less likely is the case where
1768  * the users created their own journals by mounting the metafs and
1769  * laying it out.  But it's still possible.  These journals might have
1770  * several extents.
1771  *
1772  * Returns: 0 on success, or error on failure
1773  */
1774
1775 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
1776 {
1777         u64 lblock = 0;
1778         u64 lblock_stop;
1779         struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
1780         struct buffer_head bh;
1781         unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1782         u64 size;
1783         int rc;
1784
1785         lblock_stop = i_size_read(jd->jd_inode) >> shift;
1786         size = (lblock_stop - lblock) << shift;
1787         jd->nr_extents = 0;
1788         WARN_ON(!list_empty(&jd->extent_list));
1789
1790         do {
1791                 bh.b_state = 0;
1792                 bh.b_blocknr = 0;
1793                 bh.b_size = size;
1794                 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
1795                 if (rc || !buffer_mapped(&bh))
1796                         goto fail;
1797                 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
1798                 if (rc)
1799                         goto fail;
1800                 size -= bh.b_size;
1801                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1802         } while(size > 0);
1803
1804         fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
1805                 jd->nr_extents);
1806         return 0;
1807
1808 fail:
1809         fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
1810                 rc, jd->jd_jid,
1811                 (unsigned long long)(i_size_read(jd->jd_inode) - size),
1812                 jd->nr_extents);
1813         fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
1814                 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
1815                 bh.b_state, (unsigned long long)bh.b_size);
1816         gfs2_free_journal_extents(jd);
1817         return rc;
1818 }
1819
1820 /**
1821  * gfs2_write_alloc_required - figure out if a write will require an allocation
1822  * @ip: the file being written to
1823  * @offset: the offset to write to
1824  * @len: the number of bytes being written
1825  *
1826  * Returns: 1 if an alloc is required, 0 otherwise
1827  */
1828
1829 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1830                               unsigned int len)
1831 {
1832         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1833         struct buffer_head bh;
1834         unsigned int shift;
1835         u64 lblock, lblock_stop, size;
1836         u64 end_of_file;
1837
1838         if (!len)
1839                 return 0;
1840
1841         if (gfs2_is_stuffed(ip)) {
1842                 if (offset + len >
1843                     sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1844                         return 1;
1845                 return 0;
1846         }
1847
1848         shift = sdp->sd_sb.sb_bsize_shift;
1849         BUG_ON(gfs2_is_dir(ip));
1850         end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1851         lblock = offset >> shift;
1852         lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1853         if (lblock_stop > end_of_file)
1854                 return 1;
1855
1856         size = (lblock_stop - lblock) << shift;
1857         do {
1858                 bh.b_state = 0;
1859                 bh.b_size = size;
1860                 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1861                 if (!buffer_mapped(&bh))
1862                         return 1;
1863                 size -= bh.b_size;
1864                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1865         } while(size > 0);
1866
1867         return 0;
1868 }
1869