Merge branch 'fix/misc' into for-linus
[sfrench/cifs-2.6.git] / fs / ext4 / extents.c
1 /*
2  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3  * Written by Alex Tomas <alex@clusterfs.com>
4  *
5  * Architecture independence:
6  *   Copyright (c) 2005, Bull S.A.
7  *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public Licens
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
21  */
22
23 /*
24  * Extents support for EXT4
25  *
26  * TODO:
27  *   - ext4*_error() should be used in some situations
28  *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29  *   - smart tree reduction
30  */
31
32 #include <linux/module.h>
33 #include <linux/fs.h>
34 #include <linux/time.h>
35 #include <linux/jbd2.h>
36 #include <linux/highuid.h>
37 #include <linux/pagemap.h>
38 #include <linux/quotaops.h>
39 #include <linux/string.h>
40 #include <linux/slab.h>
41 #include <linux/falloc.h>
42 #include <asm/uaccess.h>
43 #include <linux/fiemap.h>
44 #include "ext4_jbd2.h"
45 #include "ext4_extents.h"
46
47
48 /*
49  * ext_pblock:
50  * combine low and high parts of physical block number into ext4_fsblk_t
51  */
52 static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
53 {
54         ext4_fsblk_t block;
55
56         block = le32_to_cpu(ex->ee_start_lo);
57         block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
58         return block;
59 }
60
61 /*
62  * idx_pblock:
63  * combine low and high parts of a leaf physical block number into ext4_fsblk_t
64  */
65 ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
66 {
67         ext4_fsblk_t block;
68
69         block = le32_to_cpu(ix->ei_leaf_lo);
70         block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
71         return block;
72 }
73
74 /*
75  * ext4_ext_store_pblock:
76  * stores a large physical block number into an extent struct,
77  * breaking it into parts
78  */
79 void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
80 {
81         ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
82         ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
83 }
84
85 /*
86  * ext4_idx_store_pblock:
87  * stores a large physical block number into an index struct,
88  * breaking it into parts
89  */
90 static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
91 {
92         ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
93         ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94 }
95
96 static int ext4_ext_journal_restart(handle_t *handle, int needed)
97 {
98         int err;
99
100         if (!ext4_handle_valid(handle))
101                 return 0;
102         if (handle->h_buffer_credits > needed)
103                 return 0;
104         err = ext4_journal_extend(handle, needed);
105         if (err <= 0)
106                 return err;
107         return ext4_journal_restart(handle, needed);
108 }
109
110 /*
111  * could return:
112  *  - EROFS
113  *  - ENOMEM
114  */
115 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
116                                 struct ext4_ext_path *path)
117 {
118         if (path->p_bh) {
119                 /* path points to block */
120                 return ext4_journal_get_write_access(handle, path->p_bh);
121         }
122         /* path points to leaf/index in inode body */
123         /* we use in-core data, no need to protect them */
124         return 0;
125 }
126
127 /*
128  * could return:
129  *  - EROFS
130  *  - ENOMEM
131  *  - EIO
132  */
133 static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
134                                 struct ext4_ext_path *path)
135 {
136         int err;
137         if (path->p_bh) {
138                 /* path points to block */
139                 err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
140         } else {
141                 /* path points to leaf/index in inode body */
142                 err = ext4_mark_inode_dirty(handle, inode);
143         }
144         return err;
145 }
146
147 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
148                               struct ext4_ext_path *path,
149                               ext4_lblk_t block)
150 {
151         struct ext4_inode_info *ei = EXT4_I(inode);
152         ext4_fsblk_t bg_start;
153         ext4_fsblk_t last_block;
154         ext4_grpblk_t colour;
155         ext4_group_t block_group;
156         int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
157         int depth;
158
159         if (path) {
160                 struct ext4_extent *ex;
161                 depth = path->p_depth;
162
163                 /* try to predict block placement */
164                 ex = path[depth].p_ext;
165                 if (ex)
166                         return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
167
168                 /* it looks like index is empty;
169                  * try to find starting block from index itself */
170                 if (path[depth].p_bh)
171                         return path[depth].p_bh->b_blocknr;
172         }
173
174         /* OK. use inode's group */
175         block_group = ei->i_block_group;
176         if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
177                 /*
178                  * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
179                  * block groups per flexgroup, reserve the first block 
180                  * group for directories and special files.  Regular 
181                  * files will start at the second block group.  This
182                  * tends to speed up directory access and improves 
183                  * fsck times.
184                  */
185                 block_group &= ~(flex_size-1);
186                 if (S_ISREG(inode->i_mode))
187                         block_group++;
188         }
189         bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
190                 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
191         last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
192
193         /*
194          * If we are doing delayed allocation, we don't need take
195          * colour into account.
196          */
197         if (test_opt(inode->i_sb, DELALLOC))
198                 return bg_start;
199
200         if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
201                 colour = (current->pid % 16) *
202                         (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
203         else
204                 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
205         return bg_start + colour + block;
206 }
207
208 /*
209  * Allocation for a meta data block
210  */
211 static ext4_fsblk_t
212 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
213                         struct ext4_ext_path *path,
214                         struct ext4_extent *ex, int *err)
215 {
216         ext4_fsblk_t goal, newblock;
217
218         goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
219         newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
220         return newblock;
221 }
222
223 static int ext4_ext_space_block(struct inode *inode)
224 {
225         int size;
226
227         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
228                         / sizeof(struct ext4_extent);
229 #ifdef AGGRESSIVE_TEST
230         if (size > 6)
231                 size = 6;
232 #endif
233         return size;
234 }
235
236 static int ext4_ext_space_block_idx(struct inode *inode)
237 {
238         int size;
239
240         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
241                         / sizeof(struct ext4_extent_idx);
242 #ifdef AGGRESSIVE_TEST
243         if (size > 5)
244                 size = 5;
245 #endif
246         return size;
247 }
248
249 static int ext4_ext_space_root(struct inode *inode)
250 {
251         int size;
252
253         size = sizeof(EXT4_I(inode)->i_data);
254         size -= sizeof(struct ext4_extent_header);
255         size /= sizeof(struct ext4_extent);
256 #ifdef AGGRESSIVE_TEST
257         if (size > 3)
258                 size = 3;
259 #endif
260         return size;
261 }
262
263 static int ext4_ext_space_root_idx(struct inode *inode)
264 {
265         int size;
266
267         size = sizeof(EXT4_I(inode)->i_data);
268         size -= sizeof(struct ext4_extent_header);
269         size /= sizeof(struct ext4_extent_idx);
270 #ifdef AGGRESSIVE_TEST
271         if (size > 4)
272                 size = 4;
273 #endif
274         return size;
275 }
276
277 /*
278  * Calculate the number of metadata blocks needed
279  * to allocate @blocks
280  * Worse case is one block per extent
281  */
282 int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
283 {
284         int lcap, icap, rcap, leafs, idxs, num;
285         int newextents = blocks;
286
287         rcap = ext4_ext_space_root_idx(inode);
288         lcap = ext4_ext_space_block(inode);
289         icap = ext4_ext_space_block_idx(inode);
290
291         /* number of new leaf blocks needed */
292         num = leafs = (newextents + lcap - 1) / lcap;
293
294         /*
295          * Worse case, we need separate index block(s)
296          * to link all new leaf blocks
297          */
298         idxs = (leafs + icap - 1) / icap;
299         do {
300                 num += idxs;
301                 idxs = (idxs + icap - 1) / icap;
302         } while (idxs > rcap);
303
304         return num;
305 }
306
307 static int
308 ext4_ext_max_entries(struct inode *inode, int depth)
309 {
310         int max;
311
312         if (depth == ext_depth(inode)) {
313                 if (depth == 0)
314                         max = ext4_ext_space_root(inode);
315                 else
316                         max = ext4_ext_space_root_idx(inode);
317         } else {
318                 if (depth == 0)
319                         max = ext4_ext_space_block(inode);
320                 else
321                         max = ext4_ext_space_block_idx(inode);
322         }
323
324         return max;
325 }
326
327 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
328 {
329         ext4_fsblk_t block = ext_pblock(ext);
330         int len = ext4_ext_get_actual_len(ext);
331         struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
332         if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
333                         ((block + len) > ext4_blocks_count(es))))
334                 return 0;
335         else
336                 return 1;
337 }
338
339 static int ext4_valid_extent_idx(struct inode *inode,
340                                 struct ext4_extent_idx *ext_idx)
341 {
342         ext4_fsblk_t block = idx_pblock(ext_idx);
343         struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
344         if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
345                         (block >= ext4_blocks_count(es))))
346                 return 0;
347         else
348                 return 1;
349 }
350
351 static int ext4_valid_extent_entries(struct inode *inode,
352                                 struct ext4_extent_header *eh,
353                                 int depth)
354 {
355         struct ext4_extent *ext;
356         struct ext4_extent_idx *ext_idx;
357         unsigned short entries;
358         if (eh->eh_entries == 0)
359                 return 1;
360
361         entries = le16_to_cpu(eh->eh_entries);
362
363         if (depth == 0) {
364                 /* leaf entries */
365                 ext = EXT_FIRST_EXTENT(eh);
366                 while (entries) {
367                         if (!ext4_valid_extent(inode, ext))
368                                 return 0;
369                         ext++;
370                         entries--;
371                 }
372         } else {
373                 ext_idx = EXT_FIRST_INDEX(eh);
374                 while (entries) {
375                         if (!ext4_valid_extent_idx(inode, ext_idx))
376                                 return 0;
377                         ext_idx++;
378                         entries--;
379                 }
380         }
381         return 1;
382 }
383
384 static int __ext4_ext_check(const char *function, struct inode *inode,
385                                         struct ext4_extent_header *eh,
386                                         int depth)
387 {
388         const char *error_msg;
389         int max = 0;
390
391         if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
392                 error_msg = "invalid magic";
393                 goto corrupted;
394         }
395         if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
396                 error_msg = "unexpected eh_depth";
397                 goto corrupted;
398         }
399         if (unlikely(eh->eh_max == 0)) {
400                 error_msg = "invalid eh_max";
401                 goto corrupted;
402         }
403         max = ext4_ext_max_entries(inode, depth);
404         if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
405                 error_msg = "too large eh_max";
406                 goto corrupted;
407         }
408         if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
409                 error_msg = "invalid eh_entries";
410                 goto corrupted;
411         }
412         if (!ext4_valid_extent_entries(inode, eh, depth)) {
413                 error_msg = "invalid extent entries";
414                 goto corrupted;
415         }
416         return 0;
417
418 corrupted:
419         ext4_error(inode->i_sb, function,
420                         "bad header/extent in inode #%lu: %s - magic %x, "
421                         "entries %u, max %u(%u), depth %u(%u)",
422                         inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
423                         le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
424                         max, le16_to_cpu(eh->eh_depth), depth);
425
426         return -EIO;
427 }
428
429 #define ext4_ext_check(inode, eh, depth)        \
430         __ext4_ext_check(__func__, inode, eh, depth)
431
432 int ext4_ext_check_inode(struct inode *inode)
433 {
434         return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
435 }
436
437 #ifdef EXT_DEBUG
438 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
439 {
440         int k, l = path->p_depth;
441
442         ext_debug("path:");
443         for (k = 0; k <= l; k++, path++) {
444                 if (path->p_idx) {
445                   ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
446                             idx_pblock(path->p_idx));
447                 } else if (path->p_ext) {
448                         ext_debug("  %d:%d:%llu ",
449                                   le32_to_cpu(path->p_ext->ee_block),
450                                   ext4_ext_get_actual_len(path->p_ext),
451                                   ext_pblock(path->p_ext));
452                 } else
453                         ext_debug("  []");
454         }
455         ext_debug("\n");
456 }
457
458 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
459 {
460         int depth = ext_depth(inode);
461         struct ext4_extent_header *eh;
462         struct ext4_extent *ex;
463         int i;
464
465         if (!path)
466                 return;
467
468         eh = path[depth].p_hdr;
469         ex = EXT_FIRST_EXTENT(eh);
470
471         for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
472                 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
473                           ext4_ext_get_actual_len(ex), ext_pblock(ex));
474         }
475         ext_debug("\n");
476 }
477 #else
478 #define ext4_ext_show_path(inode, path)
479 #define ext4_ext_show_leaf(inode, path)
480 #endif
481
482 void ext4_ext_drop_refs(struct ext4_ext_path *path)
483 {
484         int depth = path->p_depth;
485         int i;
486
487         for (i = 0; i <= depth; i++, path++)
488                 if (path->p_bh) {
489                         brelse(path->p_bh);
490                         path->p_bh = NULL;
491                 }
492 }
493
494 /*
495  * ext4_ext_binsearch_idx:
496  * binary search for the closest index of the given block
497  * the header must be checked before calling this
498  */
499 static void
500 ext4_ext_binsearch_idx(struct inode *inode,
501                         struct ext4_ext_path *path, ext4_lblk_t block)
502 {
503         struct ext4_extent_header *eh = path->p_hdr;
504         struct ext4_extent_idx *r, *l, *m;
505
506
507         ext_debug("binsearch for %u(idx):  ", block);
508
509         l = EXT_FIRST_INDEX(eh) + 1;
510         r = EXT_LAST_INDEX(eh);
511         while (l <= r) {
512                 m = l + (r - l) / 2;
513                 if (block < le32_to_cpu(m->ei_block))
514                         r = m - 1;
515                 else
516                         l = m + 1;
517                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
518                                 m, le32_to_cpu(m->ei_block),
519                                 r, le32_to_cpu(r->ei_block));
520         }
521
522         path->p_idx = l - 1;
523         ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
524                   idx_pblock(path->p_idx));
525
526 #ifdef CHECK_BINSEARCH
527         {
528                 struct ext4_extent_idx *chix, *ix;
529                 int k;
530
531                 chix = ix = EXT_FIRST_INDEX(eh);
532                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
533                   if (k != 0 &&
534                       le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
535                                 printk(KERN_DEBUG "k=%d, ix=0x%p, "
536                                        "first=0x%p\n", k,
537                                        ix, EXT_FIRST_INDEX(eh));
538                                 printk(KERN_DEBUG "%u <= %u\n",
539                                        le32_to_cpu(ix->ei_block),
540                                        le32_to_cpu(ix[-1].ei_block));
541                         }
542                         BUG_ON(k && le32_to_cpu(ix->ei_block)
543                                            <= le32_to_cpu(ix[-1].ei_block));
544                         if (block < le32_to_cpu(ix->ei_block))
545                                 break;
546                         chix = ix;
547                 }
548                 BUG_ON(chix != path->p_idx);
549         }
550 #endif
551
552 }
553
554 /*
555  * ext4_ext_binsearch:
556  * binary search for closest extent of the given block
557  * the header must be checked before calling this
558  */
559 static void
560 ext4_ext_binsearch(struct inode *inode,
561                 struct ext4_ext_path *path, ext4_lblk_t block)
562 {
563         struct ext4_extent_header *eh = path->p_hdr;
564         struct ext4_extent *r, *l, *m;
565
566         if (eh->eh_entries == 0) {
567                 /*
568                  * this leaf is empty:
569                  * we get such a leaf in split/add case
570                  */
571                 return;
572         }
573
574         ext_debug("binsearch for %u:  ", block);
575
576         l = EXT_FIRST_EXTENT(eh) + 1;
577         r = EXT_LAST_EXTENT(eh);
578
579         while (l <= r) {
580                 m = l + (r - l) / 2;
581                 if (block < le32_to_cpu(m->ee_block))
582                         r = m - 1;
583                 else
584                         l = m + 1;
585                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
586                                 m, le32_to_cpu(m->ee_block),
587                                 r, le32_to_cpu(r->ee_block));
588         }
589
590         path->p_ext = l - 1;
591         ext_debug("  -> %d:%llu:%d ",
592                         le32_to_cpu(path->p_ext->ee_block),
593                         ext_pblock(path->p_ext),
594                         ext4_ext_get_actual_len(path->p_ext));
595
596 #ifdef CHECK_BINSEARCH
597         {
598                 struct ext4_extent *chex, *ex;
599                 int k;
600
601                 chex = ex = EXT_FIRST_EXTENT(eh);
602                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
603                         BUG_ON(k && le32_to_cpu(ex->ee_block)
604                                           <= le32_to_cpu(ex[-1].ee_block));
605                         if (block < le32_to_cpu(ex->ee_block))
606                                 break;
607                         chex = ex;
608                 }
609                 BUG_ON(chex != path->p_ext);
610         }
611 #endif
612
613 }
614
615 int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
616 {
617         struct ext4_extent_header *eh;
618
619         eh = ext_inode_hdr(inode);
620         eh->eh_depth = 0;
621         eh->eh_entries = 0;
622         eh->eh_magic = EXT4_EXT_MAGIC;
623         eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
624         ext4_mark_inode_dirty(handle, inode);
625         ext4_ext_invalidate_cache(inode);
626         return 0;
627 }
628
629 struct ext4_ext_path *
630 ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
631                                         struct ext4_ext_path *path)
632 {
633         struct ext4_extent_header *eh;
634         struct buffer_head *bh;
635         short int depth, i, ppos = 0, alloc = 0;
636
637         eh = ext_inode_hdr(inode);
638         depth = ext_depth(inode);
639
640         /* account possible depth increase */
641         if (!path) {
642                 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
643                                 GFP_NOFS);
644                 if (!path)
645                         return ERR_PTR(-ENOMEM);
646                 alloc = 1;
647         }
648         path[0].p_hdr = eh;
649         path[0].p_bh = NULL;
650
651         i = depth;
652         /* walk through the tree */
653         while (i) {
654                 int need_to_validate = 0;
655
656                 ext_debug("depth %d: num %d, max %d\n",
657                           ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
658
659                 ext4_ext_binsearch_idx(inode, path + ppos, block);
660                 path[ppos].p_block = idx_pblock(path[ppos].p_idx);
661                 path[ppos].p_depth = i;
662                 path[ppos].p_ext = NULL;
663
664                 bh = sb_getblk(inode->i_sb, path[ppos].p_block);
665                 if (unlikely(!bh))
666                         goto err;
667                 if (!bh_uptodate_or_lock(bh)) {
668                         if (bh_submit_read(bh) < 0) {
669                                 put_bh(bh);
670                                 goto err;
671                         }
672                         /* validate the extent entries */
673                         need_to_validate = 1;
674                 }
675                 eh = ext_block_hdr(bh);
676                 ppos++;
677                 BUG_ON(ppos > depth);
678                 path[ppos].p_bh = bh;
679                 path[ppos].p_hdr = eh;
680                 i--;
681
682                 if (need_to_validate && ext4_ext_check(inode, eh, i))
683                         goto err;
684         }
685
686         path[ppos].p_depth = i;
687         path[ppos].p_ext = NULL;
688         path[ppos].p_idx = NULL;
689
690         /* find extent */
691         ext4_ext_binsearch(inode, path + ppos, block);
692         /* if not an empty leaf */
693         if (path[ppos].p_ext)
694                 path[ppos].p_block = ext_pblock(path[ppos].p_ext);
695
696         ext4_ext_show_path(inode, path);
697
698         return path;
699
700 err:
701         ext4_ext_drop_refs(path);
702         if (alloc)
703                 kfree(path);
704         return ERR_PTR(-EIO);
705 }
706
707 /*
708  * ext4_ext_insert_index:
709  * insert new index [@logical;@ptr] into the block at @curp;
710  * check where to insert: before @curp or after @curp
711  */
712 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
713                                 struct ext4_ext_path *curp,
714                                 int logical, ext4_fsblk_t ptr)
715 {
716         struct ext4_extent_idx *ix;
717         int len, err;
718
719         err = ext4_ext_get_access(handle, inode, curp);
720         if (err)
721                 return err;
722
723         BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
724         len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
725         if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
726                 /* insert after */
727                 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
728                         len = (len - 1) * sizeof(struct ext4_extent_idx);
729                         len = len < 0 ? 0 : len;
730                         ext_debug("insert new index %d after: %llu. "
731                                         "move %d from 0x%p to 0x%p\n",
732                                         logical, ptr, len,
733                                         (curp->p_idx + 1), (curp->p_idx + 2));
734                         memmove(curp->p_idx + 2, curp->p_idx + 1, len);
735                 }
736                 ix = curp->p_idx + 1;
737         } else {
738                 /* insert before */
739                 len = len * sizeof(struct ext4_extent_idx);
740                 len = len < 0 ? 0 : len;
741                 ext_debug("insert new index %d before: %llu. "
742                                 "move %d from 0x%p to 0x%p\n",
743                                 logical, ptr, len,
744                                 curp->p_idx, (curp->p_idx + 1));
745                 memmove(curp->p_idx + 1, curp->p_idx, len);
746                 ix = curp->p_idx;
747         }
748
749         ix->ei_block = cpu_to_le32(logical);
750         ext4_idx_store_pblock(ix, ptr);
751         le16_add_cpu(&curp->p_hdr->eh_entries, 1);
752
753         BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
754                              > le16_to_cpu(curp->p_hdr->eh_max));
755         BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
756
757         err = ext4_ext_dirty(handle, inode, curp);
758         ext4_std_error(inode->i_sb, err);
759
760         return err;
761 }
762
763 /*
764  * ext4_ext_split:
765  * inserts new subtree into the path, using free index entry
766  * at depth @at:
767  * - allocates all needed blocks (new leaf and all intermediate index blocks)
768  * - makes decision where to split
769  * - moves remaining extents and index entries (right to the split point)
770  *   into the newly allocated blocks
771  * - initializes subtree
772  */
773 static int ext4_ext_split(handle_t *handle, struct inode *inode,
774                                 struct ext4_ext_path *path,
775                                 struct ext4_extent *newext, int at)
776 {
777         struct buffer_head *bh = NULL;
778         int depth = ext_depth(inode);
779         struct ext4_extent_header *neh;
780         struct ext4_extent_idx *fidx;
781         struct ext4_extent *ex;
782         int i = at, k, m, a;
783         ext4_fsblk_t newblock, oldblock;
784         __le32 border;
785         ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
786         int err = 0;
787
788         /* make decision: where to split? */
789         /* FIXME: now decision is simplest: at current extent */
790
791         /* if current leaf will be split, then we should use
792          * border from split point */
793         BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
794         if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
795                 border = path[depth].p_ext[1].ee_block;
796                 ext_debug("leaf will be split."
797                                 " next leaf starts at %d\n",
798                                   le32_to_cpu(border));
799         } else {
800                 border = newext->ee_block;
801                 ext_debug("leaf will be added."
802                                 " next leaf starts at %d\n",
803                                 le32_to_cpu(border));
804         }
805
806         /*
807          * If error occurs, then we break processing
808          * and mark filesystem read-only. index won't
809          * be inserted and tree will be in consistent
810          * state. Next mount will repair buffers too.
811          */
812
813         /*
814          * Get array to track all allocated blocks.
815          * We need this to handle errors and free blocks
816          * upon them.
817          */
818         ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
819         if (!ablocks)
820                 return -ENOMEM;
821
822         /* allocate all needed blocks */
823         ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
824         for (a = 0; a < depth - at; a++) {
825                 newblock = ext4_ext_new_meta_block(handle, inode, path,
826                                                    newext, &err);
827                 if (newblock == 0)
828                         goto cleanup;
829                 ablocks[a] = newblock;
830         }
831
832         /* initialize new leaf */
833         newblock = ablocks[--a];
834         BUG_ON(newblock == 0);
835         bh = sb_getblk(inode->i_sb, newblock);
836         if (!bh) {
837                 err = -EIO;
838                 goto cleanup;
839         }
840         lock_buffer(bh);
841
842         err = ext4_journal_get_create_access(handle, bh);
843         if (err)
844                 goto cleanup;
845
846         neh = ext_block_hdr(bh);
847         neh->eh_entries = 0;
848         neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
849         neh->eh_magic = EXT4_EXT_MAGIC;
850         neh->eh_depth = 0;
851         ex = EXT_FIRST_EXTENT(neh);
852
853         /* move remainder of path[depth] to the new leaf */
854         BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
855         /* start copy from next extent */
856         /* TODO: we could do it by single memmove */
857         m = 0;
858         path[depth].p_ext++;
859         while (path[depth].p_ext <=
860                         EXT_MAX_EXTENT(path[depth].p_hdr)) {
861                 ext_debug("move %d:%llu:%d in new leaf %llu\n",
862                                 le32_to_cpu(path[depth].p_ext->ee_block),
863                                 ext_pblock(path[depth].p_ext),
864                                 ext4_ext_get_actual_len(path[depth].p_ext),
865                                 newblock);
866                 /*memmove(ex++, path[depth].p_ext++,
867                                 sizeof(struct ext4_extent));
868                 neh->eh_entries++;*/
869                 path[depth].p_ext++;
870                 m++;
871         }
872         if (m) {
873                 memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
874                 le16_add_cpu(&neh->eh_entries, m);
875         }
876
877         set_buffer_uptodate(bh);
878         unlock_buffer(bh);
879
880         err = ext4_handle_dirty_metadata(handle, inode, bh);
881         if (err)
882                 goto cleanup;
883         brelse(bh);
884         bh = NULL;
885
886         /* correct old leaf */
887         if (m) {
888                 err = ext4_ext_get_access(handle, inode, path + depth);
889                 if (err)
890                         goto cleanup;
891                 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
892                 err = ext4_ext_dirty(handle, inode, path + depth);
893                 if (err)
894                         goto cleanup;
895
896         }
897
898         /* create intermediate indexes */
899         k = depth - at - 1;
900         BUG_ON(k < 0);
901         if (k)
902                 ext_debug("create %d intermediate indices\n", k);
903         /* insert new index into current index block */
904         /* current depth stored in i var */
905         i = depth - 1;
906         while (k--) {
907                 oldblock = newblock;
908                 newblock = ablocks[--a];
909                 bh = sb_getblk(inode->i_sb, newblock);
910                 if (!bh) {
911                         err = -EIO;
912                         goto cleanup;
913                 }
914                 lock_buffer(bh);
915
916                 err = ext4_journal_get_create_access(handle, bh);
917                 if (err)
918                         goto cleanup;
919
920                 neh = ext_block_hdr(bh);
921                 neh->eh_entries = cpu_to_le16(1);
922                 neh->eh_magic = EXT4_EXT_MAGIC;
923                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
924                 neh->eh_depth = cpu_to_le16(depth - i);
925                 fidx = EXT_FIRST_INDEX(neh);
926                 fidx->ei_block = border;
927                 ext4_idx_store_pblock(fidx, oldblock);
928
929                 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
930                                 i, newblock, le32_to_cpu(border), oldblock);
931                 /* copy indexes */
932                 m = 0;
933                 path[i].p_idx++;
934
935                 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
936                                 EXT_MAX_INDEX(path[i].p_hdr));
937                 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
938                                 EXT_LAST_INDEX(path[i].p_hdr));
939                 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
940                         ext_debug("%d: move %d:%llu in new index %llu\n", i,
941                                         le32_to_cpu(path[i].p_idx->ei_block),
942                                         idx_pblock(path[i].p_idx),
943                                         newblock);
944                         /*memmove(++fidx, path[i].p_idx++,
945                                         sizeof(struct ext4_extent_idx));
946                         neh->eh_entries++;
947                         BUG_ON(neh->eh_entries > neh->eh_max);*/
948                         path[i].p_idx++;
949                         m++;
950                 }
951                 if (m) {
952                         memmove(++fidx, path[i].p_idx - m,
953                                 sizeof(struct ext4_extent_idx) * m);
954                         le16_add_cpu(&neh->eh_entries, m);
955                 }
956                 set_buffer_uptodate(bh);
957                 unlock_buffer(bh);
958
959                 err = ext4_handle_dirty_metadata(handle, inode, bh);
960                 if (err)
961                         goto cleanup;
962                 brelse(bh);
963                 bh = NULL;
964
965                 /* correct old index */
966                 if (m) {
967                         err = ext4_ext_get_access(handle, inode, path + i);
968                         if (err)
969                                 goto cleanup;
970                         le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
971                         err = ext4_ext_dirty(handle, inode, path + i);
972                         if (err)
973                                 goto cleanup;
974                 }
975
976                 i--;
977         }
978
979         /* insert new index */
980         err = ext4_ext_insert_index(handle, inode, path + at,
981                                     le32_to_cpu(border), newblock);
982
983 cleanup:
984         if (bh) {
985                 if (buffer_locked(bh))
986                         unlock_buffer(bh);
987                 brelse(bh);
988         }
989
990         if (err) {
991                 /* free all allocated blocks in error case */
992                 for (i = 0; i < depth; i++) {
993                         if (!ablocks[i])
994                                 continue;
995                         ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
996                 }
997         }
998         kfree(ablocks);
999
1000         return err;
1001 }
1002
1003 /*
1004  * ext4_ext_grow_indepth:
1005  * implements tree growing procedure:
1006  * - allocates new block
1007  * - moves top-level data (index block or leaf) into the new block
1008  * - initializes new top-level, creating index that points to the
1009  *   just created block
1010  */
1011 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1012                                         struct ext4_ext_path *path,
1013                                         struct ext4_extent *newext)
1014 {
1015         struct ext4_ext_path *curp = path;
1016         struct ext4_extent_header *neh;
1017         struct ext4_extent_idx *fidx;
1018         struct buffer_head *bh;
1019         ext4_fsblk_t newblock;
1020         int err = 0;
1021
1022         newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
1023         if (newblock == 0)
1024                 return err;
1025
1026         bh = sb_getblk(inode->i_sb, newblock);
1027         if (!bh) {
1028                 err = -EIO;
1029                 ext4_std_error(inode->i_sb, err);
1030                 return err;
1031         }
1032         lock_buffer(bh);
1033
1034         err = ext4_journal_get_create_access(handle, bh);
1035         if (err) {
1036                 unlock_buffer(bh);
1037                 goto out;
1038         }
1039
1040         /* move top-level index/leaf into new block */
1041         memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
1042
1043         /* set size of new block */
1044         neh = ext_block_hdr(bh);
1045         /* old root could have indexes or leaves
1046          * so calculate e_max right way */
1047         if (ext_depth(inode))
1048           neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
1049         else
1050           neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
1051         neh->eh_magic = EXT4_EXT_MAGIC;
1052         set_buffer_uptodate(bh);
1053         unlock_buffer(bh);
1054
1055         err = ext4_handle_dirty_metadata(handle, inode, bh);
1056         if (err)
1057                 goto out;
1058
1059         /* create index in new top-level index: num,max,pointer */
1060         err = ext4_ext_get_access(handle, inode, curp);
1061         if (err)
1062                 goto out;
1063
1064         curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1065         curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
1066         curp->p_hdr->eh_entries = cpu_to_le16(1);
1067         curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1068
1069         if (path[0].p_hdr->eh_depth)
1070                 curp->p_idx->ei_block =
1071                         EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
1072         else
1073                 curp->p_idx->ei_block =
1074                         EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
1075         ext4_idx_store_pblock(curp->p_idx, newblock);
1076
1077         neh = ext_inode_hdr(inode);
1078         fidx = EXT_FIRST_INDEX(neh);
1079         ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1080                   le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1081                   le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
1082
1083         neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1084         err = ext4_ext_dirty(handle, inode, curp);
1085 out:
1086         brelse(bh);
1087
1088         return err;
1089 }
1090
1091 /*
1092  * ext4_ext_create_new_leaf:
1093  * finds empty index and adds new leaf.
1094  * if no free index is found, then it requests in-depth growing.
1095  */
1096 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1097                                         struct ext4_ext_path *path,
1098                                         struct ext4_extent *newext)
1099 {
1100         struct ext4_ext_path *curp;
1101         int depth, i, err = 0;
1102
1103 repeat:
1104         i = depth = ext_depth(inode);
1105
1106         /* walk up to the tree and look for free index entry */
1107         curp = path + depth;
1108         while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1109                 i--;
1110                 curp--;
1111         }
1112
1113         /* we use already allocated block for index block,
1114          * so subsequent data blocks should be contiguous */
1115         if (EXT_HAS_FREE_INDEX(curp)) {
1116                 /* if we found index with free entry, then use that
1117                  * entry: create all needed subtree and add new leaf */
1118                 err = ext4_ext_split(handle, inode, path, newext, i);
1119                 if (err)
1120                         goto out;
1121
1122                 /* refill path */
1123                 ext4_ext_drop_refs(path);
1124                 path = ext4_ext_find_extent(inode,
1125                                     (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1126                                     path);
1127                 if (IS_ERR(path))
1128                         err = PTR_ERR(path);
1129         } else {
1130                 /* tree is full, time to grow in depth */
1131                 err = ext4_ext_grow_indepth(handle, inode, path, newext);
1132                 if (err)
1133                         goto out;
1134
1135                 /* refill path */
1136                 ext4_ext_drop_refs(path);
1137                 path = ext4_ext_find_extent(inode,
1138                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1139                                     path);
1140                 if (IS_ERR(path)) {
1141                         err = PTR_ERR(path);
1142                         goto out;
1143                 }
1144
1145                 /*
1146                  * only first (depth 0 -> 1) produces free space;
1147                  * in all other cases we have to split the grown tree
1148                  */
1149                 depth = ext_depth(inode);
1150                 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1151                         /* now we need to split */
1152                         goto repeat;
1153                 }
1154         }
1155
1156 out:
1157         return err;
1158 }
1159
1160 /*
1161  * search the closest allocated block to the left for *logical
1162  * and returns it at @logical + it's physical address at @phys
1163  * if *logical is the smallest allocated block, the function
1164  * returns 0 at @phys
1165  * return value contains 0 (success) or error code
1166  */
1167 int
1168 ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1169                         ext4_lblk_t *logical, ext4_fsblk_t *phys)
1170 {
1171         struct ext4_extent_idx *ix;
1172         struct ext4_extent *ex;
1173         int depth, ee_len;
1174
1175         BUG_ON(path == NULL);
1176         depth = path->p_depth;
1177         *phys = 0;
1178
1179         if (depth == 0 && path->p_ext == NULL)
1180                 return 0;
1181
1182         /* usually extent in the path covers blocks smaller
1183          * then *logical, but it can be that extent is the
1184          * first one in the file */
1185
1186         ex = path[depth].p_ext;
1187         ee_len = ext4_ext_get_actual_len(ex);
1188         if (*logical < le32_to_cpu(ex->ee_block)) {
1189                 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1190                 while (--depth >= 0) {
1191                         ix = path[depth].p_idx;
1192                         BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1193                 }
1194                 return 0;
1195         }
1196
1197         BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1198
1199         *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1200         *phys = ext_pblock(ex) + ee_len - 1;
1201         return 0;
1202 }
1203
1204 /*
1205  * search the closest allocated block to the right for *logical
1206  * and returns it at @logical + it's physical address at @phys
1207  * if *logical is the smallest allocated block, the function
1208  * returns 0 at @phys
1209  * return value contains 0 (success) or error code
1210  */
1211 int
1212 ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1213                         ext4_lblk_t *logical, ext4_fsblk_t *phys)
1214 {
1215         struct buffer_head *bh = NULL;
1216         struct ext4_extent_header *eh;
1217         struct ext4_extent_idx *ix;
1218         struct ext4_extent *ex;
1219         ext4_fsblk_t block;
1220         int depth;      /* Note, NOT eh_depth; depth from top of tree */
1221         int ee_len;
1222
1223         BUG_ON(path == NULL);
1224         depth = path->p_depth;
1225         *phys = 0;
1226
1227         if (depth == 0 && path->p_ext == NULL)
1228                 return 0;
1229
1230         /* usually extent in the path covers blocks smaller
1231          * then *logical, but it can be that extent is the
1232          * first one in the file */
1233
1234         ex = path[depth].p_ext;
1235         ee_len = ext4_ext_get_actual_len(ex);
1236         if (*logical < le32_to_cpu(ex->ee_block)) {
1237                 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1238                 while (--depth >= 0) {
1239                         ix = path[depth].p_idx;
1240                         BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1241                 }
1242                 *logical = le32_to_cpu(ex->ee_block);
1243                 *phys = ext_pblock(ex);
1244                 return 0;
1245         }
1246
1247         BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1248
1249         if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1250                 /* next allocated block in this leaf */
1251                 ex++;
1252                 *logical = le32_to_cpu(ex->ee_block);
1253                 *phys = ext_pblock(ex);
1254                 return 0;
1255         }
1256
1257         /* go up and search for index to the right */
1258         while (--depth >= 0) {
1259                 ix = path[depth].p_idx;
1260                 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1261                         goto got_index;
1262         }
1263
1264         /* we've gone up to the root and found no index to the right */
1265         return 0;
1266
1267 got_index:
1268         /* we've found index to the right, let's
1269          * follow it and find the closest allocated
1270          * block to the right */
1271         ix++;
1272         block = idx_pblock(ix);
1273         while (++depth < path->p_depth) {
1274                 bh = sb_bread(inode->i_sb, block);
1275                 if (bh == NULL)
1276                         return -EIO;
1277                 eh = ext_block_hdr(bh);
1278                 /* subtract from p_depth to get proper eh_depth */
1279                 if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1280                         put_bh(bh);
1281                         return -EIO;
1282                 }
1283                 ix = EXT_FIRST_INDEX(eh);
1284                 block = idx_pblock(ix);
1285                 put_bh(bh);
1286         }
1287
1288         bh = sb_bread(inode->i_sb, block);
1289         if (bh == NULL)
1290                 return -EIO;
1291         eh = ext_block_hdr(bh);
1292         if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1293                 put_bh(bh);
1294                 return -EIO;
1295         }
1296         ex = EXT_FIRST_EXTENT(eh);
1297         *logical = le32_to_cpu(ex->ee_block);
1298         *phys = ext_pblock(ex);
1299         put_bh(bh);
1300         return 0;
1301 }
1302
1303 /*
1304  * ext4_ext_next_allocated_block:
1305  * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1306  * NOTE: it considers block number from index entry as
1307  * allocated block. Thus, index entries have to be consistent
1308  * with leaves.
1309  */
1310 static ext4_lblk_t
1311 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1312 {
1313         int depth;
1314
1315         BUG_ON(path == NULL);
1316         depth = path->p_depth;
1317
1318         if (depth == 0 && path->p_ext == NULL)
1319                 return EXT_MAX_BLOCK;
1320
1321         while (depth >= 0) {
1322                 if (depth == path->p_depth) {
1323                         /* leaf */
1324                         if (path[depth].p_ext !=
1325                                         EXT_LAST_EXTENT(path[depth].p_hdr))
1326                           return le32_to_cpu(path[depth].p_ext[1].ee_block);
1327                 } else {
1328                         /* index */
1329                         if (path[depth].p_idx !=
1330                                         EXT_LAST_INDEX(path[depth].p_hdr))
1331                           return le32_to_cpu(path[depth].p_idx[1].ei_block);
1332                 }
1333                 depth--;
1334         }
1335
1336         return EXT_MAX_BLOCK;
1337 }
1338
1339 /*
1340  * ext4_ext_next_leaf_block:
1341  * returns first allocated block from next leaf or EXT_MAX_BLOCK
1342  */
1343 static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1344                                         struct ext4_ext_path *path)
1345 {
1346         int depth;
1347
1348         BUG_ON(path == NULL);
1349         depth = path->p_depth;
1350
1351         /* zero-tree has no leaf blocks at all */
1352         if (depth == 0)
1353                 return EXT_MAX_BLOCK;
1354
1355         /* go to index block */
1356         depth--;
1357
1358         while (depth >= 0) {
1359                 if (path[depth].p_idx !=
1360                                 EXT_LAST_INDEX(path[depth].p_hdr))
1361                         return (ext4_lblk_t)
1362                                 le32_to_cpu(path[depth].p_idx[1].ei_block);
1363                 depth--;
1364         }
1365
1366         return EXT_MAX_BLOCK;
1367 }
1368
1369 /*
1370  * ext4_ext_correct_indexes:
1371  * if leaf gets modified and modified extent is first in the leaf,
1372  * then we have to correct all indexes above.
1373  * TODO: do we need to correct tree in all cases?
1374  */
1375 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1376                                 struct ext4_ext_path *path)
1377 {
1378         struct ext4_extent_header *eh;
1379         int depth = ext_depth(inode);
1380         struct ext4_extent *ex;
1381         __le32 border;
1382         int k, err = 0;
1383
1384         eh = path[depth].p_hdr;
1385         ex = path[depth].p_ext;
1386         BUG_ON(ex == NULL);
1387         BUG_ON(eh == NULL);
1388
1389         if (depth == 0) {
1390                 /* there is no tree at all */
1391                 return 0;
1392         }
1393
1394         if (ex != EXT_FIRST_EXTENT(eh)) {
1395                 /* we correct tree if first leaf got modified only */
1396                 return 0;
1397         }
1398
1399         /*
1400          * TODO: we need correction if border is smaller than current one
1401          */
1402         k = depth - 1;
1403         border = path[depth].p_ext->ee_block;
1404         err = ext4_ext_get_access(handle, inode, path + k);
1405         if (err)
1406                 return err;
1407         path[k].p_idx->ei_block = border;
1408         err = ext4_ext_dirty(handle, inode, path + k);
1409         if (err)
1410                 return err;
1411
1412         while (k--) {
1413                 /* change all left-side indexes */
1414                 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1415                         break;
1416                 err = ext4_ext_get_access(handle, inode, path + k);
1417                 if (err)
1418                         break;
1419                 path[k].p_idx->ei_block = border;
1420                 err = ext4_ext_dirty(handle, inode, path + k);
1421                 if (err)
1422                         break;
1423         }
1424
1425         return err;
1426 }
1427
1428 static int
1429 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1430                                 struct ext4_extent *ex2)
1431 {
1432         unsigned short ext1_ee_len, ext2_ee_len, max_len;
1433
1434         /*
1435          * Make sure that either both extents are uninitialized, or
1436          * both are _not_.
1437          */
1438         if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1439                 return 0;
1440
1441         if (ext4_ext_is_uninitialized(ex1))
1442                 max_len = EXT_UNINIT_MAX_LEN;
1443         else
1444                 max_len = EXT_INIT_MAX_LEN;
1445
1446         ext1_ee_len = ext4_ext_get_actual_len(ex1);
1447         ext2_ee_len = ext4_ext_get_actual_len(ex2);
1448
1449         if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1450                         le32_to_cpu(ex2->ee_block))
1451                 return 0;
1452
1453         /*
1454          * To allow future support for preallocated extents to be added
1455          * as an RO_COMPAT feature, refuse to merge to extents if
1456          * this can result in the top bit of ee_len being set.
1457          */
1458         if (ext1_ee_len + ext2_ee_len > max_len)
1459                 return 0;
1460 #ifdef AGGRESSIVE_TEST
1461         if (ext1_ee_len >= 4)
1462                 return 0;
1463 #endif
1464
1465         if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
1466                 return 1;
1467         return 0;
1468 }
1469
1470 /*
1471  * This function tries to merge the "ex" extent to the next extent in the tree.
1472  * It always tries to merge towards right. If you want to merge towards
1473  * left, pass "ex - 1" as argument instead of "ex".
1474  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1475  * 1 if they got merged.
1476  */
1477 int ext4_ext_try_to_merge(struct inode *inode,
1478                           struct ext4_ext_path *path,
1479                           struct ext4_extent *ex)
1480 {
1481         struct ext4_extent_header *eh;
1482         unsigned int depth, len;
1483         int merge_done = 0;
1484         int uninitialized = 0;
1485
1486         depth = ext_depth(inode);
1487         BUG_ON(path[depth].p_hdr == NULL);
1488         eh = path[depth].p_hdr;
1489
1490         while (ex < EXT_LAST_EXTENT(eh)) {
1491                 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1492                         break;
1493                 /* merge with next extent! */
1494                 if (ext4_ext_is_uninitialized(ex))
1495                         uninitialized = 1;
1496                 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1497                                 + ext4_ext_get_actual_len(ex + 1));
1498                 if (uninitialized)
1499                         ext4_ext_mark_uninitialized(ex);
1500
1501                 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1502                         len = (EXT_LAST_EXTENT(eh) - ex - 1)
1503                                 * sizeof(struct ext4_extent);
1504                         memmove(ex + 1, ex + 2, len);
1505                 }
1506                 le16_add_cpu(&eh->eh_entries, -1);
1507                 merge_done = 1;
1508                 WARN_ON(eh->eh_entries == 0);
1509                 if (!eh->eh_entries)
1510                         ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
1511                            "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
1512         }
1513
1514         return merge_done;
1515 }
1516
1517 /*
1518  * check if a portion of the "newext" extent overlaps with an
1519  * existing extent.
1520  *
1521  * If there is an overlap discovered, it updates the length of the newext
1522  * such that there will be no overlap, and then returns 1.
1523  * If there is no overlap found, it returns 0.
1524  */
1525 unsigned int ext4_ext_check_overlap(struct inode *inode,
1526                                     struct ext4_extent *newext,
1527                                     struct ext4_ext_path *path)
1528 {
1529         ext4_lblk_t b1, b2;
1530         unsigned int depth, len1;
1531         unsigned int ret = 0;
1532
1533         b1 = le32_to_cpu(newext->ee_block);
1534         len1 = ext4_ext_get_actual_len(newext);
1535         depth = ext_depth(inode);
1536         if (!path[depth].p_ext)
1537                 goto out;
1538         b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1539
1540         /*
1541          * get the next allocated block if the extent in the path
1542          * is before the requested block(s)
1543          */
1544         if (b2 < b1) {
1545                 b2 = ext4_ext_next_allocated_block(path);
1546                 if (b2 == EXT_MAX_BLOCK)
1547                         goto out;
1548         }
1549
1550         /* check for wrap through zero on extent logical start block*/
1551         if (b1 + len1 < b1) {
1552                 len1 = EXT_MAX_BLOCK - b1;
1553                 newext->ee_len = cpu_to_le16(len1);
1554                 ret = 1;
1555         }
1556
1557         /* check for overlap */
1558         if (b1 + len1 > b2) {
1559                 newext->ee_len = cpu_to_le16(b2 - b1);
1560                 ret = 1;
1561         }
1562 out:
1563         return ret;
1564 }
1565
1566 /*
1567  * ext4_ext_insert_extent:
1568  * tries to merge requsted extent into the existing extent or
1569  * inserts requested extent as new one into the tree,
1570  * creating new leaf in the no-space case.
1571  */
1572 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1573                                 struct ext4_ext_path *path,
1574                                 struct ext4_extent *newext)
1575 {
1576         struct ext4_extent_header *eh;
1577         struct ext4_extent *ex, *fex;
1578         struct ext4_extent *nearex; /* nearest extent */
1579         struct ext4_ext_path *npath = NULL;
1580         int depth, len, err;
1581         ext4_lblk_t next;
1582         unsigned uninitialized = 0;
1583
1584         BUG_ON(ext4_ext_get_actual_len(newext) == 0);
1585         depth = ext_depth(inode);
1586         ex = path[depth].p_ext;
1587         BUG_ON(path[depth].p_hdr == NULL);
1588
1589         /* try to insert block into found extent and return */
1590         if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1591                 ext_debug("append %d block to %d:%d (from %llu)\n",
1592                                 ext4_ext_get_actual_len(newext),
1593                                 le32_to_cpu(ex->ee_block),
1594                                 ext4_ext_get_actual_len(ex), ext_pblock(ex));
1595                 err = ext4_ext_get_access(handle, inode, path + depth);
1596                 if (err)
1597                         return err;
1598
1599                 /*
1600                  * ext4_can_extents_be_merged should have checked that either
1601                  * both extents are uninitialized, or both aren't. Thus we
1602                  * need to check only one of them here.
1603                  */
1604                 if (ext4_ext_is_uninitialized(ex))
1605                         uninitialized = 1;
1606                 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1607                                         + ext4_ext_get_actual_len(newext));
1608                 if (uninitialized)
1609                         ext4_ext_mark_uninitialized(ex);
1610                 eh = path[depth].p_hdr;
1611                 nearex = ex;
1612                 goto merge;
1613         }
1614
1615 repeat:
1616         depth = ext_depth(inode);
1617         eh = path[depth].p_hdr;
1618         if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1619                 goto has_space;
1620
1621         /* probably next leaf has space for us? */
1622         fex = EXT_LAST_EXTENT(eh);
1623         next = ext4_ext_next_leaf_block(inode, path);
1624         if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1625             && next != EXT_MAX_BLOCK) {
1626                 ext_debug("next leaf block - %d\n", next);
1627                 BUG_ON(npath != NULL);
1628                 npath = ext4_ext_find_extent(inode, next, NULL);
1629                 if (IS_ERR(npath))
1630                         return PTR_ERR(npath);
1631                 BUG_ON(npath->p_depth != path->p_depth);
1632                 eh = npath[depth].p_hdr;
1633                 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1634                         ext_debug("next leaf isnt full(%d)\n",
1635                                   le16_to_cpu(eh->eh_entries));
1636                         path = npath;
1637                         goto repeat;
1638                 }
1639                 ext_debug("next leaf has no free space(%d,%d)\n",
1640                           le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1641         }
1642
1643         /*
1644          * There is no free space in the found leaf.
1645          * We're gonna add a new leaf in the tree.
1646          */
1647         err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1648         if (err)
1649                 goto cleanup;
1650         depth = ext_depth(inode);
1651         eh = path[depth].p_hdr;
1652
1653 has_space:
1654         nearex = path[depth].p_ext;
1655
1656         err = ext4_ext_get_access(handle, inode, path + depth);
1657         if (err)
1658                 goto cleanup;
1659
1660         if (!nearex) {
1661                 /* there is no extent in this leaf, create first one */
1662                 ext_debug("first extent in the leaf: %d:%llu:%d\n",
1663                                 le32_to_cpu(newext->ee_block),
1664                                 ext_pblock(newext),
1665                                 ext4_ext_get_actual_len(newext));
1666                 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1667         } else if (le32_to_cpu(newext->ee_block)
1668                            > le32_to_cpu(nearex->ee_block)) {
1669 /*              BUG_ON(newext->ee_block == nearex->ee_block); */
1670                 if (nearex != EXT_LAST_EXTENT(eh)) {
1671                         len = EXT_MAX_EXTENT(eh) - nearex;
1672                         len = (len - 1) * sizeof(struct ext4_extent);
1673                         len = len < 0 ? 0 : len;
1674                         ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
1675                                         "move %d from 0x%p to 0x%p\n",
1676                                         le32_to_cpu(newext->ee_block),
1677                                         ext_pblock(newext),
1678                                         ext4_ext_get_actual_len(newext),
1679                                         nearex, len, nearex + 1, nearex + 2);
1680                         memmove(nearex + 2, nearex + 1, len);
1681                 }
1682                 path[depth].p_ext = nearex + 1;
1683         } else {
1684                 BUG_ON(newext->ee_block == nearex->ee_block);
1685                 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1686                 len = len < 0 ? 0 : len;
1687                 ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
1688                                 "move %d from 0x%p to 0x%p\n",
1689                                 le32_to_cpu(newext->ee_block),
1690                                 ext_pblock(newext),
1691                                 ext4_ext_get_actual_len(newext),
1692                                 nearex, len, nearex + 1, nearex + 2);
1693                 memmove(nearex + 1, nearex, len);
1694                 path[depth].p_ext = nearex;
1695         }
1696
1697         le16_add_cpu(&eh->eh_entries, 1);
1698         nearex = path[depth].p_ext;
1699         nearex->ee_block = newext->ee_block;
1700         ext4_ext_store_pblock(nearex, ext_pblock(newext));
1701         nearex->ee_len = newext->ee_len;
1702
1703 merge:
1704         /* try to merge extents to the right */
1705         ext4_ext_try_to_merge(inode, path, nearex);
1706
1707         /* try to merge extents to the left */
1708
1709         /* time to correct all indexes above */
1710         err = ext4_ext_correct_indexes(handle, inode, path);
1711         if (err)
1712                 goto cleanup;
1713
1714         err = ext4_ext_dirty(handle, inode, path + depth);
1715
1716 cleanup:
1717         if (npath) {
1718                 ext4_ext_drop_refs(npath);
1719                 kfree(npath);
1720         }
1721         ext4_ext_invalidate_cache(inode);
1722         return err;
1723 }
1724
1725 int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1726                         ext4_lblk_t num, ext_prepare_callback func,
1727                         void *cbdata)
1728 {
1729         struct ext4_ext_path *path = NULL;
1730         struct ext4_ext_cache cbex;
1731         struct ext4_extent *ex;
1732         ext4_lblk_t next, start = 0, end = 0;
1733         ext4_lblk_t last = block + num;
1734         int depth, exists, err = 0;
1735
1736         BUG_ON(func == NULL);
1737         BUG_ON(inode == NULL);
1738
1739         while (block < last && block != EXT_MAX_BLOCK) {
1740                 num = last - block;
1741                 /* find extent for this block */
1742                 path = ext4_ext_find_extent(inode, block, path);
1743                 if (IS_ERR(path)) {
1744                         err = PTR_ERR(path);
1745                         path = NULL;
1746                         break;
1747                 }
1748
1749                 depth = ext_depth(inode);
1750                 BUG_ON(path[depth].p_hdr == NULL);
1751                 ex = path[depth].p_ext;
1752                 next = ext4_ext_next_allocated_block(path);
1753
1754                 exists = 0;
1755                 if (!ex) {
1756                         /* there is no extent yet, so try to allocate
1757                          * all requested space */
1758                         start = block;
1759                         end = block + num;
1760                 } else if (le32_to_cpu(ex->ee_block) > block) {
1761                         /* need to allocate space before found extent */
1762                         start = block;
1763                         end = le32_to_cpu(ex->ee_block);
1764                         if (block + num < end)
1765                                 end = block + num;
1766                 } else if (block >= le32_to_cpu(ex->ee_block)
1767                                         + ext4_ext_get_actual_len(ex)) {
1768                         /* need to allocate space after found extent */
1769                         start = block;
1770                         end = block + num;
1771                         if (end >= next)
1772                                 end = next;
1773                 } else if (block >= le32_to_cpu(ex->ee_block)) {
1774                         /*
1775                          * some part of requested space is covered
1776                          * by found extent
1777                          */
1778                         start = block;
1779                         end = le32_to_cpu(ex->ee_block)
1780                                 + ext4_ext_get_actual_len(ex);
1781                         if (block + num < end)
1782                                 end = block + num;
1783                         exists = 1;
1784                 } else {
1785                         BUG();
1786                 }
1787                 BUG_ON(end <= start);
1788
1789                 if (!exists) {
1790                         cbex.ec_block = start;
1791                         cbex.ec_len = end - start;
1792                         cbex.ec_start = 0;
1793                         cbex.ec_type = EXT4_EXT_CACHE_GAP;
1794                 } else {
1795                         cbex.ec_block = le32_to_cpu(ex->ee_block);
1796                         cbex.ec_len = ext4_ext_get_actual_len(ex);
1797                         cbex.ec_start = ext_pblock(ex);
1798                         cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1799                 }
1800
1801                 BUG_ON(cbex.ec_len == 0);
1802                 err = func(inode, path, &cbex, ex, cbdata);
1803                 ext4_ext_drop_refs(path);
1804
1805                 if (err < 0)
1806                         break;
1807
1808                 if (err == EXT_REPEAT)
1809                         continue;
1810                 else if (err == EXT_BREAK) {
1811                         err = 0;
1812                         break;
1813                 }
1814
1815                 if (ext_depth(inode) != depth) {
1816                         /* depth was changed. we have to realloc path */
1817                         kfree(path);
1818                         path = NULL;
1819                 }
1820
1821                 block = cbex.ec_block + cbex.ec_len;
1822         }
1823
1824         if (path) {
1825                 ext4_ext_drop_refs(path);
1826                 kfree(path);
1827         }
1828
1829         return err;
1830 }
1831
1832 static void
1833 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1834                         __u32 len, ext4_fsblk_t start, int type)
1835 {
1836         struct ext4_ext_cache *cex;
1837         BUG_ON(len == 0);
1838         cex = &EXT4_I(inode)->i_cached_extent;
1839         cex->ec_type = type;
1840         cex->ec_block = block;
1841         cex->ec_len = len;
1842         cex->ec_start = start;
1843 }
1844
1845 /*
1846  * ext4_ext_put_gap_in_cache:
1847  * calculate boundaries of the gap that the requested block fits into
1848  * and cache this gap
1849  */
1850 static void
1851 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1852                                 ext4_lblk_t block)
1853 {
1854         int depth = ext_depth(inode);
1855         unsigned long len;
1856         ext4_lblk_t lblock;
1857         struct ext4_extent *ex;
1858
1859         ex = path[depth].p_ext;
1860         if (ex == NULL) {
1861                 /* there is no extent yet, so gap is [0;-] */
1862                 lblock = 0;
1863                 len = EXT_MAX_BLOCK;
1864                 ext_debug("cache gap(whole file):");
1865         } else if (block < le32_to_cpu(ex->ee_block)) {
1866                 lblock = block;
1867                 len = le32_to_cpu(ex->ee_block) - block;
1868                 ext_debug("cache gap(before): %u [%u:%u]",
1869                                 block,
1870                                 le32_to_cpu(ex->ee_block),
1871                                  ext4_ext_get_actual_len(ex));
1872         } else if (block >= le32_to_cpu(ex->ee_block)
1873                         + ext4_ext_get_actual_len(ex)) {
1874                 ext4_lblk_t next;
1875                 lblock = le32_to_cpu(ex->ee_block)
1876                         + ext4_ext_get_actual_len(ex);
1877
1878                 next = ext4_ext_next_allocated_block(path);
1879                 ext_debug("cache gap(after): [%u:%u] %u",
1880                                 le32_to_cpu(ex->ee_block),
1881                                 ext4_ext_get_actual_len(ex),
1882                                 block);
1883                 BUG_ON(next == lblock);
1884                 len = next - lblock;
1885         } else {
1886                 lblock = len = 0;
1887                 BUG();
1888         }
1889
1890         ext_debug(" -> %u:%lu\n", lblock, len);
1891         ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1892 }
1893
1894 static int
1895 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1896                         struct ext4_extent *ex)
1897 {
1898         struct ext4_ext_cache *cex;
1899
1900         cex = &EXT4_I(inode)->i_cached_extent;
1901
1902         /* has cache valid data? */
1903         if (cex->ec_type == EXT4_EXT_CACHE_NO)
1904                 return EXT4_EXT_CACHE_NO;
1905
1906         BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1907                         cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1908         if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
1909                 ex->ee_block = cpu_to_le32(cex->ec_block);
1910                 ext4_ext_store_pblock(ex, cex->ec_start);
1911                 ex->ee_len = cpu_to_le16(cex->ec_len);
1912                 ext_debug("%u cached by %u:%u:%llu\n",
1913                                 block,
1914                                 cex->ec_block, cex->ec_len, cex->ec_start);
1915                 return cex->ec_type;
1916         }
1917
1918         /* not in cache */
1919         return EXT4_EXT_CACHE_NO;
1920 }
1921
1922 /*
1923  * ext4_ext_rm_idx:
1924  * removes index from the index block.
1925  * It's used in truncate case only, thus all requests are for
1926  * last index in the block only.
1927  */
1928 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1929                         struct ext4_ext_path *path)
1930 {
1931         struct buffer_head *bh;
1932         int err;
1933         ext4_fsblk_t leaf;
1934
1935         /* free index block */
1936         path--;
1937         leaf = idx_pblock(path->p_idx);
1938         BUG_ON(path->p_hdr->eh_entries == 0);
1939         err = ext4_ext_get_access(handle, inode, path);
1940         if (err)
1941                 return err;
1942         le16_add_cpu(&path->p_hdr->eh_entries, -1);
1943         err = ext4_ext_dirty(handle, inode, path);
1944         if (err)
1945                 return err;
1946         ext_debug("index is empty, remove it, free block %llu\n", leaf);
1947         bh = sb_find_get_block(inode->i_sb, leaf);
1948         ext4_forget(handle, 1, inode, bh, leaf);
1949         ext4_free_blocks(handle, inode, leaf, 1, 1);
1950         return err;
1951 }
1952
1953 /*
1954  * ext4_ext_calc_credits_for_single_extent:
1955  * This routine returns max. credits that needed to insert an extent
1956  * to the extent tree.
1957  * When pass the actual path, the caller should calculate credits
1958  * under i_data_sem.
1959  */
1960 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1961                                                 struct ext4_ext_path *path)
1962 {
1963         if (path) {
1964                 int depth = ext_depth(inode);
1965                 int ret = 0;
1966
1967                 /* probably there is space in leaf? */
1968                 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1969                                 < le16_to_cpu(path[depth].p_hdr->eh_max)) {
1970
1971                         /*
1972                          *  There are some space in the leaf tree, no
1973                          *  need to account for leaf block credit
1974                          *
1975                          *  bitmaps and block group descriptor blocks
1976                          *  and other metadat blocks still need to be
1977                          *  accounted.
1978                          */
1979                         /* 1 bitmap, 1 block group descriptor */
1980                         ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
1981                 }
1982         }
1983
1984         return ext4_chunk_trans_blocks(inode, nrblocks);
1985 }
1986
1987 /*
1988  * How many index/leaf blocks need to change/allocate to modify nrblocks?
1989  *
1990  * if nrblocks are fit in a single extent (chunk flag is 1), then
1991  * in the worse case, each tree level index/leaf need to be changed
1992  * if the tree split due to insert a new extent, then the old tree
1993  * index/leaf need to be updated too
1994  *
1995  * If the nrblocks are discontiguous, they could cause
1996  * the whole tree split more than once, but this is really rare.
1997  */
1998 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
1999 {
2000         int index;
2001         int depth = ext_depth(inode);
2002
2003         if (chunk)
2004                 index = depth * 2;
2005         else
2006                 index = depth * 3;
2007
2008         return index;
2009 }
2010
2011 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2012                                 struct ext4_extent *ex,
2013                                 ext4_lblk_t from, ext4_lblk_t to)
2014 {
2015         struct buffer_head *bh;
2016         unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2017         int i, metadata = 0;
2018
2019         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2020                 metadata = 1;
2021 #ifdef EXTENTS_STATS
2022         {
2023                 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2024                 spin_lock(&sbi->s_ext_stats_lock);
2025                 sbi->s_ext_blocks += ee_len;
2026                 sbi->s_ext_extents++;
2027                 if (ee_len < sbi->s_ext_min)
2028                         sbi->s_ext_min = ee_len;
2029                 if (ee_len > sbi->s_ext_max)
2030                         sbi->s_ext_max = ee_len;
2031                 if (ext_depth(inode) > sbi->s_depth_max)
2032                         sbi->s_depth_max = ext_depth(inode);
2033                 spin_unlock(&sbi->s_ext_stats_lock);
2034         }
2035 #endif
2036         if (from >= le32_to_cpu(ex->ee_block)
2037             && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2038                 /* tail removal */
2039                 ext4_lblk_t num;
2040                 ext4_fsblk_t start;
2041
2042                 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2043                 start = ext_pblock(ex) + ee_len - num;
2044                 ext_debug("free last %u blocks starting %llu\n", num, start);
2045                 for (i = 0; i < num; i++) {
2046                         bh = sb_find_get_block(inode->i_sb, start + i);
2047                         ext4_forget(handle, 0, inode, bh, start + i);
2048                 }
2049                 ext4_free_blocks(handle, inode, start, num, metadata);
2050         } else if (from == le32_to_cpu(ex->ee_block)
2051                    && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2052                 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
2053                         from, to, le32_to_cpu(ex->ee_block), ee_len);
2054         } else {
2055                 printk(KERN_INFO "strange request: removal(2) "
2056                                 "%u-%u from %u:%u\n",
2057                                 from, to, le32_to_cpu(ex->ee_block), ee_len);
2058         }
2059         return 0;
2060 }
2061
2062 static int
2063 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2064                 struct ext4_ext_path *path, ext4_lblk_t start)
2065 {
2066         int err = 0, correct_index = 0;
2067         int depth = ext_depth(inode), credits;
2068         struct ext4_extent_header *eh;
2069         ext4_lblk_t a, b, block;
2070         unsigned num;
2071         ext4_lblk_t ex_ee_block;
2072         unsigned short ex_ee_len;
2073         unsigned uninitialized = 0;
2074         struct ext4_extent *ex;
2075
2076         /* the header must be checked already in ext4_ext_remove_space() */
2077         ext_debug("truncate since %u in leaf\n", start);
2078         if (!path[depth].p_hdr)
2079                 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2080         eh = path[depth].p_hdr;
2081         BUG_ON(eh == NULL);
2082
2083         /* find where to start removing */
2084         ex = EXT_LAST_EXTENT(eh);
2085
2086         ex_ee_block = le32_to_cpu(ex->ee_block);
2087         if (ext4_ext_is_uninitialized(ex))
2088                 uninitialized = 1;
2089         ex_ee_len = ext4_ext_get_actual_len(ex);
2090
2091         while (ex >= EXT_FIRST_EXTENT(eh) &&
2092                         ex_ee_block + ex_ee_len > start) {
2093                 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
2094                 path[depth].p_ext = ex;
2095
2096                 a = ex_ee_block > start ? ex_ee_block : start;
2097                 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
2098                         ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
2099
2100                 ext_debug("  border %u:%u\n", a, b);
2101
2102                 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
2103                         block = 0;
2104                         num = 0;
2105                         BUG();
2106                 } else if (a != ex_ee_block) {
2107                         /* remove tail of the extent */
2108                         block = ex_ee_block;
2109                         num = a - block;
2110                 } else if (b != ex_ee_block + ex_ee_len - 1) {
2111                         /* remove head of the extent */
2112                         block = a;
2113                         num = b - a;
2114                         /* there is no "make a hole" API yet */
2115                         BUG();
2116                 } else {
2117                         /* remove whole extent: excellent! */
2118                         block = ex_ee_block;
2119                         num = 0;
2120                         BUG_ON(a != ex_ee_block);
2121                         BUG_ON(b != ex_ee_block + ex_ee_len - 1);
2122                 }
2123
2124                 /*
2125                  * 3 for leaf, sb, and inode plus 2 (bmap and group
2126                  * descriptor) for each block group; assume two block
2127                  * groups plus ex_ee_len/blocks_per_block_group for
2128                  * the worst case
2129                  */
2130                 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2131                 if (ex == EXT_FIRST_EXTENT(eh)) {
2132                         correct_index = 1;
2133                         credits += (ext_depth(inode)) + 1;
2134                 }
2135                 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2136
2137                 err = ext4_ext_journal_restart(handle, credits);
2138                 if (err)
2139                         goto out;
2140
2141                 err = ext4_ext_get_access(handle, inode, path + depth);
2142                 if (err)
2143                         goto out;
2144
2145                 err = ext4_remove_blocks(handle, inode, ex, a, b);
2146                 if (err)
2147                         goto out;
2148
2149                 if (num == 0) {
2150                         /* this extent is removed; mark slot entirely unused */
2151                         ext4_ext_store_pblock(ex, 0);
2152                         le16_add_cpu(&eh->eh_entries, -1);
2153                 }
2154
2155                 ex->ee_block = cpu_to_le32(block);
2156                 ex->ee_len = cpu_to_le16(num);
2157                 /*
2158                  * Do not mark uninitialized if all the blocks in the
2159                  * extent have been removed.
2160                  */
2161                 if (uninitialized && num)
2162                         ext4_ext_mark_uninitialized(ex);
2163
2164                 err = ext4_ext_dirty(handle, inode, path + depth);
2165                 if (err)
2166                         goto out;
2167
2168                 ext_debug("new extent: %u:%u:%llu\n", block, num,
2169                                 ext_pblock(ex));
2170                 ex--;
2171                 ex_ee_block = le32_to_cpu(ex->ee_block);
2172                 ex_ee_len = ext4_ext_get_actual_len(ex);
2173         }
2174
2175         if (correct_index && eh->eh_entries)
2176                 err = ext4_ext_correct_indexes(handle, inode, path);
2177
2178         /* if this leaf is free, then we should
2179          * remove it from index block above */
2180         if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2181                 err = ext4_ext_rm_idx(handle, inode, path + depth);
2182
2183 out:
2184         return err;
2185 }
2186
2187 /*
2188  * ext4_ext_more_to_rm:
2189  * returns 1 if current index has to be freed (even partial)
2190  */
2191 static int
2192 ext4_ext_more_to_rm(struct ext4_ext_path *path)
2193 {
2194         BUG_ON(path->p_idx == NULL);
2195
2196         if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2197                 return 0;
2198
2199         /*
2200          * if truncate on deeper level happened, it wasn't partial,
2201          * so we have to consider current index for truncation
2202          */
2203         if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2204                 return 0;
2205         return 1;
2206 }
2207
2208 static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2209 {
2210         struct super_block *sb = inode->i_sb;
2211         int depth = ext_depth(inode);
2212         struct ext4_ext_path *path;
2213         handle_t *handle;
2214         int i = 0, err = 0;
2215
2216         ext_debug("truncate since %u\n", start);
2217
2218         /* probably first extent we're gonna free will be last in block */
2219         handle = ext4_journal_start(inode, depth + 1);
2220         if (IS_ERR(handle))
2221                 return PTR_ERR(handle);
2222
2223         ext4_ext_invalidate_cache(inode);
2224
2225         /*
2226          * We start scanning from right side, freeing all the blocks
2227          * after i_size and walking into the tree depth-wise.
2228          */
2229         path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2230         if (path == NULL) {
2231                 ext4_journal_stop(handle);
2232                 return -ENOMEM;
2233         }
2234         path[0].p_hdr = ext_inode_hdr(inode);
2235         if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2236                 err = -EIO;
2237                 goto out;
2238         }
2239         path[0].p_depth = depth;
2240
2241         while (i >= 0 && err == 0) {
2242                 if (i == depth) {
2243                         /* this is leaf block */
2244                         err = ext4_ext_rm_leaf(handle, inode, path, start);
2245                         /* root level has p_bh == NULL, brelse() eats this */
2246                         brelse(path[i].p_bh);
2247                         path[i].p_bh = NULL;
2248                         i--;
2249                         continue;
2250                 }
2251
2252                 /* this is index block */
2253                 if (!path[i].p_hdr) {
2254                         ext_debug("initialize header\n");
2255                         path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2256                 }
2257
2258                 if (!path[i].p_idx) {
2259                         /* this level hasn't been touched yet */
2260                         path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2261                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2262                         ext_debug("init index ptr: hdr 0x%p, num %d\n",
2263                                   path[i].p_hdr,
2264                                   le16_to_cpu(path[i].p_hdr->eh_entries));
2265                 } else {
2266                         /* we were already here, see at next index */
2267                         path[i].p_idx--;
2268                 }
2269
2270                 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2271                                 i, EXT_FIRST_INDEX(path[i].p_hdr),
2272                                 path[i].p_idx);
2273                 if (ext4_ext_more_to_rm(path + i)) {
2274                         struct buffer_head *bh;
2275                         /* go to the next level */
2276                         ext_debug("move to level %d (block %llu)\n",
2277                                   i + 1, idx_pblock(path[i].p_idx));
2278                         memset(path + i + 1, 0, sizeof(*path));
2279                         bh = sb_bread(sb, idx_pblock(path[i].p_idx));
2280                         if (!bh) {
2281                                 /* should we reset i_size? */
2282                                 err = -EIO;
2283                                 break;
2284                         }
2285                         if (WARN_ON(i + 1 > depth)) {
2286                                 err = -EIO;
2287                                 break;
2288                         }
2289                         if (ext4_ext_check(inode, ext_block_hdr(bh),
2290                                                         depth - i - 1)) {
2291                                 err = -EIO;
2292                                 break;
2293                         }
2294                         path[i + 1].p_bh = bh;
2295
2296                         /* save actual number of indexes since this
2297                          * number is changed at the next iteration */
2298                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2299                         i++;
2300                 } else {
2301                         /* we finished processing this index, go up */
2302                         if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2303                                 /* index is empty, remove it;
2304                                  * handle must be already prepared by the
2305                                  * truncatei_leaf() */
2306                                 err = ext4_ext_rm_idx(handle, inode, path + i);
2307                         }
2308                         /* root level has p_bh == NULL, brelse() eats this */
2309                         brelse(path[i].p_bh);
2310                         path[i].p_bh = NULL;
2311                         i--;
2312                         ext_debug("return to level %d\n", i);
2313                 }
2314         }
2315
2316         /* TODO: flexible tree reduction should be here */
2317         if (path->p_hdr->eh_entries == 0) {
2318                 /*
2319                  * truncate to zero freed all the tree,
2320                  * so we need to correct eh_depth
2321                  */
2322                 err = ext4_ext_get_access(handle, inode, path);
2323                 if (err == 0) {
2324                         ext_inode_hdr(inode)->eh_depth = 0;
2325                         ext_inode_hdr(inode)->eh_max =
2326                                 cpu_to_le16(ext4_ext_space_root(inode));
2327                         err = ext4_ext_dirty(handle, inode, path);
2328                 }
2329         }
2330 out:
2331         ext4_ext_drop_refs(path);
2332         kfree(path);
2333         ext4_journal_stop(handle);
2334
2335         return err;
2336 }
2337
2338 /*
2339  * called at mount time
2340  */
2341 void ext4_ext_init(struct super_block *sb)
2342 {
2343         /*
2344          * possible initialization would be here
2345          */
2346
2347         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2348                 printk(KERN_INFO "EXT4-fs: file extents enabled");
2349 #ifdef AGGRESSIVE_TEST
2350                 printk(", aggressive tests");
2351 #endif
2352 #ifdef CHECK_BINSEARCH
2353                 printk(", check binsearch");
2354 #endif
2355 #ifdef EXTENTS_STATS
2356                 printk(", stats");
2357 #endif
2358                 printk("\n");
2359 #ifdef EXTENTS_STATS
2360                 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2361                 EXT4_SB(sb)->s_ext_min = 1 << 30;
2362                 EXT4_SB(sb)->s_ext_max = 0;
2363 #endif
2364         }
2365 }
2366
2367 /*
2368  * called at umount time
2369  */
2370 void ext4_ext_release(struct super_block *sb)
2371 {
2372         if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2373                 return;
2374
2375 #ifdef EXTENTS_STATS
2376         if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2377                 struct ext4_sb_info *sbi = EXT4_SB(sb);
2378                 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2379                         sbi->s_ext_blocks, sbi->s_ext_extents,
2380                         sbi->s_ext_blocks / sbi->s_ext_extents);
2381                 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2382                         sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2383         }
2384 #endif
2385 }
2386
2387 static void bi_complete(struct bio *bio, int error)
2388 {
2389         complete((struct completion *)bio->bi_private);
2390 }
2391
2392 /* FIXME!! we need to try to merge to left or right after zero-out  */
2393 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2394 {
2395         int ret = -EIO;
2396         struct bio *bio;
2397         int blkbits, blocksize;
2398         sector_t ee_pblock;
2399         struct completion event;
2400         unsigned int ee_len, len, done, offset;
2401
2402
2403         blkbits   = inode->i_blkbits;
2404         blocksize = inode->i_sb->s_blocksize;
2405         ee_len    = ext4_ext_get_actual_len(ex);
2406         ee_pblock = ext_pblock(ex);
2407
2408         /* convert ee_pblock to 512 byte sectors */
2409         ee_pblock = ee_pblock << (blkbits - 9);
2410
2411         while (ee_len > 0) {
2412
2413                 if (ee_len > BIO_MAX_PAGES)
2414                         len = BIO_MAX_PAGES;
2415                 else
2416                         len = ee_len;
2417
2418                 bio = bio_alloc(GFP_NOIO, len);
2419                 bio->bi_sector = ee_pblock;
2420                 bio->bi_bdev   = inode->i_sb->s_bdev;
2421
2422                 done = 0;
2423                 offset = 0;
2424                 while (done < len) {
2425                         ret = bio_add_page(bio, ZERO_PAGE(0),
2426                                                         blocksize, offset);
2427                         if (ret != blocksize) {
2428                                 /*
2429                                  * We can't add any more pages because of
2430                                  * hardware limitations.  Start a new bio.
2431                                  */
2432                                 break;
2433                         }
2434                         done++;
2435                         offset += blocksize;
2436                         if (offset >= PAGE_CACHE_SIZE)
2437                                 offset = 0;
2438                 }
2439
2440                 init_completion(&event);
2441                 bio->bi_private = &event;
2442                 bio->bi_end_io = bi_complete;
2443                 submit_bio(WRITE, bio);
2444                 wait_for_completion(&event);
2445
2446                 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
2447                         ret = 0;
2448                 else {
2449                         ret = -EIO;
2450                         break;
2451                 }
2452                 bio_put(bio);
2453                 ee_len    -= done;
2454                 ee_pblock += done  << (blkbits - 9);
2455         }
2456         return ret;
2457 }
2458
2459 #define EXT4_EXT_ZERO_LEN 7
2460
2461 /*
2462  * This function is called by ext4_ext_get_blocks() if someone tries to write
2463  * to an uninitialized extent. It may result in splitting the uninitialized
2464  * extent into multiple extents (upto three - one initialized and two
2465  * uninitialized).
2466  * There are three possibilities:
2467  *   a> There is no split required: Entire extent should be initialized
2468  *   b> Splits in two extents: Write is happening at either end of the extent
2469  *   c> Splits in three extents: Somone is writing in middle of the extent
2470  */
2471 static int ext4_ext_convert_to_initialized(handle_t *handle,
2472                                                 struct inode *inode,
2473                                                 struct ext4_ext_path *path,
2474                                                 ext4_lblk_t iblock,
2475                                                 unsigned int max_blocks)
2476 {
2477         struct ext4_extent *ex, newex, orig_ex;
2478         struct ext4_extent *ex1 = NULL;
2479         struct ext4_extent *ex2 = NULL;
2480         struct ext4_extent *ex3 = NULL;
2481         struct ext4_extent_header *eh;
2482         ext4_lblk_t ee_block;
2483         unsigned int allocated, ee_len, depth;
2484         ext4_fsblk_t newblock;
2485         int err = 0;
2486         int ret = 0;
2487
2488         depth = ext_depth(inode);
2489         eh = path[depth].p_hdr;
2490         ex = path[depth].p_ext;
2491         ee_block = le32_to_cpu(ex->ee_block);
2492         ee_len = ext4_ext_get_actual_len(ex);
2493         allocated = ee_len - (iblock - ee_block);
2494         newblock = iblock - ee_block + ext_pblock(ex);
2495         ex2 = ex;
2496         orig_ex.ee_block = ex->ee_block;
2497         orig_ex.ee_len   = cpu_to_le16(ee_len);
2498         ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2499
2500         err = ext4_ext_get_access(handle, inode, path + depth);
2501         if (err)
2502                 goto out;
2503         /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
2504         if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
2505                 err =  ext4_ext_zeroout(inode, &orig_ex);
2506                 if (err)
2507                         goto fix_extent_len;
2508                 /* update the extent length and mark as initialized */
2509                 ex->ee_block = orig_ex.ee_block;
2510                 ex->ee_len   = orig_ex.ee_len;
2511                 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2512                 ext4_ext_dirty(handle, inode, path + depth);
2513                 /* zeroed the full extent */
2514                 return allocated;
2515         }
2516
2517         /* ex1: ee_block to iblock - 1 : uninitialized */
2518         if (iblock > ee_block) {
2519                 ex1 = ex;
2520                 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2521                 ext4_ext_mark_uninitialized(ex1);
2522                 ex2 = &newex;
2523         }
2524         /*
2525          * for sanity, update the length of the ex2 extent before
2526          * we insert ex3, if ex1 is NULL. This is to avoid temporary
2527          * overlap of blocks.
2528          */
2529         if (!ex1 && allocated > max_blocks)
2530                 ex2->ee_len = cpu_to_le16(max_blocks);
2531         /* ex3: to ee_block + ee_len : uninitialised */
2532         if (allocated > max_blocks) {
2533                 unsigned int newdepth;
2534                 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
2535                 if (allocated <= EXT4_EXT_ZERO_LEN) {
2536                         /*
2537                          * iblock == ee_block is handled by the zerouout
2538                          * at the beginning.
2539                          * Mark first half uninitialized.
2540                          * Mark second half initialized and zero out the
2541                          * initialized extent
2542                          */
2543                         ex->ee_block = orig_ex.ee_block;
2544                         ex->ee_len   = cpu_to_le16(ee_len - allocated);
2545                         ext4_ext_mark_uninitialized(ex);
2546                         ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2547                         ext4_ext_dirty(handle, inode, path + depth);
2548
2549                         ex3 = &newex;
2550                         ex3->ee_block = cpu_to_le32(iblock);
2551                         ext4_ext_store_pblock(ex3, newblock);
2552                         ex3->ee_len = cpu_to_le16(allocated);
2553                         err = ext4_ext_insert_extent(handle, inode, path, ex3);
2554                         if (err == -ENOSPC) {
2555                                 err =  ext4_ext_zeroout(inode, &orig_ex);
2556                                 if (err)
2557                                         goto fix_extent_len;
2558                                 ex->ee_block = orig_ex.ee_block;
2559                                 ex->ee_len   = orig_ex.ee_len;
2560                                 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2561                                 ext4_ext_dirty(handle, inode, path + depth);
2562                                 /* blocks available from iblock */
2563                                 return allocated;
2564
2565                         } else if (err)
2566                                 goto fix_extent_len;
2567
2568                         /*
2569                          * We need to zero out the second half because
2570                          * an fallocate request can update file size and
2571                          * converting the second half to initialized extent
2572                          * implies that we can leak some junk data to user
2573                          * space.
2574                          */
2575                         err =  ext4_ext_zeroout(inode, ex3);
2576                         if (err) {
2577                                 /*
2578                                  * We should actually mark the
2579                                  * second half as uninit and return error
2580                                  * Insert would have changed the extent
2581                                  */
2582                                 depth = ext_depth(inode);
2583                                 ext4_ext_drop_refs(path);
2584                                 path = ext4_ext_find_extent(inode,
2585                                                                 iblock, path);
2586                                 if (IS_ERR(path)) {
2587                                         err = PTR_ERR(path);
2588                                         return err;
2589                                 }
2590                                 /* get the second half extent details */
2591                                 ex = path[depth].p_ext;
2592                                 err = ext4_ext_get_access(handle, inode,
2593                                                                 path + depth);
2594                                 if (err)
2595                                         return err;
2596                                 ext4_ext_mark_uninitialized(ex);
2597                                 ext4_ext_dirty(handle, inode, path + depth);
2598                                 return err;
2599                         }
2600
2601                         /* zeroed the second half */
2602                         return allocated;
2603                 }
2604                 ex3 = &newex;
2605                 ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2606                 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2607                 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2608                 ext4_ext_mark_uninitialized(ex3);
2609                 err = ext4_ext_insert_extent(handle, inode, path, ex3);
2610                 if (err == -ENOSPC) {
2611                         err =  ext4_ext_zeroout(inode, &orig_ex);
2612                         if (err)
2613                                 goto fix_extent_len;
2614                         /* update the extent length and mark as initialized */
2615                         ex->ee_block = orig_ex.ee_block;
2616                         ex->ee_len   = orig_ex.ee_len;
2617                         ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2618                         ext4_ext_dirty(handle, inode, path + depth);
2619                         /* zeroed the full extent */
2620                         /* blocks available from iblock */
2621                         return allocated;
2622
2623                 } else if (err)
2624                         goto fix_extent_len;
2625                 /*
2626                  * The depth, and hence eh & ex might change
2627                  * as part of the insert above.
2628                  */
2629                 newdepth = ext_depth(inode);
2630                 /*
2631                  * update the extent length after successful insert of the
2632                  * split extent
2633                  */
2634                 orig_ex.ee_len = cpu_to_le16(ee_len -
2635                                                 ext4_ext_get_actual_len(ex3));
2636                 depth = newdepth;
2637                 ext4_ext_drop_refs(path);
2638                 path = ext4_ext_find_extent(inode, iblock, path);
2639                 if (IS_ERR(path)) {
2640                         err = PTR_ERR(path);
2641                         goto out;
2642                 }
2643                 eh = path[depth].p_hdr;
2644                 ex = path[depth].p_ext;
2645                 if (ex2 != &newex)
2646                         ex2 = ex;
2647
2648                 err = ext4_ext_get_access(handle, inode, path + depth);
2649                 if (err)
2650                         goto out;
2651
2652                 allocated = max_blocks;
2653
2654                 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
2655                  * to insert a extent in the middle zerout directly
2656                  * otherwise give the extent a chance to merge to left
2657                  */
2658                 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2659                                                         iblock != ee_block) {
2660                         err =  ext4_ext_zeroout(inode, &orig_ex);
2661                         if (err)
2662                                 goto fix_extent_len;
2663                         /* update the extent length and mark as initialized */
2664                         ex->ee_block = orig_ex.ee_block;
2665                         ex->ee_len   = orig_ex.ee_len;
2666                         ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2667                         ext4_ext_dirty(handle, inode, path + depth);
2668                         /* zero out the first half */
2669                         /* blocks available from iblock */
2670                         return allocated;
2671                 }
2672         }
2673         /*
2674          * If there was a change of depth as part of the
2675          * insertion of ex3 above, we need to update the length
2676          * of the ex1 extent again here
2677          */
2678         if (ex1 && ex1 != ex) {
2679                 ex1 = ex;
2680                 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2681                 ext4_ext_mark_uninitialized(ex1);
2682                 ex2 = &newex;
2683         }
2684         /* ex2: iblock to iblock + maxblocks-1 : initialised */
2685         ex2->ee_block = cpu_to_le32(iblock);
2686         ext4_ext_store_pblock(ex2, newblock);
2687         ex2->ee_len = cpu_to_le16(allocated);
2688         if (ex2 != ex)
2689                 goto insert;
2690         /*
2691          * New (initialized) extent starts from the first block
2692          * in the current extent. i.e., ex2 == ex
2693          * We have to see if it can be merged with the extent
2694          * on the left.
2695          */
2696         if (ex2 > EXT_FIRST_EXTENT(eh)) {
2697                 /*
2698                  * To merge left, pass "ex2 - 1" to try_to_merge(),
2699                  * since it merges towards right _only_.
2700                  */
2701                 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2702                 if (ret) {
2703                         err = ext4_ext_correct_indexes(handle, inode, path);
2704                         if (err)
2705                                 goto out;
2706                         depth = ext_depth(inode);
2707                         ex2--;
2708                 }
2709         }
2710         /*
2711          * Try to Merge towards right. This might be required
2712          * only when the whole extent is being written to.
2713          * i.e. ex2 == ex and ex3 == NULL.
2714          */
2715         if (!ex3) {
2716                 ret = ext4_ext_try_to_merge(inode, path, ex2);
2717                 if (ret) {
2718                         err = ext4_ext_correct_indexes(handle, inode, path);
2719                         if (err)
2720                                 goto out;
2721                 }
2722         }
2723         /* Mark modified extent as dirty */
2724         err = ext4_ext_dirty(handle, inode, path + depth);
2725         goto out;
2726 insert:
2727         err = ext4_ext_insert_extent(handle, inode, path, &newex);
2728         if (err == -ENOSPC) {
2729                 err =  ext4_ext_zeroout(inode, &orig_ex);
2730                 if (err)
2731                         goto fix_extent_len;
2732                 /* update the extent length and mark as initialized */
2733                 ex->ee_block = orig_ex.ee_block;
2734                 ex->ee_len   = orig_ex.ee_len;
2735                 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2736                 ext4_ext_dirty(handle, inode, path + depth);
2737                 /* zero out the first half */
2738                 return allocated;
2739         } else if (err)
2740                 goto fix_extent_len;
2741 out:
2742         return err ? err : allocated;
2743
2744 fix_extent_len:
2745         ex->ee_block = orig_ex.ee_block;
2746         ex->ee_len   = orig_ex.ee_len;
2747         ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2748         ext4_ext_mark_uninitialized(ex);
2749         ext4_ext_dirty(handle, inode, path + depth);
2750         return err;
2751 }
2752
2753 /*
2754  * Block allocation/map/preallocation routine for extents based files
2755  *
2756  *
2757  * Need to be called with
2758  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
2759  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
2760  *
2761  * return > 0, number of of blocks already mapped/allocated
2762  *          if create == 0 and these are pre-allocated blocks
2763  *              buffer head is unmapped
2764  *          otherwise blocks are mapped
2765  *
2766  * return = 0, if plain look up failed (blocks have not been allocated)
2767  *          buffer head is unmapped
2768  *
2769  * return < 0, error case.
2770  */
2771 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2772                         ext4_lblk_t iblock,
2773                         unsigned int max_blocks, struct buffer_head *bh_result,
2774                         int create, int extend_disksize)
2775 {
2776         struct ext4_ext_path *path = NULL;
2777         struct ext4_extent_header *eh;
2778         struct ext4_extent newex, *ex;
2779         ext4_fsblk_t newblock;
2780         int err = 0, depth, ret, cache_type;
2781         unsigned int allocated = 0;
2782         struct ext4_allocation_request ar;
2783         loff_t disksize;
2784
2785         __clear_bit(BH_New, &bh_result->b_state);
2786         ext_debug("blocks %u/%u requested for inode %u\n",
2787                         iblock, max_blocks, inode->i_ino);
2788
2789         /* check in cache */
2790         cache_type = ext4_ext_in_cache(inode, iblock, &newex);
2791         if (cache_type) {
2792                 if (cache_type == EXT4_EXT_CACHE_GAP) {
2793                         if (!create) {
2794                                 /*
2795                                  * block isn't allocated yet and
2796                                  * user doesn't want to allocate it
2797                                  */
2798                                 goto out2;
2799                         }
2800                         /* we should allocate requested block */
2801                 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
2802                         /* block is already allocated */
2803                         newblock = iblock
2804                                    - le32_to_cpu(newex.ee_block)
2805                                    + ext_pblock(&newex);
2806                         /* number of remaining blocks in the extent */
2807                         allocated = ext4_ext_get_actual_len(&newex) -
2808                                         (iblock - le32_to_cpu(newex.ee_block));
2809                         goto out;
2810                 } else {
2811                         BUG();
2812                 }
2813         }
2814
2815         /* find extent for this block */
2816         path = ext4_ext_find_extent(inode, iblock, NULL);
2817         if (IS_ERR(path)) {
2818                 err = PTR_ERR(path);
2819                 path = NULL;
2820                 goto out2;
2821         }
2822
2823         depth = ext_depth(inode);
2824
2825         /*
2826          * consistent leaf must not be empty;
2827          * this situation is possible, though, _during_ tree modification;
2828          * this is why assert can't be put in ext4_ext_find_extent()
2829          */
2830         BUG_ON(path[depth].p_ext == NULL && depth != 0);
2831         eh = path[depth].p_hdr;
2832
2833         ex = path[depth].p_ext;
2834         if (ex) {
2835                 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
2836                 ext4_fsblk_t ee_start = ext_pblock(ex);
2837                 unsigned short ee_len;
2838
2839                 /*
2840                  * Uninitialized extents are treated as holes, except that
2841                  * we split out initialized portions during a write.
2842                  */
2843                 ee_len = ext4_ext_get_actual_len(ex);
2844                 /* if found extent covers block, simply return it */
2845                 if (iblock >= ee_block && iblock < ee_block + ee_len) {
2846                         newblock = iblock - ee_block + ee_start;
2847                         /* number of remaining blocks in the extent */
2848                         allocated = ee_len - (iblock - ee_block);
2849                         ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
2850                                         ee_block, ee_len, newblock);
2851
2852                         /* Do not put uninitialized extent in the cache */
2853                         if (!ext4_ext_is_uninitialized(ex)) {
2854                                 ext4_ext_put_in_cache(inode, ee_block,
2855                                                         ee_len, ee_start,
2856                                                         EXT4_EXT_CACHE_EXTENT);
2857                                 goto out;
2858                         }
2859                         if (create == EXT4_CREATE_UNINITIALIZED_EXT)
2860                                 goto out;
2861                         if (!create) {
2862                                 /*
2863                                  * We have blocks reserved already.  We
2864                                  * return allocated blocks so that delalloc
2865                                  * won't do block reservation for us.  But
2866                                  * the buffer head will be unmapped so that
2867                                  * a read from the block returns 0s.
2868                                  */
2869                                 if (allocated > max_blocks)
2870                                         allocated = max_blocks;
2871                                 set_buffer_unwritten(bh_result);
2872                                 goto out2;
2873                         }
2874
2875                         ret = ext4_ext_convert_to_initialized(handle, inode,
2876                                                                 path, iblock,
2877                                                                 max_blocks);
2878                         if (ret <= 0) {
2879                                 err = ret;
2880                                 goto out2;
2881                         } else
2882                                 allocated = ret;
2883                         goto outnew;
2884                 }
2885         }
2886
2887         /*
2888          * requested block isn't allocated yet;
2889          * we couldn't try to create block if create flag is zero
2890          */
2891         if (!create) {
2892                 /*
2893                  * put just found gap into cache to speed up
2894                  * subsequent requests
2895                  */
2896                 ext4_ext_put_gap_in_cache(inode, path, iblock);
2897                 goto out2;
2898         }
2899         /*
2900          * Okay, we need to do block allocation.
2901          */
2902
2903         /* find neighbour allocated blocks */
2904         ar.lleft = iblock;
2905         err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
2906         if (err)
2907                 goto out2;
2908         ar.lright = iblock;
2909         err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
2910         if (err)
2911                 goto out2;
2912
2913         /*
2914          * See if request is beyond maximum number of blocks we can have in
2915          * a single extent. For an initialized extent this limit is
2916          * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
2917          * EXT_UNINIT_MAX_LEN.
2918          */
2919         if (max_blocks > EXT_INIT_MAX_LEN &&
2920             create != EXT4_CREATE_UNINITIALIZED_EXT)
2921                 max_blocks = EXT_INIT_MAX_LEN;
2922         else if (max_blocks > EXT_UNINIT_MAX_LEN &&
2923                  create == EXT4_CREATE_UNINITIALIZED_EXT)
2924                 max_blocks = EXT_UNINIT_MAX_LEN;
2925
2926         /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
2927         newex.ee_block = cpu_to_le32(iblock);
2928         newex.ee_len = cpu_to_le16(max_blocks);
2929         err = ext4_ext_check_overlap(inode, &newex, path);
2930         if (err)
2931                 allocated = ext4_ext_get_actual_len(&newex);
2932         else
2933                 allocated = max_blocks;
2934
2935         /* allocate new block */
2936         ar.inode = inode;
2937         ar.goal = ext4_ext_find_goal(inode, path, iblock);
2938         ar.logical = iblock;
2939         ar.len = allocated;
2940         if (S_ISREG(inode->i_mode))
2941                 ar.flags = EXT4_MB_HINT_DATA;
2942         else
2943                 /* disable in-core preallocation for non-regular files */
2944                 ar.flags = 0;
2945         newblock = ext4_mb_new_blocks(handle, &ar, &err);
2946         if (!newblock)
2947                 goto out2;
2948         ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2949                   ar.goal, newblock, allocated);
2950
2951         /* try to insert new extent into found leaf and return */
2952         ext4_ext_store_pblock(&newex, newblock);
2953         newex.ee_len = cpu_to_le16(ar.len);
2954         if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
2955                 ext4_ext_mark_uninitialized(&newex);
2956         err = ext4_ext_insert_extent(handle, inode, path, &newex);
2957         if (err) {
2958                 /* free data blocks we just allocated */
2959                 /* not a good idea to call discard here directly,
2960                  * but otherwise we'd need to call it every free() */
2961                 ext4_discard_preallocations(inode);
2962                 ext4_free_blocks(handle, inode, ext_pblock(&newex),
2963                                         ext4_ext_get_actual_len(&newex), 0);
2964                 goto out2;
2965         }
2966
2967         /* previous routine could use block we allocated */
2968         newblock = ext_pblock(&newex);
2969         allocated = ext4_ext_get_actual_len(&newex);
2970 outnew:
2971         if (extend_disksize) {
2972                 disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
2973                 if (disksize > i_size_read(inode))
2974                         disksize = i_size_read(inode);
2975                 if (disksize > EXT4_I(inode)->i_disksize)
2976                         EXT4_I(inode)->i_disksize = disksize;
2977         }
2978
2979         set_buffer_new(bh_result);
2980
2981         /* Cache only when it is _not_ an uninitialized extent */
2982         if (create != EXT4_CREATE_UNINITIALIZED_EXT)
2983                 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2984                                                 EXT4_EXT_CACHE_EXTENT);
2985 out:
2986         if (allocated > max_blocks)
2987                 allocated = max_blocks;
2988         ext4_ext_show_leaf(inode, path);
2989         set_buffer_mapped(bh_result);
2990         bh_result->b_bdev = inode->i_sb->s_bdev;
2991         bh_result->b_blocknr = newblock;
2992 out2:
2993         if (path) {
2994                 ext4_ext_drop_refs(path);
2995                 kfree(path);
2996         }
2997         return err ? err : allocated;
2998 }
2999
3000 void ext4_ext_truncate(struct inode *inode)
3001 {
3002         struct address_space *mapping = inode->i_mapping;
3003         struct super_block *sb = inode->i_sb;
3004         ext4_lblk_t last_block;
3005         handle_t *handle;
3006         int err = 0;
3007
3008         /*
3009          * probably first extent we're gonna free will be last in block
3010          */
3011         err = ext4_writepage_trans_blocks(inode);
3012         handle = ext4_journal_start(inode, err);
3013         if (IS_ERR(handle))
3014                 return;
3015
3016         if (inode->i_size & (sb->s_blocksize - 1))
3017                 ext4_block_truncate_page(handle, mapping, inode->i_size);
3018
3019         if (ext4_orphan_add(handle, inode))
3020                 goto out_stop;
3021
3022         down_write(&EXT4_I(inode)->i_data_sem);
3023         ext4_ext_invalidate_cache(inode);
3024
3025         ext4_discard_preallocations(inode);
3026
3027         /*
3028          * TODO: optimization is possible here.
3029          * Probably we need not scan at all,
3030          * because page truncation is enough.
3031          */
3032
3033         /* we have to know where to truncate from in crash case */
3034         EXT4_I(inode)->i_disksize = inode->i_size;
3035         ext4_mark_inode_dirty(handle, inode);
3036
3037         last_block = (inode->i_size + sb->s_blocksize - 1)
3038                         >> EXT4_BLOCK_SIZE_BITS(sb);
3039         err = ext4_ext_remove_space(inode, last_block);
3040
3041         /* In a multi-transaction truncate, we only make the final
3042          * transaction synchronous.
3043          */
3044         if (IS_SYNC(inode))
3045                 ext4_handle_sync(handle);
3046
3047 out_stop:
3048         up_write(&EXT4_I(inode)->i_data_sem);
3049         /*
3050          * If this was a simple ftruncate() and the file will remain alive,
3051          * then we need to clear up the orphan record which we created above.
3052          * However, if this was a real unlink then we were called by
3053          * ext4_delete_inode(), and we allow that function to clean up the
3054          * orphan info for us.
3055          */
3056         if (inode->i_nlink)
3057                 ext4_orphan_del(handle, inode);
3058
3059         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3060         ext4_mark_inode_dirty(handle, inode);
3061         ext4_journal_stop(handle);
3062 }
3063
3064 static void ext4_falloc_update_inode(struct inode *inode,
3065                                 int mode, loff_t new_size, int update_ctime)
3066 {
3067         struct timespec now;
3068
3069         if (update_ctime) {
3070                 now = current_fs_time(inode->i_sb);
3071                 if (!timespec_equal(&inode->i_ctime, &now))
3072                         inode->i_ctime = now;
3073         }
3074         /*
3075          * Update only when preallocation was requested beyond
3076          * the file size.
3077          */
3078         if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3079                 if (new_size > i_size_read(inode))
3080                         i_size_write(inode, new_size);
3081                 if (new_size > EXT4_I(inode)->i_disksize)
3082                         ext4_update_i_disksize(inode, new_size);
3083         }
3084
3085 }
3086
3087 /*
3088  * preallocate space for a file. This implements ext4's fallocate inode
3089  * operation, which gets called from sys_fallocate system call.
3090  * For block-mapped files, posix_fallocate should fall back to the method
3091  * of writing zeroes to the required new blocks (the same behavior which is
3092  * expected for file systems which do not support fallocate() system call).
3093  */
3094 long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3095 {
3096         handle_t *handle;
3097         ext4_lblk_t block;
3098         loff_t new_size;
3099         unsigned int max_blocks;
3100         int ret = 0;
3101         int ret2 = 0;
3102         int retries = 0;
3103         struct buffer_head map_bh;
3104         unsigned int credits, blkbits = inode->i_blkbits;
3105
3106         /*
3107          * currently supporting (pre)allocate mode for extent-based
3108          * files _only_
3109          */
3110         if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3111                 return -EOPNOTSUPP;
3112
3113         /* preallocation to directories is currently not supported */
3114         if (S_ISDIR(inode->i_mode))
3115                 return -ENODEV;
3116
3117         block = offset >> blkbits;
3118         /*
3119          * We can't just convert len to max_blocks because
3120          * If blocksize = 4096 offset = 3072 and len = 2048
3121          */
3122         max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3123                                                         - block;
3124         /*
3125          * credits to insert 1 extent into extent tree
3126          */
3127         credits = ext4_chunk_trans_blocks(inode, max_blocks);
3128         mutex_lock(&inode->i_mutex);
3129 retry:
3130         while (ret >= 0 && ret < max_blocks) {
3131                 block = block + ret;
3132                 max_blocks = max_blocks - ret;
3133                 handle = ext4_journal_start(inode, credits);
3134                 if (IS_ERR(handle)) {
3135                         ret = PTR_ERR(handle);
3136                         break;
3137                 }
3138                 ret = ext4_get_blocks_wrap(handle, inode, block,
3139                                           max_blocks, &map_bh,
3140                                           EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
3141                 if (ret <= 0) {
3142 #ifdef EXT4FS_DEBUG
3143                         WARN_ON(ret <= 0);
3144                         printk(KERN_ERR "%s: ext4_ext_get_blocks "
3145                                     "returned error inode#%lu, block=%u, "
3146                                     "max_blocks=%u", __func__,
3147                                     inode->i_ino, block, max_blocks);
3148 #endif
3149                         ext4_mark_inode_dirty(handle, inode);
3150                         ret2 = ext4_journal_stop(handle);
3151                         break;
3152                 }
3153                 if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3154                                                 blkbits) >> blkbits))
3155                         new_size = offset + len;
3156                 else
3157                         new_size = (block + ret) << blkbits;
3158
3159                 ext4_falloc_update_inode(inode, mode, new_size,
3160                                                 buffer_new(&map_bh));
3161                 ext4_mark_inode_dirty(handle, inode);
3162                 ret2 = ext4_journal_stop(handle);
3163                 if (ret2)
3164                         break;
3165         }
3166         if (ret == -ENOSPC &&
3167                         ext4_should_retry_alloc(inode->i_sb, &retries)) {
3168                 ret = 0;
3169                 goto retry;
3170         }
3171         mutex_unlock(&inode->i_mutex);
3172         return ret > 0 ? ret2 : ret;
3173 }
3174
3175 /*
3176  * Callback function called for each extent to gather FIEMAP information.
3177  */
3178 static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3179                        struct ext4_ext_cache *newex, struct ext4_extent *ex,
3180                        void *data)
3181 {
3182         struct fiemap_extent_info *fieinfo = data;
3183         unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
3184         __u64   logical;
3185         __u64   physical;
3186         __u64   length;
3187         __u32   flags = 0;
3188         int     error;
3189
3190         logical =  (__u64)newex->ec_block << blksize_bits;
3191
3192         if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3193                 pgoff_t offset;
3194                 struct page *page;
3195                 struct buffer_head *bh = NULL;
3196
3197                 offset = logical >> PAGE_SHIFT;
3198                 page = find_get_page(inode->i_mapping, offset);
3199                 if (!page || !page_has_buffers(page))
3200                         return EXT_CONTINUE;
3201
3202                 bh = page_buffers(page);
3203
3204                 if (!bh)
3205                         return EXT_CONTINUE;
3206
3207                 if (buffer_delay(bh)) {
3208                         flags |= FIEMAP_EXTENT_DELALLOC;
3209                         page_cache_release(page);
3210                 } else {
3211                         page_cache_release(page);
3212                         return EXT_CONTINUE;
3213                 }
3214         }
3215
3216         physical = (__u64)newex->ec_start << blksize_bits;
3217         length =   (__u64)newex->ec_len << blksize_bits;
3218
3219         if (ex && ext4_ext_is_uninitialized(ex))
3220                 flags |= FIEMAP_EXTENT_UNWRITTEN;
3221
3222         /*
3223          * If this extent reaches EXT_MAX_BLOCK, it must be last.
3224          *
3225          * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3226          * this also indicates no more allocated blocks.
3227          *
3228          * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3229          */
3230         if (logical + length - 1 == EXT_MAX_BLOCK ||
3231             ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
3232                 flags |= FIEMAP_EXTENT_LAST;
3233
3234         error = fiemap_fill_next_extent(fieinfo, logical, physical,
3235                                         length, flags);
3236         if (error < 0)
3237                 return error;
3238         if (error == 1)
3239                 return EXT_BREAK;
3240
3241         return EXT_CONTINUE;
3242 }
3243
3244 /* fiemap flags we can handle specified here */
3245 #define EXT4_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3246
3247 static int ext4_xattr_fiemap(struct inode *inode,
3248                                 struct fiemap_extent_info *fieinfo)
3249 {
3250         __u64 physical = 0;
3251         __u64 length;
3252         __u32 flags = FIEMAP_EXTENT_LAST;
3253         int blockbits = inode->i_sb->s_blocksize_bits;
3254         int error = 0;
3255
3256         /* in-inode? */
3257         if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
3258                 struct ext4_iloc iloc;
3259                 int offset;     /* offset of xattr in inode */
3260
3261                 error = ext4_get_inode_loc(inode, &iloc);
3262                 if (error)
3263                         return error;
3264                 physical = iloc.bh->b_blocknr << blockbits;
3265                 offset = EXT4_GOOD_OLD_INODE_SIZE +
3266                                 EXT4_I(inode)->i_extra_isize;
3267                 physical += offset;
3268                 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3269                 flags |= FIEMAP_EXTENT_DATA_INLINE;
3270         } else { /* external block */
3271                 physical = EXT4_I(inode)->i_file_acl << blockbits;
3272                 length = inode->i_sb->s_blocksize;
3273         }
3274
3275         if (physical)
3276                 error = fiemap_fill_next_extent(fieinfo, 0, physical,
3277                                                 length, flags);
3278         return (error < 0 ? error : 0);
3279 }
3280
3281 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3282                 __u64 start, __u64 len)
3283 {
3284         ext4_lblk_t start_blk;
3285         ext4_lblk_t len_blks;
3286         int error = 0;
3287
3288         /* fallback to generic here if not in extents fmt */
3289         if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3290                 return generic_block_fiemap(inode, fieinfo, start, len,
3291                         ext4_get_block);
3292
3293         if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3294                 return -EBADR;
3295
3296         if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3297                 error = ext4_xattr_fiemap(inode, fieinfo);
3298         } else {
3299                 start_blk = start >> inode->i_sb->s_blocksize_bits;
3300                 len_blks = len >> inode->i_sb->s_blocksize_bits;
3301
3302                 /*
3303                  * Walk the extent tree gathering extent information.
3304                  * ext4_ext_fiemap_cb will push extents back to user.
3305                  */
3306                 down_write(&EXT4_I(inode)->i_data_sem);
3307                 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3308                                           ext4_ext_fiemap_cb, fieinfo);
3309                 up_write(&EXT4_I(inode)->i_data_sem);
3310         }
3311
3312         return error;
3313 }
3314