Merge branch 's5p-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[sfrench/cifs-2.6.git] / fs / ocfs2 / alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 #include <linux/quotaops.h>
32
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "blockcheck.h"
40 #include "dlmglue.h"
41 #include "extent_map.h"
42 #include "inode.h"
43 #include "journal.h"
44 #include "localalloc.h"
45 #include "suballoc.h"
46 #include "sysfile.h"
47 #include "file.h"
48 #include "super.h"
49 #include "uptodate.h"
50 #include "xattr.h"
51 #include "refcounttree.h"
52 #include "ocfs2_trace.h"
53
54 #include "buffer_head_io.h"
55
/*
 * Result of comparing an existing extent record against a record that is
 * about to be inserted.  CONTIG_NONE means the two cannot be coalesced.
 */
enum ocfs2_contig_type {
	CONTIG_NONE		= 0,
	CONTIG_LEFT		= 1,
	CONTIG_RIGHT		= 2,
	CONTIG_LEFTRIGHT	= 3,
};
62
63 static enum ocfs2_contig_type
64         ocfs2_extent_rec_contig(struct super_block *sb,
65                                 struct ocfs2_extent_rec *ext,
66                                 struct ocfs2_extent_rec *insert_rec);
67 /*
68  * Operations for a specific extent tree type.
69  *
70  * To implement an on-disk btree (extent tree) type in ocfs2, add
71  * an ocfs2_extent_tree_operations structure and the matching
72  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
73  * for the allocation portion of the extent tree.
74  */
75 struct ocfs2_extent_tree_operations {
76         /*
77          * last_eb_blk is the block number of the right most leaf extent
78          * block.  Most on-disk structures containing an extent tree store
79          * this value for fast access.  The ->eo_set_last_eb_blk() and
80          * ->eo_get_last_eb_blk() operations access this value.  They are
81          *  both required.
82          */
83         void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
84                                    u64 blkno);
85         u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
86
87         /*
88          * The on-disk structure usually keeps track of how many total
89          * clusters are stored in this extent tree.  This function updates
90          * that value.  new_clusters is the delta, and must be
91          * added to the total.  Required.
92          */
93         void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
94                                    u32 new_clusters);
95
96         /*
97          * If this extent tree is supported by an extent map, insert
98          * a record into the map.
99          */
100         void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
101                                      struct ocfs2_extent_rec *rec);
102
103         /*
104          * If this extent tree is supported by an extent map, truncate the
105          * map to clusters,
106          */
107         void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
108                                        u32 clusters);
109
110         /*
111          * If ->eo_insert_check() exists, it is called before rec is
112          * inserted into the extent tree.  It is optional.
113          */
114         int (*eo_insert_check)(struct ocfs2_extent_tree *et,
115                                struct ocfs2_extent_rec *rec);
116         int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
117
118         /*
119          * --------------------------------------------------------------
120          * The remaining are internal to ocfs2_extent_tree and don't have
121          * accessor functions
122          */
123
124         /*
125          * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
126          * It is required.
127          */
128         void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
129
130         /*
131          * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
132          * it exists.  If it does not, et->et_max_leaf_clusters is set
133          * to 0 (unlimited).  Optional.
134          */
135         void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
136
137         /*
138          * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
139          * are contiguous or not. Optional. Don't need to set it if use
140          * ocfs2_extent_rec as the tree leaf.
141          */
142         enum ocfs2_contig_type
143                 (*eo_extent_contig)(struct ocfs2_extent_tree *et,
144                                     struct ocfs2_extent_rec *ext,
145                                     struct ocfs2_extent_rec *insert_rec);
146 };
147
148
149 /*
150  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
151  * in the methods.
152  */
153 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
154 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
155                                          u64 blkno);
156 static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
157                                          u32 clusters);
158 static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
159                                            struct ocfs2_extent_rec *rec);
160 static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
161                                              u32 clusters);
162 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
163                                      struct ocfs2_extent_rec *rec);
164 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
165 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
166 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
167         .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
168         .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
169         .eo_update_clusters     = ocfs2_dinode_update_clusters,
170         .eo_extent_map_insert   = ocfs2_dinode_extent_map_insert,
171         .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
172         .eo_insert_check        = ocfs2_dinode_insert_check,
173         .eo_sanity_check        = ocfs2_dinode_sanity_check,
174         .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
175 };
176
177 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
178                                          u64 blkno)
179 {
180         struct ocfs2_dinode *di = et->et_object;
181
182         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
183         di->i_last_eb_blk = cpu_to_le64(blkno);
184 }
185
186 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
187 {
188         struct ocfs2_dinode *di = et->et_object;
189
190         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
191         return le64_to_cpu(di->i_last_eb_blk);
192 }
193
194 static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
195                                          u32 clusters)
196 {
197         struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
198         struct ocfs2_dinode *di = et->et_object;
199
200         le32_add_cpu(&di->i_clusters, clusters);
201         spin_lock(&oi->ip_lock);
202         oi->ip_clusters = le32_to_cpu(di->i_clusters);
203         spin_unlock(&oi->ip_lock);
204 }
205
206 static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
207                                            struct ocfs2_extent_rec *rec)
208 {
209         struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
210
211         ocfs2_extent_map_insert_rec(inode, rec);
212 }
213
214 static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
215                                              u32 clusters)
216 {
217         struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
218
219         ocfs2_extent_map_trunc(inode, clusters);
220 }
221
222 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
223                                      struct ocfs2_extent_rec *rec)
224 {
225         struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
226         struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
227
228         BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
229         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
230                         (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
231                         "Device %s, asking for sparse allocation: inode %llu, "
232                         "cpos %u, clusters %u\n",
233                         osb->dev_str,
234                         (unsigned long long)oi->ip_blkno,
235                         rec->e_cpos, oi->ip_clusters);
236
237         return 0;
238 }
239
240 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
241 {
242         struct ocfs2_dinode *di = et->et_object;
243
244         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
245         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
246
247         return 0;
248 }
249
250 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
251 {
252         struct ocfs2_dinode *di = et->et_object;
253
254         et->et_root_el = &di->id2.i_list;
255 }
256
257
258 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
259 {
260         struct ocfs2_xattr_value_buf *vb = et->et_object;
261
262         et->et_root_el = &vb->vb_xv->xr_list;
263 }
264
265 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
266                                               u64 blkno)
267 {
268         struct ocfs2_xattr_value_buf *vb = et->et_object;
269
270         vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
271 }
272
273 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
274 {
275         struct ocfs2_xattr_value_buf *vb = et->et_object;
276
277         return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
278 }
279
280 static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
281                                               u32 clusters)
282 {
283         struct ocfs2_xattr_value_buf *vb = et->et_object;
284
285         le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
286 }
287
288 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
289         .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
290         .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
291         .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
292         .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
293 };
294
295 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
296 {
297         struct ocfs2_xattr_block *xb = et->et_object;
298
299         et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
300 }
301
302 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
303 {
304         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
305         et->et_max_leaf_clusters =
306                 ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
307 }
308
309 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
310                                              u64 blkno)
311 {
312         struct ocfs2_xattr_block *xb = et->et_object;
313         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
314
315         xt->xt_last_eb_blk = cpu_to_le64(blkno);
316 }
317
318 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
319 {
320         struct ocfs2_xattr_block *xb = et->et_object;
321         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
322
323         return le64_to_cpu(xt->xt_last_eb_blk);
324 }
325
326 static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
327                                              u32 clusters)
328 {
329         struct ocfs2_xattr_block *xb = et->et_object;
330
331         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
332 }
333
334 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
335         .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
336         .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
337         .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
338         .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
339         .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
340 };
341
342 static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
343                                           u64 blkno)
344 {
345         struct ocfs2_dx_root_block *dx_root = et->et_object;
346
347         dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
348 }
349
350 static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
351 {
352         struct ocfs2_dx_root_block *dx_root = et->et_object;
353
354         return le64_to_cpu(dx_root->dr_last_eb_blk);
355 }
356
357 static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
358                                           u32 clusters)
359 {
360         struct ocfs2_dx_root_block *dx_root = et->et_object;
361
362         le32_add_cpu(&dx_root->dr_clusters, clusters);
363 }
364
365 static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
366 {
367         struct ocfs2_dx_root_block *dx_root = et->et_object;
368
369         BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
370
371         return 0;
372 }
373
374 static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
375 {
376         struct ocfs2_dx_root_block *dx_root = et->et_object;
377
378         et->et_root_el = &dx_root->dr_list;
379 }
380
381 static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
382         .eo_set_last_eb_blk     = ocfs2_dx_root_set_last_eb_blk,
383         .eo_get_last_eb_blk     = ocfs2_dx_root_get_last_eb_blk,
384         .eo_update_clusters     = ocfs2_dx_root_update_clusters,
385         .eo_sanity_check        = ocfs2_dx_root_sanity_check,
386         .eo_fill_root_el        = ocfs2_dx_root_fill_root_el,
387 };
388
389 static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
390 {
391         struct ocfs2_refcount_block *rb = et->et_object;
392
393         et->et_root_el = &rb->rf_list;
394 }
395
396 static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
397                                                 u64 blkno)
398 {
399         struct ocfs2_refcount_block *rb = et->et_object;
400
401         rb->rf_last_eb_blk = cpu_to_le64(blkno);
402 }
403
404 static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
405 {
406         struct ocfs2_refcount_block *rb = et->et_object;
407
408         return le64_to_cpu(rb->rf_last_eb_blk);
409 }
410
411 static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
412                                                 u32 clusters)
413 {
414         struct ocfs2_refcount_block *rb = et->et_object;
415
416         le32_add_cpu(&rb->rf_clusters, clusters);
417 }
418
419 static enum ocfs2_contig_type
420 ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
421                                   struct ocfs2_extent_rec *ext,
422                                   struct ocfs2_extent_rec *insert_rec)
423 {
424         return CONTIG_NONE;
425 }
426
427 static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
428         .eo_set_last_eb_blk     = ocfs2_refcount_tree_set_last_eb_blk,
429         .eo_get_last_eb_blk     = ocfs2_refcount_tree_get_last_eb_blk,
430         .eo_update_clusters     = ocfs2_refcount_tree_update_clusters,
431         .eo_fill_root_el        = ocfs2_refcount_tree_fill_root_el,
432         .eo_extent_contig       = ocfs2_refcount_tree_extent_contig,
433 };
434
435 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
436                                      struct ocfs2_caching_info *ci,
437                                      struct buffer_head *bh,
438                                      ocfs2_journal_access_func access,
439                                      void *obj,
440                                      struct ocfs2_extent_tree_operations *ops)
441 {
442         et->et_ops = ops;
443         et->et_root_bh = bh;
444         et->et_ci = ci;
445         et->et_root_journal_access = access;
446         if (!obj)
447                 obj = (void *)bh->b_data;
448         et->et_object = obj;
449
450         et->et_ops->eo_fill_root_el(et);
451         if (!et->et_ops->eo_fill_max_leaf_clusters)
452                 et->et_max_leaf_clusters = 0;
453         else
454                 et->et_ops->eo_fill_max_leaf_clusters(et);
455 }
456
457 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
458                                    struct ocfs2_caching_info *ci,
459                                    struct buffer_head *bh)
460 {
461         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
462                                  NULL, &ocfs2_dinode_et_ops);
463 }
464
465 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
466                                        struct ocfs2_caching_info *ci,
467                                        struct buffer_head *bh)
468 {
469         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
470                                  NULL, &ocfs2_xattr_tree_et_ops);
471 }
472
473 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
474                                         struct ocfs2_caching_info *ci,
475                                         struct ocfs2_xattr_value_buf *vb)
476 {
477         __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
478                                  &ocfs2_xattr_value_et_ops);
479 }
480
481 void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
482                                     struct ocfs2_caching_info *ci,
483                                     struct buffer_head *bh)
484 {
485         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
486                                  NULL, &ocfs2_dx_root_et_ops);
487 }
488
489 void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
490                                      struct ocfs2_caching_info *ci,
491                                      struct buffer_head *bh)
492 {
493         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
494                                  NULL, &ocfs2_refcount_tree_et_ops);
495 }
496
497 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
498                                             u64 new_last_eb_blk)
499 {
500         et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
501 }
502
503 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
504 {
505         return et->et_ops->eo_get_last_eb_blk(et);
506 }
507
508 static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
509                                             u32 clusters)
510 {
511         et->et_ops->eo_update_clusters(et, clusters);
512 }
513
514 static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
515                                               struct ocfs2_extent_rec *rec)
516 {
517         if (et->et_ops->eo_extent_map_insert)
518                 et->et_ops->eo_extent_map_insert(et, rec);
519 }
520
521 static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
522                                                 u32 clusters)
523 {
524         if (et->et_ops->eo_extent_map_truncate)
525                 et->et_ops->eo_extent_map_truncate(et, clusters);
526 }
527
528 static inline int ocfs2_et_root_journal_access(handle_t *handle,
529                                                struct ocfs2_extent_tree *et,
530                                                int type)
531 {
532         return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
533                                           type);
534 }
535
536 static inline enum ocfs2_contig_type
537         ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
538                                struct ocfs2_extent_rec *rec,
539                                struct ocfs2_extent_rec *insert_rec)
540 {
541         if (et->et_ops->eo_extent_contig)
542                 return et->et_ops->eo_extent_contig(et, rec, insert_rec);
543
544         return ocfs2_extent_rec_contig(
545                                 ocfs2_metadata_cache_get_super(et->et_ci),
546                                 rec, insert_rec);
547 }
548
549 static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
550                                         struct ocfs2_extent_rec *rec)
551 {
552         int ret = 0;
553
554         if (et->et_ops->eo_insert_check)
555                 ret = et->et_ops->eo_insert_check(et, rec);
556         return ret;
557 }
558
559 static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
560 {
561         int ret = 0;
562
563         if (et->et_ops->eo_sanity_check)
564                 ret = et->et_ops->eo_sanity_check(et);
565         return ret;
566 }
567
568 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
569                                          struct ocfs2_extent_block *eb);
570 static void ocfs2_adjust_rightmost_records(handle_t *handle,
571                                            struct ocfs2_extent_tree *et,
572                                            struct ocfs2_path *path,
573                                            struct ocfs2_extent_rec *insert_rec);
574 /*
575  * Reset the actual path elements so that we can re-use the structure
576  * to build another path. Generally, this involves freeing the buffer
577  * heads.
578  */
579 void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
580 {
581         int i, start = 0, depth = 0;
582         struct ocfs2_path_item *node;
583
584         if (keep_root)
585                 start = 1;
586
587         for(i = start; i < path_num_items(path); i++) {
588                 node = &path->p_node[i];
589
590                 brelse(node->bh);
591                 node->bh = NULL;
592                 node->el = NULL;
593         }
594
595         /*
596          * Tree depth may change during truncate, or insert. If we're
597          * keeping the root extent list, then make sure that our path
598          * structure reflects the proper depth.
599          */
600         if (keep_root)
601                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
602         else
603                 path_root_access(path) = NULL;
604
605         path->p_tree_depth = depth;
606 }
607
/*
 * Release every element of path (root included) and free the structure.
 * A NULL path is accepted and ignored.
 */
void ocfs2_free_path(struct ocfs2_path *path)
{
	if (!path)
		return;

	ocfs2_reinit_path(path, 0);
	kfree(path);
}
615
616 /*
617  * All the elements of src into dest. After this call, src could be freed
618  * without affecting dest.
619  *
620  * Both paths should have the same root. Any non-root elements of dest
621  * will be freed.
622  */
623 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
624 {
625         int i;
626
627         BUG_ON(path_root_bh(dest) != path_root_bh(src));
628         BUG_ON(path_root_el(dest) != path_root_el(src));
629         BUG_ON(path_root_access(dest) != path_root_access(src));
630
631         ocfs2_reinit_path(dest, 1);
632
633         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
634                 dest->p_node[i].bh = src->p_node[i].bh;
635                 dest->p_node[i].el = src->p_node[i].el;
636
637                 if (dest->p_node[i].bh)
638                         get_bh(dest->p_node[i].bh);
639         }
640 }
641
642 /*
643  * Make the *dest path the same as src and re-initialize src path to
644  * have a root only.
645  */
646 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
647 {
648         int i;
649
650         BUG_ON(path_root_bh(dest) != path_root_bh(src));
651         BUG_ON(path_root_access(dest) != path_root_access(src));
652
653         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
654                 brelse(dest->p_node[i].bh);
655
656                 dest->p_node[i].bh = src->p_node[i].bh;
657                 dest->p_node[i].el = src->p_node[i].el;
658
659                 src->p_node[i].bh = NULL;
660                 src->p_node[i].el = NULL;
661         }
662 }
663
664 /*
665  * Insert an extent block at given index.
666  *
667  * This will not take an additional reference on eb_bh.
668  */
669 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
670                                         struct buffer_head *eb_bh)
671 {
672         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
673
674         /*
675          * Right now, no root bh is an extent block, so this helps
676          * catch code errors with dinode trees. The assertion can be
677          * safely removed if we ever need to insert extent block
678          * structures at the root.
679          */
680         BUG_ON(index == 0);
681
682         path->p_node[index].bh = eb_bh;
683         path->p_node[index].el = &eb->h_list;
684 }
685
686 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
687                                          struct ocfs2_extent_list *root_el,
688                                          ocfs2_journal_access_func access)
689 {
690         struct ocfs2_path *path;
691
692         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
693
694         path = kzalloc(sizeof(*path), GFP_NOFS);
695         if (path) {
696                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
697                 get_bh(root_bh);
698                 path_root_bh(path) = root_bh;
699                 path_root_el(path) = root_el;
700                 path_root_access(path) = access;
701         }
702
703         return path;
704 }
705
/*
 * Allocate a new path that shares the root (bh, extent list and access
 * function) of an existing path.  Returns NULL on allocation failure.
 */
struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
	return ocfs2_new_path(path_root_bh(path),
			      path_root_el(path),
			      path_root_access(path));
}
711
712 struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
713 {
714         return ocfs2_new_path(et->et_root_bh, et->et_root_el,
715                               et->et_root_journal_access);
716 }
717
718 /*
719  * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
720  * otherwise it's the root_access function.
721  *
722  * I don't like the way this function's name looks next to
723  * ocfs2_journal_access_path(), but I don't have a better one.
724  */
725 int ocfs2_path_bh_journal_access(handle_t *handle,
726                                  struct ocfs2_caching_info *ci,
727                                  struct ocfs2_path *path,
728                                  int idx)
729 {
730         ocfs2_journal_access_func access = path_root_access(path);
731
732         if (!access)
733                 access = ocfs2_journal_access;
734
735         if (idx)
736                 access = ocfs2_journal_access_eb;
737
738         return access(handle, ci, path->p_node[idx].bh,
739                       OCFS2_JOURNAL_ACCESS_WRITE);
740 }
741
742 /*
743  * Convenience function to journal all components in a path.
744  */
745 int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
746                               handle_t *handle,
747                               struct ocfs2_path *path)
748 {
749         int i, ret = 0;
750
751         if (!path)
752                 goto out;
753
754         for(i = 0; i < path_num_items(path); i++) {
755                 ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
756                 if (ret < 0) {
757                         mlog_errno(ret);
758                         goto out;
759                 }
760         }
761
762 out:
763         return ret;
764 }
765
766 /*
767  * Return the index of the extent record which contains cluster #v_cluster.
768  * -1 is returned if it was not found.
769  *
770  * Should work fine on interior and exterior nodes.
771  */
772 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
773 {
774         int ret = -1;
775         int i;
776         struct ocfs2_extent_rec *rec;
777         u32 rec_end, rec_start, clusters;
778
779         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
780                 rec = &el->l_recs[i];
781
782                 rec_start = le32_to_cpu(rec->e_cpos);
783                 clusters = ocfs2_rec_clusters(el, rec);
784
785                 rec_end = rec_start + clusters;
786
787                 if (v_cluster >= rec_start && v_cluster < rec_end) {
788                         ret = i;
789                         break;
790                 }
791         }
792
793         return ret;
794 }
795
796 /*
797  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
798  * ocfs2_extent_rec_contig only work properly against leaf nodes!
799  */
800 static int ocfs2_block_extent_contig(struct super_block *sb,
801                                      struct ocfs2_extent_rec *ext,
802                                      u64 blkno)
803 {
804         u64 blk_end = le64_to_cpu(ext->e_blkno);
805
806         blk_end += ocfs2_clusters_to_blocks(sb,
807                                     le16_to_cpu(ext->e_leaf_clusters));
808
809         return blkno == blk_end;
810 }
811
812 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
813                                   struct ocfs2_extent_rec *right)
814 {
815         u32 left_range;
816
817         left_range = le32_to_cpu(left->e_cpos) +
818                 le16_to_cpu(left->e_leaf_clusters);
819
820         return (left_range == le32_to_cpu(right->e_cpos));
821 }
822
823 static enum ocfs2_contig_type
824         ocfs2_extent_rec_contig(struct super_block *sb,
825                                 struct ocfs2_extent_rec *ext,
826                                 struct ocfs2_extent_rec *insert_rec)
827 {
828         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
829
830         /*
831          * Refuse to coalesce extent records with different flag
832          * fields - we don't want to mix unwritten extents with user
833          * data.
834          */
835         if (ext->e_flags != insert_rec->e_flags)
836                 return CONTIG_NONE;
837
838         if (ocfs2_extents_adjacent(ext, insert_rec) &&
839             ocfs2_block_extent_contig(sb, ext, blkno))
840                         return CONTIG_RIGHT;
841
842         blkno = le64_to_cpu(ext->e_blkno);
843         if (ocfs2_extents_adjacent(insert_rec, ext) &&
844             ocfs2_block_extent_contig(sb, insert_rec, blkno))
845                 return CONTIG_LEFT;
846
847         return CONTIG_NONE;
848 }
849
/*
 * NOTE: Pretty much any combination of contiguousness and appending
 * can occur.
 *
 * APPEND_TAIL is mostly useful because it tells us that the path to
 * that leaf will have to be updated.
 */
enum ocfs2_append_type {
	APPEND_NONE = 0,
	APPEND_TAIL = 1,
};
861
/* Which side, if any, of an existing record is being split. */
enum ocfs2_split_type {
	SPLIT_NONE  = 0,
	SPLIT_LEFT  = 1,
	SPLIT_RIGHT = 2,
};
867
/*
 * Bundles the split, append and contiguity classifications together
 * with a record index and a tree depth.
 *
 * NOTE(review): the code that fills in and consumes this structure is
 * not visible in this part of the file; the per-field notes below are
 * inferred from the field names and types - confirm against the insert
 * path.
 */
struct ocfs2_insert_type {
	enum ocfs2_split_type	ins_split;	/* presumably: split needed, and on which side */
	enum ocfs2_append_type	ins_appending;	/* presumably: whether this is a tail append */
	enum ocfs2_contig_type	ins_contig;	/* presumably: contiguity with an existing record */
	int			ins_contig_index; /* presumably: index of that contiguous record */
	int			ins_tree_depth;	/* presumably: tree depth at the time of insert */
};
875
/*
 * Context describing a prospective record merge.
 *
 * NOTE(review): the users of this structure are outside this part of
 * the file; the per-field notes are inferred from names only - confirm
 * against the merge/split code.
 */
struct ocfs2_merge_ctxt {
	enum ocfs2_contig_type	c_contig_type;	/* presumably: contiguity with the neighbouring record(s) */
	int			c_has_empty_extent; /* presumably: nonzero if the leaf holds an empty extent */
	int			c_split_covers_rec; /* presumably: nonzero if the split spans the whole record */
};
881
882 static int ocfs2_validate_extent_block(struct super_block *sb,
883                                        struct buffer_head *bh)
884 {
885         int rc;
886         struct ocfs2_extent_block *eb =
887                 (struct ocfs2_extent_block *)bh->b_data;
888
889         trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
890
891         BUG_ON(!buffer_uptodate(bh));
892
893         /*
894          * If the ecc fails, we return the error but otherwise
895          * leave the filesystem running.  We know any error is
896          * local to this block.
897          */
898         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
899         if (rc) {
900                 mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
901                      (unsigned long long)bh->b_blocknr);
902                 return rc;
903         }
904
905         /*
906          * Errors after here are fatal.
907          */
908
909         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
910                 ocfs2_error(sb,
911                             "Extent block #%llu has bad signature %.*s",
912                             (unsigned long long)bh->b_blocknr, 7,
913                             eb->h_signature);
914                 return -EINVAL;
915         }
916
917         if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
918                 ocfs2_error(sb,
919                             "Extent block #%llu has an invalid h_blkno "
920                             "of %llu",
921                             (unsigned long long)bh->b_blocknr,
922                             (unsigned long long)le64_to_cpu(eb->h_blkno));
923                 return -EINVAL;
924         }
925
926         if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
927                 ocfs2_error(sb,
928                             "Extent block #%llu has an invalid "
929                             "h_fs_generation of #%u",
930                             (unsigned long long)bh->b_blocknr,
931                             le32_to_cpu(eb->h_fs_generation));
932                 return -EINVAL;
933         }
934
935         return 0;
936 }
937
938 int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
939                             struct buffer_head **bh)
940 {
941         int rc;
942         struct buffer_head *tmp = *bh;
943
944         rc = ocfs2_read_block(ci, eb_blkno, &tmp,
945                               ocfs2_validate_extent_block);
946
947         /* If ocfs2_read_block() got us a new bh, pass it up. */
948         if (!rc && !*bh)
949                 *bh = tmp;
950
951         return rc;
952 }
953
954
955 /*
956  * How many free extents have we got before we need more meta data?
957  */
958 int ocfs2_num_free_extents(struct ocfs2_super *osb,
959                            struct ocfs2_extent_tree *et)
960 {
961         int retval;
962         struct ocfs2_extent_list *el = NULL;
963         struct ocfs2_extent_block *eb;
964         struct buffer_head *eb_bh = NULL;
965         u64 last_eb_blk = 0;
966
967         el = et->et_root_el;
968         last_eb_blk = ocfs2_et_get_last_eb_blk(et);
969
970         if (last_eb_blk) {
971                 retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
972                                                  &eb_bh);
973                 if (retval < 0) {
974                         mlog_errno(retval);
975                         goto bail;
976                 }
977                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
978                 el = &eb->h_list;
979         }
980
981         BUG_ON(el->l_tree_depth != 0);
982
983         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
984 bail:
985         brelse(eb_bh);
986
987         trace_ocfs2_num_free_extents(retval);
988         return retval;
989 }
990
991 /* expects array to already be allocated
992  *
993  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
994  * l_count for you
995  */
996 static int ocfs2_create_new_meta_bhs(handle_t *handle,
997                                      struct ocfs2_extent_tree *et,
998                                      int wanted,
999                                      struct ocfs2_alloc_context *meta_ac,
1000                                      struct buffer_head *bhs[])
1001 {
1002         int count, status, i;
1003         u16 suballoc_bit_start;
1004         u32 num_got;
1005         u64 suballoc_loc, first_blkno;
1006         struct ocfs2_super *osb =
1007                 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1008         struct ocfs2_extent_block *eb;
1009
1010         count = 0;
1011         while (count < wanted) {
1012                 status = ocfs2_claim_metadata(handle,
1013                                               meta_ac,
1014                                               wanted - count,
1015                                               &suballoc_loc,
1016                                               &suballoc_bit_start,
1017                                               &num_got,
1018                                               &first_blkno);
1019                 if (status < 0) {
1020                         mlog_errno(status);
1021                         goto bail;
1022                 }
1023
1024                 for(i = count;  i < (num_got + count); i++) {
1025                         bhs[i] = sb_getblk(osb->sb, first_blkno);
1026                         if (bhs[i] == NULL) {
1027                                 status = -EIO;
1028                                 mlog_errno(status);
1029                                 goto bail;
1030                         }
1031                         ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
1032
1033                         status = ocfs2_journal_access_eb(handle, et->et_ci,
1034                                                          bhs[i],
1035                                                          OCFS2_JOURNAL_ACCESS_CREATE);
1036                         if (status < 0) {
1037                                 mlog_errno(status);
1038                                 goto bail;
1039                         }
1040
1041                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
1042                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
1043                         /* Ok, setup the minimal stuff here. */
1044                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1045                         eb->h_blkno = cpu_to_le64(first_blkno);
1046                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1047                         eb->h_suballoc_slot =
1048                                 cpu_to_le16(meta_ac->ac_alloc_slot);
1049                         eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1050                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1051                         eb->h_list.l_count =
1052                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
1053
1054                         suballoc_bit_start++;
1055                         first_blkno++;
1056
1057                         /* We'll also be dirtied by the caller, so
1058                          * this isn't absolutely necessary. */
1059                         ocfs2_journal_dirty(handle, bhs[i]);
1060                 }
1061
1062                 count += num_got;
1063         }
1064
1065         status = 0;
1066 bail:
1067         if (status < 0) {
1068                 for(i = 0; i < wanted; i++) {
1069                         brelse(bhs[i]);
1070                         bhs[i] = NULL;
1071                 }
1072                 mlog_errno(status);
1073         }
1074         return status;
1075 }
1076
1077 /*
1078  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
1079  *
1080  * Returns the sum of the rightmost extent rec logical offset and
1081  * cluster count.
1082  *
1083  * ocfs2_add_branch() uses this to determine what logical cluster
1084  * value should be populated into the leftmost new branch records.
1085  *
1086  * ocfs2_shift_tree_depth() uses this to determine the # clusters
1087  * value for the new topmost tree record.
1088  */
1089 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
1090 {
1091         int i;
1092
1093         i = le16_to_cpu(el->l_next_free_rec) - 1;
1094
1095         return le32_to_cpu(el->l_recs[i].e_cpos) +
1096                 ocfs2_rec_clusters(el, &el->l_recs[i]);
1097 }
1098
1099 /*
1100  * Change range of the branches in the right most path according to the leaf
1101  * extent block's rightmost record.
1102  */
1103 static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1104                                          struct ocfs2_extent_tree *et)
1105 {
1106         int status;
1107         struct ocfs2_path *path = NULL;
1108         struct ocfs2_extent_list *el;
1109         struct ocfs2_extent_rec *rec;
1110
1111         path = ocfs2_new_path_from_et(et);
1112         if (!path) {
1113                 status = -ENOMEM;
1114                 return status;
1115         }
1116
1117         status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
1118         if (status < 0) {
1119                 mlog_errno(status);
1120                 goto out;
1121         }
1122
1123         status = ocfs2_extend_trans(handle, path_num_items(path));
1124         if (status < 0) {
1125                 mlog_errno(status);
1126                 goto out;
1127         }
1128
1129         status = ocfs2_journal_access_path(et->et_ci, handle, path);
1130         if (status < 0) {
1131                 mlog_errno(status);
1132                 goto out;
1133         }
1134
1135         el = path_leaf_el(path);
1136         rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
1137
1138         ocfs2_adjust_rightmost_records(handle, et, path, rec);
1139
1140 out:
1141         ocfs2_free_path(path);
1142         return status;
1143 }
1144
/*
 * Add an entire tree branch to our inode. eb_bh is the extent block
 * to start at, if we don't want to start the branch at the root
 * structure.
 *
 * last_eb_bh is required as we have to update its next_leaf pointer
 * for the new last extent block.
 *
 * the new branch will be 'empty' in the sense that every block will
 * contain a single record with cluster count == 0.
 *
 * @handle:     journal handle all modifications are made under.
 * @et:         extent tree being grown.
 * @eb_bh:      extent block to hang the branch off, or NULL to attach
 *              it to the root extent list of @et.
 * @last_eb_bh: in: the current rightmost leaf (must be non-NULL and
 *              point to a non-NULL buffer). out: on success the old
 *              buffer is released and replaced with a referenced
 *              pointer to the new rightmost leaf.
 * @meta_ac:    allocation context the new extent blocks are claimed
 *              from.
 *
 * The number of blocks allocated equals the tree depth of the list the
 * branch is attached to. Returns 0 on success or a negative error.
 */
static int ocfs2_add_branch(handle_t *handle,
			    struct ocfs2_extent_tree *et,
			    struct buffer_head *eb_bh,
			    struct buffer_head **last_eb_bh,
			    struct ocfs2_alloc_context *meta_ac)
{
	int status, new_blocks, i;
	u64 next_blkno, new_last_eb_blk;
	struct buffer_head *bh;
	struct buffer_head **new_eb_bhs = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *eb_el;
	struct ocfs2_extent_list  *el;
	u32 new_cpos, root_end;

	BUG_ON(!last_eb_bh || !*last_eb_bh);

	/* el is the list the new branch will be linked into. */
	if (eb_bh) {
		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;
	} else
		el = et->et_root_el;

	/* we never add a branch to a leaf. */
	BUG_ON(!el->l_tree_depth);

	/* One new block per level below el. */
	new_blocks = le16_to_cpu(el->l_tree_depth);

	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
	root_end = ocfs2_sum_rightmost_rec(et->et_root_el);

	/*
	 * If there is a gap between the root end and the real end
	 * of the rightmost leaf block, we need to remove the gap
	 * between new_cpos and root_end first so that the tree
	 * is consistent after we add a new branch (it will start
	 * from new_cpos).
	 */
	if (root_end > new_cpos) {
		trace_ocfs2_adjust_rightmost_branch(
			(unsigned long long)
			ocfs2_metadata_cache_owner(et->et_ci),
			root_end, new_cpos);

		status = ocfs2_adjust_rightmost_branch(handle, et);
		if (status) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* allocate the number of new eb blocks we need */
	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
			     GFP_KERNEL);
	if (!new_eb_bhs) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
					   meta_ac, new_eb_bhs);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
	 * linked with the rest of the tree.
	 * conversely, new_eb_bhs[0] is the new bottommost leaf.
	 *
	 * when we leave the loop, new_last_eb_blk will point to the
	 * newest leaf, and next_blkno will point to the topmost extent
	 * block. */
	next_blkno = new_last_eb_blk = 0;
	for(i = 0; i < new_blocks; i++) {
		bh = new_eb_bhs[i];
		eb = (struct ocfs2_extent_block *) bh->b_data;
		/* ocfs2_create_new_meta_bhs() should create it right! */
		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
		eb_el = &eb->h_list;

		status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
						 OCFS2_JOURNAL_ACCESS_CREATE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb->h_next_leaf_blk = 0;
		/* Block i sits i levels above the leaf. */
		eb_el->l_tree_depth = cpu_to_le16(i);
		eb_el->l_next_free_rec = cpu_to_le16(1);
		/*
		 * This actually counts as an empty extent as
		 * c_clusters == 0
		 */
		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
		/* Points at the block one level down (0 for the leaf). */
		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
		/*
		 * eb_el isn't always an interior node, but even leaf
		 * nodes want a zero'd flags and reserved field so
		 * this gets the whole 32 bits regardless of use.
		 */
		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
		if (!eb_el->l_tree_depth)
			new_last_eb_blk = le64_to_cpu(eb->h_blkno);

		ocfs2_journal_dirty(handle, bh);
		next_blkno = le64_to_cpu(eb->h_blkno);
	}

	/* This is a bit hairy. We want to update up to three blocks
	 * here without leaving any of them in an inconsistent state
	 * in case of error. We don't have to worry about
	 * journal_dirty erroring as it won't unless we've aborted the
	 * handle (in which case we would never be here) so reserving
	 * the write with journal_access is all we need to do. */
	status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	status = ocfs2_et_root_journal_access(handle, et,
					      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (eb_bh) {
		status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
						 OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Link the new branch into the rest of the tree (el will
	 * either be on the root_bh, or the extent block passed in). */
	i = le16_to_cpu(el->l_next_free_rec);
	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
	el->l_recs[i].e_int_clusters = 0;
	le16_add_cpu(&el->l_next_free_rec, 1);

	/* fe needs a new last extent block pointer, as does the
	 * next_leaf on the previously last-extent-block. */
	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);

	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);

	ocfs2_journal_dirty(handle, *last_eb_bh);
	ocfs2_journal_dirty(handle, et->et_root_bh);
	if (eb_bh)
		ocfs2_journal_dirty(handle, eb_bh);

	/*
	 * Some callers want to track the rightmost leaf so pass it
	 * back here.
	 */
	brelse(*last_eb_bh);
	/* Extra reference: the bail path below drops ours. */
	get_bh(new_eb_bhs[0]);
	*last_eb_bh = new_eb_bhs[0];

	status = 0;
bail:
	if (new_eb_bhs) {
		for (i = 0; i < new_blocks; i++)
			brelse(new_eb_bhs[i]);
		kfree(new_eb_bhs);
	}

	return status;
}
1333
1334 /*
1335  * adds another level to the allocation tree.
1336  * returns back the new extent block so you can add a branch to it
1337  * after this call.
1338  */
1339 static int ocfs2_shift_tree_depth(handle_t *handle,
1340                                   struct ocfs2_extent_tree *et,
1341                                   struct ocfs2_alloc_context *meta_ac,
1342                                   struct buffer_head **ret_new_eb_bh)
1343 {
1344         int status, i;
1345         u32 new_clusters;
1346         struct buffer_head *new_eb_bh = NULL;
1347         struct ocfs2_extent_block *eb;
1348         struct ocfs2_extent_list  *root_el;
1349         struct ocfs2_extent_list  *eb_el;
1350
1351         status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1352                                            &new_eb_bh);
1353         if (status < 0) {
1354                 mlog_errno(status);
1355                 goto bail;
1356         }
1357
1358         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1359         /* ocfs2_create_new_meta_bhs() should create it right! */
1360         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1361
1362         eb_el = &eb->h_list;
1363         root_el = et->et_root_el;
1364
1365         status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
1366                                          OCFS2_JOURNAL_ACCESS_CREATE);
1367         if (status < 0) {
1368                 mlog_errno(status);
1369                 goto bail;
1370         }
1371
1372         /* copy the root extent list data into the new extent block */
1373         eb_el->l_tree_depth = root_el->l_tree_depth;
1374         eb_el->l_next_free_rec = root_el->l_next_free_rec;
1375         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1376                 eb_el->l_recs[i] = root_el->l_recs[i];
1377
1378         ocfs2_journal_dirty(handle, new_eb_bh);
1379
1380         status = ocfs2_et_root_journal_access(handle, et,
1381                                               OCFS2_JOURNAL_ACCESS_WRITE);
1382         if (status < 0) {
1383                 mlog_errno(status);
1384                 goto bail;
1385         }
1386
1387         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1388
1389         /* update root_bh now */
1390         le16_add_cpu(&root_el->l_tree_depth, 1);
1391         root_el->l_recs[0].e_cpos = 0;
1392         root_el->l_recs[0].e_blkno = eb->h_blkno;
1393         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1394         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1395                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1396         root_el->l_next_free_rec = cpu_to_le16(1);
1397
1398         /* If this is our 1st tree depth shift, then last_eb_blk
1399          * becomes the allocated extent block */
1400         if (root_el->l_tree_depth == cpu_to_le16(1))
1401                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1402
1403         ocfs2_journal_dirty(handle, et->et_root_bh);
1404
1405         *ret_new_eb_bh = new_eb_bh;
1406         new_eb_bh = NULL;
1407         status = 0;
1408 bail:
1409         brelse(new_eb_bh);
1410
1411         return status;
1412 }
1413
1414 /*
1415  * Should only be called when there is no space left in any of the
1416  * leaf nodes. What we want to do is find the lowest tree depth
1417  * non-leaf extent block with room for new records. There are three
1418  * valid results of this search:
1419  *
1420  * 1) a lowest extent block is found, then we pass it back in
1421  *    *lowest_eb_bh and return '0'
1422  *
1423  * 2) the search fails to find anything, but the root_el has room. We
1424  *    pass NULL back in *lowest_eb_bh, but still return '0'
1425  *
1426  * 3) the search fails to find anything AND the root_el is full, in
1427  *    which case we return > 0
1428  *
1429  * return status < 0 indicates an error.
1430  */
1431 static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1432                                     struct buffer_head **target_bh)
1433 {
1434         int status = 0, i;
1435         u64 blkno;
1436         struct ocfs2_extent_block *eb;
1437         struct ocfs2_extent_list  *el;
1438         struct buffer_head *bh = NULL;
1439         struct buffer_head *lowest_bh = NULL;
1440
1441         *target_bh = NULL;
1442
1443         el = et->et_root_el;
1444
1445         while(le16_to_cpu(el->l_tree_depth) > 1) {
1446                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1447                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1448                                     "Owner %llu has empty "
1449                                     "extent list (next_free_rec == 0)",
1450                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1451                         status = -EIO;
1452                         goto bail;
1453                 }
1454                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1455                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1456                 if (!blkno) {
1457                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1458                                     "Owner %llu has extent "
1459                                     "list where extent # %d has no physical "
1460                                     "block start",
1461                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1462                         status = -EIO;
1463                         goto bail;
1464                 }
1465
1466                 brelse(bh);
1467                 bh = NULL;
1468
1469                 status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
1470                 if (status < 0) {
1471                         mlog_errno(status);
1472                         goto bail;
1473                 }
1474
1475                 eb = (struct ocfs2_extent_block *) bh->b_data;
1476                 el = &eb->h_list;
1477
1478                 if (le16_to_cpu(el->l_next_free_rec) <
1479                     le16_to_cpu(el->l_count)) {
1480                         brelse(lowest_bh);
1481                         lowest_bh = bh;
1482                         get_bh(lowest_bh);
1483                 }
1484         }
1485
1486         /* If we didn't find one and the fe doesn't have any room,
1487          * then return '1' */
1488         el = et->et_root_el;
1489         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1490                 status = 1;
1491
1492         *target_bh = lowest_bh;
1493 bail:
1494         brelse(bh);
1495
1496         return status;
1497 }
1498
1499 /*
1500  * Grow a b-tree so that it has more records.
1501  *
1502  * We might shift the tree depth in which case existing paths should
1503  * be considered invalid.
1504  *
1505  * Tree depth after the grow is returned via *final_depth.
1506  *
1507  * *last_eb_bh will be updated by ocfs2_add_branch().
1508  */
1509 static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1510                            int *final_depth, struct buffer_head **last_eb_bh,
1511                            struct ocfs2_alloc_context *meta_ac)
1512 {
1513         int ret, shift;
1514         struct ocfs2_extent_list *el = et->et_root_el;
1515         int depth = le16_to_cpu(el->l_tree_depth);
1516         struct buffer_head *bh = NULL;
1517
1518         BUG_ON(meta_ac == NULL);
1519
1520         shift = ocfs2_find_branch_target(et, &bh);
1521         if (shift < 0) {
1522                 ret = shift;
1523                 mlog_errno(ret);
1524                 goto out;
1525         }
1526
1527         /* We traveled all the way to the bottom of the allocation tree
1528          * and didn't find room for any more extents - we need to add
1529          * another tree level */
1530         if (shift) {
1531                 BUG_ON(bh);
1532                 trace_ocfs2_grow_tree(
1533                         (unsigned long long)
1534                         ocfs2_metadata_cache_owner(et->et_ci),
1535                         depth);
1536
1537                 /* ocfs2_shift_tree_depth will return us a buffer with
1538                  * the new extent block (so we can pass that to
1539                  * ocfs2_add_branch). */
1540                 ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
1541                 if (ret < 0) {
1542                         mlog_errno(ret);
1543                         goto out;
1544                 }
1545                 depth++;
1546                 if (depth == 1) {
1547                         /*
1548                          * Special case: we have room now if we shifted from
1549                          * tree_depth 0, so no more work needs to be done.
1550                          *
1551                          * We won't be calling add_branch, so pass
1552                          * back *last_eb_bh as the new leaf. At depth
1553                          * zero, it should always be null so there's
1554                          * no reason to brelse.
1555                          */
1556                         BUG_ON(*last_eb_bh);
1557                         get_bh(bh);
1558                         *last_eb_bh = bh;
1559                         goto out;
1560                 }
1561         }
1562
1563         /* call ocfs2_add_branch to add the final part of the tree with
1564          * the new data. */
1565         ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1566                                meta_ac);
1567         if (ret < 0) {
1568                 mlog_errno(ret);
1569                 goto out;
1570         }
1571
1572 out:
1573         if (final_depth)
1574                 *final_depth = depth;
1575         brelse(bh);
1576         return ret;
1577 }
1578
1579 /*
1580  * This function will discard the rightmost extent record.
1581  */
1582 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1583 {
1584         int next_free = le16_to_cpu(el->l_next_free_rec);
1585         int count = le16_to_cpu(el->l_count);
1586         unsigned int num_bytes;
1587
1588         BUG_ON(!next_free);
1589         /* This will cause us to go off the end of our extent list. */
1590         BUG_ON(next_free >= count);
1591
1592         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1593
1594         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1595 }
1596
1597 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1598                               struct ocfs2_extent_rec *insert_rec)
1599 {
1600         int i, insert_index, next_free, has_empty, num_bytes;
1601         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1602         struct ocfs2_extent_rec *rec;
1603
1604         next_free = le16_to_cpu(el->l_next_free_rec);
1605         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1606
1607         BUG_ON(!next_free);
1608
1609         /* The tree code before us didn't allow enough room in the leaf. */
1610         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1611
1612         /*
1613          * The easiest way to approach this is to just remove the
1614          * empty extent and temporarily decrement next_free.
1615          */
1616         if (has_empty) {
1617                 /*
1618                  * If next_free was 1 (only an empty extent), this
1619                  * loop won't execute, which is fine. We still want
1620                  * the decrement above to happen.
1621                  */
1622                 for(i = 0; i < (next_free - 1); i++)
1623                         el->l_recs[i] = el->l_recs[i+1];
1624
1625                 next_free--;
1626         }
1627
1628         /*
1629          * Figure out what the new record index should be.
1630          */
1631         for(i = 0; i < next_free; i++) {
1632                 rec = &el->l_recs[i];
1633
1634                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1635                         break;
1636         }
1637         insert_index = i;
1638
1639         trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
1640                                 has_empty, next_free,
1641                                 le16_to_cpu(el->l_count));
1642
1643         BUG_ON(insert_index < 0);
1644         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1645         BUG_ON(insert_index > next_free);
1646
1647         /*
1648          * No need to memmove if we're just adding to the tail.
1649          */
1650         if (insert_index != next_free) {
1651                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1652
1653                 num_bytes = next_free - insert_index;
1654                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1655                 memmove(&el->l_recs[insert_index + 1],
1656                         &el->l_recs[insert_index],
1657                         num_bytes);
1658         }
1659
1660         /*
1661          * Either we had an empty extent, and need to re-increment or
1662          * there was no empty extent on a non full rightmost leaf node,
1663          * in which case we still need to increment.
1664          */
1665         next_free++;
1666         el->l_next_free_rec = cpu_to_le16(next_free);
1667         /*
1668          * Make sure none of the math above just messed up our tree.
1669          */
1670         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1671
1672         el->l_recs[insert_index] = *insert_rec;
1673
1674 }
1675
1676 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1677 {
1678         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1679
1680         BUG_ON(num_recs == 0);
1681
1682         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1683                 num_recs--;
1684                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1685                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1686                 memset(&el->l_recs[num_recs], 0,
1687                        sizeof(struct ocfs2_extent_rec));
1688                 el->l_next_free_rec = cpu_to_le16(num_recs);
1689         }
1690 }
1691
1692 /*
1693  * Create an empty extent record .
1694  *
1695  * l_next_free_rec may be updated.
1696  *
1697  * If an empty extent already exists do nothing.
1698  */
1699 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1700 {
1701         int next_free = le16_to_cpu(el->l_next_free_rec);
1702
1703         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1704
1705         if (next_free == 0)
1706                 goto set_and_inc;
1707
1708         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1709                 return;
1710
1711         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1712                         "Asked to create an empty extent in a full list:\n"
1713                         "count = %u, tree depth = %u",
1714                         le16_to_cpu(el->l_count),
1715                         le16_to_cpu(el->l_tree_depth));
1716
1717         ocfs2_shift_records_right(el);
1718
1719 set_and_inc:
1720         le16_add_cpu(&el->l_next_free_rec, 1);
1721         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1722 }
1723
1724 /*
1725  * For a rotation which involves two leaf nodes, the "root node" is
1726  * the lowest level tree node which contains a path to both leafs. This
1727  * resulting set of information can be used to form a complete "subtree"
1728  *
1729  * This function is passed two full paths from the dinode down to a
1730  * pair of adjacent leaves. It's task is to figure out which path
1731  * index contains the subtree root - this can be the root index itself
1732  * in a worst-case rotation.
1733  *
1734  * The array index of the subtree root is passed back.
1735  */
1736 int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1737                             struct ocfs2_path *left,
1738                             struct ocfs2_path *right)
1739 {
1740         int i = 0;
1741
1742         /*
1743          * Check that the caller passed in two paths from the same tree.
1744          */
1745         BUG_ON(path_root_bh(left) != path_root_bh(right));
1746
1747         do {
1748                 i++;
1749
1750                 /*
1751                  * The caller didn't pass two adjacent paths.
1752                  */
1753                 mlog_bug_on_msg(i > left->p_tree_depth,
1754                                 "Owner %llu, left depth %u, right depth %u\n"
1755                                 "left leaf blk %llu, right leaf blk %llu\n",
1756                                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
1757                                 left->p_tree_depth, right->p_tree_depth,
1758                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1759                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1760         } while (left->p_node[i].bh->b_blocknr ==
1761                  right->p_node[i].bh->b_blocknr);
1762
1763         return i - 1;
1764 }
1765
1766 typedef void (path_insert_t)(void *, struct buffer_head *);
1767
1768 /*
1769  * Traverse a btree path in search of cpos, starting at root_el.
1770  *
1771  * This code can be called with a cpos larger than the tree, in which
1772  * case it will return the rightmost path.
1773  */
1774 static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1775                              struct ocfs2_extent_list *root_el, u32 cpos,
1776                              path_insert_t *func, void *data)
1777 {
1778         int i, ret = 0;
1779         u32 range;
1780         u64 blkno;
1781         struct buffer_head *bh = NULL;
1782         struct ocfs2_extent_block *eb;
1783         struct ocfs2_extent_list *el;
1784         struct ocfs2_extent_rec *rec;
1785
1786         el = root_el;
1787         while (el->l_tree_depth) {
1788                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1789                         ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1790                                     "Owner %llu has empty extent list at "
1791                                     "depth %u\n",
1792                                     (unsigned long long)ocfs2_metadata_cache_owner(ci),
1793                                     le16_to_cpu(el->l_tree_depth));
1794                         ret = -EROFS;
1795                         goto out;
1796
1797                 }
1798
1799                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1800                         rec = &el->l_recs[i];
1801
1802                         /*
1803                          * In the case that cpos is off the allocation
1804                          * tree, this should just wind up returning the
1805                          * rightmost record.
1806                          */
1807                         range = le32_to_cpu(rec->e_cpos) +
1808                                 ocfs2_rec_clusters(el, rec);
1809                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1810                             break;
1811                 }
1812
1813                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1814                 if (blkno == 0) {
1815                         ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1816                                     "Owner %llu has bad blkno in extent list "
1817                                     "at depth %u (index %d)\n",
1818                                     (unsigned long long)ocfs2_metadata_cache_owner(ci),
1819                                     le16_to_cpu(el->l_tree_depth), i);
1820                         ret = -EROFS;
1821                         goto out;
1822                 }
1823
1824                 brelse(bh);
1825                 bh = NULL;
1826                 ret = ocfs2_read_extent_block(ci, blkno, &bh);
1827                 if (ret) {
1828                         mlog_errno(ret);
1829                         goto out;
1830                 }
1831
1832                 eb = (struct ocfs2_extent_block *) bh->b_data;
1833                 el = &eb->h_list;
1834
1835                 if (le16_to_cpu(el->l_next_free_rec) >
1836                     le16_to_cpu(el->l_count)) {
1837                         ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1838                                     "Owner %llu has bad count in extent list "
1839                                     "at block %llu (next free=%u, count=%u)\n",
1840                                     (unsigned long long)ocfs2_metadata_cache_owner(ci),
1841                                     (unsigned long long)bh->b_blocknr,
1842                                     le16_to_cpu(el->l_next_free_rec),
1843                                     le16_to_cpu(el->l_count));
1844                         ret = -EROFS;
1845                         goto out;
1846                 }
1847
1848                 if (func)
1849                         func(data, bh);
1850         }
1851
1852 out:
1853         /*
1854          * Catch any trailing bh that the loop didn't handle.
1855          */
1856         brelse(bh);
1857
1858         return ret;
1859 }
1860
1861 /*
1862  * Given an initialized path (that is, it has a valid root extent
1863  * list), this function will traverse the btree in search of the path
1864  * which would contain cpos.
1865  *
1866  * The path traveled is recorded in the path structure.
1867  *
1868  * Note that this will not do any comparisons on leaf node extent
1869  * records, so it will work fine in the case that we just added a tree
1870  * branch.
1871  */
1872 struct find_path_data {
1873         int index;
1874         struct ocfs2_path *path;
1875 };
1876 static void find_path_ins(void *data, struct buffer_head *bh)
1877 {
1878         struct find_path_data *fp = data;
1879
1880         get_bh(bh);
1881         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1882         fp->index++;
1883 }
1884 int ocfs2_find_path(struct ocfs2_caching_info *ci,
1885                     struct ocfs2_path *path, u32 cpos)
1886 {
1887         struct find_path_data data;
1888
1889         data.index = 1;
1890         data.path = path;
1891         return __ocfs2_find_path(ci, path_root_el(path), cpos,
1892                                  find_path_ins, &data);
1893 }
1894
1895 static void find_leaf_ins(void *data, struct buffer_head *bh)
1896 {
1897         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1898         struct ocfs2_extent_list *el = &eb->h_list;
1899         struct buffer_head **ret = data;
1900
1901         /* We want to retain only the leaf block. */
1902         if (le16_to_cpu(el->l_tree_depth) == 0) {
1903                 get_bh(bh);
1904                 *ret = bh;
1905         }
1906 }
1907 /*
1908  * Find the leaf block in the tree which would contain cpos. No
1909  * checking of the actual leaf is done.
1910  *
1911  * Some paths want to call this instead of allocating a path structure
1912  * and calling ocfs2_find_path().
1913  *
1914  * This function doesn't handle non btree extent lists.
1915  */
1916 int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1917                     struct ocfs2_extent_list *root_el, u32 cpos,
1918                     struct buffer_head **leaf_bh)
1919 {
1920         int ret;
1921         struct buffer_head *bh = NULL;
1922
1923         ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1924         if (ret) {
1925                 mlog_errno(ret);
1926                 goto out;
1927         }
1928
1929         *leaf_bh = bh;
1930 out:
1931         return ret;
1932 }
1933
1934 /*
1935  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1936  *
1937  * Basically, we've moved stuff around at the bottom of the tree and
1938  * we need to fix up the extent records above the changes to reflect
1939  * the new changes.
1940  *
1941  * left_rec: the record on the left.
1942  * left_child_el: is the child list pointed to by left_rec
1943  * right_rec: the record to the right of left_rec
1944  * right_child_el: is the child list pointed to by right_rec
1945  *
1946  * By definition, this only works on interior nodes.
1947  */
1948 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1949                                   struct ocfs2_extent_list *left_child_el,
1950                                   struct ocfs2_extent_rec *right_rec,
1951                                   struct ocfs2_extent_list *right_child_el)
1952 {
1953         u32 left_clusters, right_end;
1954
1955         /*
1956          * Interior nodes never have holes. Their cpos is the cpos of
1957          * the leftmost record in their child list. Their cluster
1958          * count covers the full theoretical range of their child list
1959          * - the range between their cpos and the cpos of the record
1960          * immediately to their right.
1961          */
1962         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1963         if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
1964                 BUG_ON(right_child_el->l_tree_depth);
1965                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1966                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1967         }
1968         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1969         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1970
1971         /*
1972          * Calculate the rightmost cluster count boundary before
1973          * moving cpos - we will need to adjust clusters after
1974          * updating e_cpos to keep the same highest cluster count.
1975          */
1976         right_end = le32_to_cpu(right_rec->e_cpos);
1977         right_end += le32_to_cpu(right_rec->e_int_clusters);
1978
1979         right_rec->e_cpos = left_rec->e_cpos;
1980         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1981
1982         right_end -= le32_to_cpu(right_rec->e_cpos);
1983         right_rec->e_int_clusters = cpu_to_le32(right_end);
1984 }
1985
1986 /*
1987  * Adjust the adjacent root node records involved in a
1988  * rotation. left_el_blkno is passed in as a key so that we can easily
1989  * find it's index in the root list.
1990  */
1991 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1992                                       struct ocfs2_extent_list *left_el,
1993                                       struct ocfs2_extent_list *right_el,
1994                                       u64 left_el_blkno)
1995 {
1996         int i;
1997
1998         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1999                le16_to_cpu(left_el->l_tree_depth));
2000
2001         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
2002                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
2003                         break;
2004         }
2005
2006         /*
2007          * The path walking code should have never returned a root and
2008          * two paths which are not adjacent.
2009          */
2010         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
2011
2012         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
2013                                       &root_el->l_recs[i + 1], right_el);
2014 }
2015
2016 /*
2017  * We've changed a leaf block (in right_path) and need to reflect that
2018  * change back up the subtree.
2019  *
2020  * This happens in multiple places:
2021  *   - When we've moved an extent record from the left path leaf to the right
2022  *     path leaf to make room for an empty extent in the left path leaf.
2023  *   - When our insert into the right path leaf is at the leftmost edge
2024  *     and requires an update of the path immediately to it's left. This
2025  *     can occur at the end of some types of rotation and appending inserts.
2026  *   - When we've adjusted the last extent record in the left path leaf and the
2027  *     1st extent record in the right path leaf during cross extent block merge.
2028  */
2029 static void ocfs2_complete_edge_insert(handle_t *handle,
2030                                        struct ocfs2_path *left_path,
2031                                        struct ocfs2_path *right_path,
2032                                        int subtree_index)
2033 {
2034         int i, idx;
2035         struct ocfs2_extent_list *el, *left_el, *right_el;
2036         struct ocfs2_extent_rec *left_rec, *right_rec;
2037         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2038
2039         /*
2040          * Update the counts and position values within all the
2041          * interior nodes to reflect the leaf rotation we just did.
2042          *
2043          * The root node is handled below the loop.
2044          *
2045          * We begin the loop with right_el and left_el pointing to the
2046          * leaf lists and work our way up.
2047          *
2048          * NOTE: within this loop, left_el and right_el always refer
2049          * to the *child* lists.
2050          */
2051         left_el = path_leaf_el(left_path);
2052         right_el = path_leaf_el(right_path);
2053         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2054                 trace_ocfs2_complete_edge_insert(i);
2055
2056                 /*
2057                  * One nice property of knowing that all of these
2058                  * nodes are below the root is that we only deal with
2059                  * the leftmost right node record and the rightmost
2060                  * left node record.
2061                  */
2062                 el = left_path->p_node[i].el;
2063                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
2064                 left_rec = &el->l_recs[idx];
2065
2066                 el = right_path->p_node[i].el;
2067                 right_rec = &el->l_recs[0];
2068
2069                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2070                                               right_el);
2071
2072                 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2073                 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2074
2075                 /*
2076                  * Setup our list pointers now so that the current
2077                  * parents become children in the next iteration.
2078                  */
2079                 left_el = left_path->p_node[i].el;
2080                 right_el = right_path->p_node[i].el;
2081         }
2082
2083         /*
2084          * At the root node, adjust the two adjacent records which
2085          * begin our path to the leaves.
2086          */
2087
2088         el = left_path->p_node[subtree_index].el;
2089         left_el = left_path->p_node[subtree_index + 1].el;
2090         right_el = right_path->p_node[subtree_index + 1].el;
2091
2092         ocfs2_adjust_root_records(el, left_el, right_el,
2093                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
2094
2095         root_bh = left_path->p_node[subtree_index].bh;
2096
2097         ocfs2_journal_dirty(handle, root_bh);
2098 }
2099
2100 static int ocfs2_rotate_subtree_right(handle_t *handle,
2101                                       struct ocfs2_extent_tree *et,
2102                                       struct ocfs2_path *left_path,
2103                                       struct ocfs2_path *right_path,
2104                                       int subtree_index)
2105 {
2106         int ret, i;
2107         struct buffer_head *right_leaf_bh;
2108         struct buffer_head *left_leaf_bh = NULL;
2109         struct buffer_head *root_bh;
2110         struct ocfs2_extent_list *right_el, *left_el;
2111         struct ocfs2_extent_rec move_rec;
2112
2113         left_leaf_bh = path_leaf_bh(left_path);
2114         left_el = path_leaf_el(left_path);
2115
2116         if (left_el->l_next_free_rec != left_el->l_count) {
2117                 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2118                             "Inode %llu has non-full interior leaf node %llu"
2119                             "(next free = %u)",
2120                             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2121                             (unsigned long long)left_leaf_bh->b_blocknr,
2122                             le16_to_cpu(left_el->l_next_free_rec));
2123                 return -EROFS;
2124         }
2125
2126         /*
2127          * This extent block may already have an empty record, so we
2128          * return early if so.
2129          */
2130         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2131                 return 0;
2132
2133         root_bh = left_path->p_node[subtree_index].bh;
2134         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2135
2136         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2137                                            subtree_index);
2138         if (ret) {
2139                 mlog_errno(ret);
2140                 goto out;
2141         }
2142
2143         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2144                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2145                                                    right_path, i);
2146                 if (ret) {
2147                         mlog_errno(ret);
2148                         goto out;
2149                 }
2150
2151                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2152                                                    left_path, i);
2153                 if (ret) {
2154                         mlog_errno(ret);
2155                         goto out;
2156                 }
2157         }
2158
2159         right_leaf_bh = path_leaf_bh(right_path);
2160         right_el = path_leaf_el(right_path);
2161
2162         /* This is a code error, not a disk corruption. */
2163         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2164                         "because rightmost leaf block %llu is empty\n",
2165                         (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2166                         (unsigned long long)right_leaf_bh->b_blocknr);
2167
2168         ocfs2_create_empty_extent(right_el);
2169
2170         ocfs2_journal_dirty(handle, right_leaf_bh);
2171
2172         /* Do the copy now. */
2173         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2174         move_rec = left_el->l_recs[i];
2175         right_el->l_recs[0] = move_rec;
2176
2177         /*
2178          * Clear out the record we just copied and shift everything
2179          * over, leaving an empty extent in the left leaf.
2180          *
2181          * We temporarily subtract from next_free_rec so that the
2182          * shift will lose the tail record (which is now defunct).
2183          */
2184         le16_add_cpu(&left_el->l_next_free_rec, -1);
2185         ocfs2_shift_records_right(left_el);
2186         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2187         le16_add_cpu(&left_el->l_next_free_rec, 1);
2188
2189         ocfs2_journal_dirty(handle, left_leaf_bh);
2190
2191         ocfs2_complete_edge_insert(handle, left_path, right_path,
2192                                    subtree_index);
2193
2194 out:
2195         return ret;
2196 }
2197
2198 /*
2199  * Given a full path, determine what cpos value would return us a path
2200  * containing the leaf immediately to the left of the current one.
2201  *
2202  * Will return zero if the path passed in is already the leftmost path.
2203  */
2204 int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2205                                   struct ocfs2_path *path, u32 *cpos)
2206 {
2207         int i, j, ret = 0;
2208         u64 blkno;
2209         struct ocfs2_extent_list *el;
2210
2211         BUG_ON(path->p_tree_depth == 0);
2212
2213         *cpos = 0;
2214
2215         blkno = path_leaf_bh(path)->b_blocknr;
2216
2217         /* Start at the tree node just above the leaf and work our way up. */
2218         i = path->p_tree_depth - 1;
2219         while (i >= 0) {
2220                 el = path->p_node[i].el;
2221
2222                 /*
2223                  * Find the extent record just before the one in our
2224                  * path.
2225                  */
2226                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2227                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2228                                 if (j == 0) {
2229                                         if (i == 0) {
2230                                                 /*
2231                                                  * We've determined that the
2232                                                  * path specified is already
2233                                                  * the leftmost one - return a
2234                                                  * cpos of zero.
2235                                                  */
2236                                                 goto out;
2237                                         }
2238                                         /*
2239                                          * The leftmost record points to our
2240                                          * leaf - we need to travel up the
2241                                          * tree one level.
2242                                          */
2243                                         goto next_node;
2244                                 }
2245
2246                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2247                                 *cpos = *cpos + ocfs2_rec_clusters(el,
2248                                                            &el->l_recs[j - 1]);
2249                                 *cpos = *cpos - 1;
2250                                 goto out;
2251                         }
2252                 }
2253
2254                 /*
2255                  * If we got here, we never found a valid node where
2256                  * the tree indicated one should be.
2257                  */
2258                 ocfs2_error(sb,
2259                             "Invalid extent tree at extent block %llu\n",
2260                             (unsigned long long)blkno);
2261                 ret = -EROFS;
2262                 goto out;
2263
2264 next_node:
2265                 blkno = path->p_node[i].bh->b_blocknr;
2266                 i--;
2267         }
2268
2269 out:
2270         return ret;
2271 }
2272
2273 /*
2274  * Extend the transaction by enough credits to complete the rotation,
2275  * and still leave at least the original number of credits allocated
2276  * to this transaction.
2277  */
2278 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2279                                            int op_credits,
2280                                            struct ocfs2_path *path)
2281 {
2282         int ret = 0;
2283         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2284
2285         if (handle->h_buffer_credits < credits)
2286                 ret = ocfs2_extend_trans(handle,
2287                                          credits - handle->h_buffer_credits);
2288
2289         return ret;
2290 }
2291
2292 /*
2293  * Trap the case where we're inserting into the theoretical range past
2294  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2295  * whose cpos is less than ours into the right leaf.
2296  *
2297  * It's only necessary to look at the rightmost record of the left
2298  * leaf because the logic that calls us should ensure that the
2299  * theoretical ranges in the path components above the leaves are
2300  * correct.
2301  */
2302 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2303                                                  u32 insert_cpos)
2304 {
2305         struct ocfs2_extent_list *left_el;
2306         struct ocfs2_extent_rec *rec;
2307         int next_free;
2308
2309         left_el = path_leaf_el(left_path);
2310         next_free = le16_to_cpu(left_el->l_next_free_rec);
2311         rec = &left_el->l_recs[next_free - 1];
2312
2313         if (insert_cpos > le32_to_cpu(rec->e_cpos))
2314                 return 1;
2315         return 0;
2316 }
2317
2318 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2319 {
2320         int next_free = le16_to_cpu(el->l_next_free_rec);
2321         unsigned int range;
2322         struct ocfs2_extent_rec *rec;
2323
2324         if (next_free == 0)
2325                 return 0;
2326
2327         rec = &el->l_recs[0];
2328         if (ocfs2_is_empty_extent(rec)) {
2329                 /* Empty list. */
2330                 if (next_free == 1)
2331                         return 0;
2332                 rec = &el->l_recs[1];
2333         }
2334
2335         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2336         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2337                 return 1;
2338         return 0;
2339 }
2340
2341 /*
2342  * Rotate all the records in a btree right one record, starting at insert_cpos.
2343  *
2344  * The path to the rightmost leaf should be passed in.
2345  *
2346  * The array is assumed to be large enough to hold an entire path (tree depth).
2347  *
2348  * Upon successful return from this function:
2349  *
2350  * - The 'right_path' array will contain a path to the leaf block
2351  *   whose range contains e_cpos.
2352  * - That leaf block will have a single empty extent in list index 0.
2353  * - In the case that the rotation requires a post-insert update,
2354  *   *ret_left_path will contain a valid path which can be passed to
2355  *   ocfs2_insert_path().
2356  */
2357 static int ocfs2_rotate_tree_right(handle_t *handle,
2358                                    struct ocfs2_extent_tree *et,
2359                                    enum ocfs2_split_type split,
2360                                    u32 insert_cpos,
2361                                    struct ocfs2_path *right_path,
2362                                    struct ocfs2_path **ret_left_path)
2363 {
2364         int ret, start, orig_credits = handle->h_buffer_credits;
2365         u32 cpos;
2366         struct ocfs2_path *left_path = NULL;
2367         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2368
2369         *ret_left_path = NULL;
2370
2371         left_path = ocfs2_new_path_from_path(right_path);
2372         if (!left_path) {
2373                 ret = -ENOMEM;
2374                 mlog_errno(ret);
2375                 goto out;
2376         }
2377
2378         ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2379         if (ret) {
2380                 mlog_errno(ret);
2381                 goto out;
2382         }
2383
2384         trace_ocfs2_rotate_tree_right(
2385                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2386                 insert_cpos, cpos);
2387
2388         /*
2389          * What we want to do here is:
2390          *
2391          * 1) Start with the rightmost path.
2392          *
2393          * 2) Determine a path to the leaf block directly to the left
2394          *    of that leaf.
2395          *
2396          * 3) Determine the 'subtree root' - the lowest level tree node
2397          *    which contains a path to both leaves.
2398          *
2399          * 4) Rotate the subtree.
2400          *
2401          * 5) Find the next subtree by considering the left path to be
2402          *    the new right path.
2403          *
2404          * The check at the top of this while loop also accepts
2405          * insert_cpos == cpos because cpos is only a _theoretical_
2406          * value to get us the left path - insert_cpos might very well
2407          * be filling that hole.
2408          *
2409          * Stop at a cpos of '0' because we either started at the
2410          * leftmost branch (i.e., a tree with one branch and a
2411          * rotation inside of it), or we've gone as far as we can in
2412          * rotating subtrees.
2413          */
2414         while (cpos && insert_cpos <= cpos) {
2415                 trace_ocfs2_rotate_tree_right(
2416                         (unsigned long long)
2417                         ocfs2_metadata_cache_owner(et->et_ci),
2418                         insert_cpos, cpos);
2419
2420                 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2421                 if (ret) {
2422                         mlog_errno(ret);
2423                         goto out;
2424                 }
2425
2426                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2427                                 path_leaf_bh(right_path),
2428                                 "Owner %llu: error during insert of %u "
2429                                 "(left path cpos %u) results in two identical "
2430                                 "paths ending at %llu\n",
2431                                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2432                                 insert_cpos, cpos,
2433                                 (unsigned long long)
2434                                 path_leaf_bh(left_path)->b_blocknr);
2435
2436                 if (split == SPLIT_NONE &&
2437                     ocfs2_rotate_requires_path_adjustment(left_path,
2438                                                           insert_cpos)) {
2439
2440                         /*
2441                          * We've rotated the tree as much as we
2442                          * should. The rest is up to
2443                          * ocfs2_insert_path() to complete, after the
2444                          * record insertion. We indicate this
2445                          * situation by returning the left path.
2446                          *
2447                          * The reason we don't adjust the records here
2448                          * before the record insert is that an error
2449                          * later might break the rule where a parent
2450                          * record e_cpos will reflect the actual
2451                          * e_cpos of the 1st nonempty record of the
2452                          * child list.
2453                          */
2454                         *ret_left_path = left_path;
2455                         goto out_ret_path;
2456                 }
2457
2458                 start = ocfs2_find_subtree_root(et, left_path, right_path);
2459
2460                 trace_ocfs2_rotate_subtree(start,
2461                         (unsigned long long)
2462                         right_path->p_node[start].bh->b_blocknr,
2463                         right_path->p_tree_depth);
2464
2465                 ret = ocfs2_extend_rotate_transaction(handle, start,
2466                                                       orig_credits, right_path);
2467                 if (ret) {
2468                         mlog_errno(ret);
2469                         goto out;
2470                 }
2471
2472                 ret = ocfs2_rotate_subtree_right(handle, et, left_path,
2473                                                  right_path, start);
2474                 if (ret) {
2475                         mlog_errno(ret);
2476                         goto out;
2477                 }
2478
2479                 if (split != SPLIT_NONE &&
2480                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2481                                                 insert_cpos)) {
2482                         /*
2483                          * A rotate moves the rightmost left leaf
2484                          * record over to the leftmost right leaf
2485                          * slot. If we're doing an extent split
2486                          * instead of a real insert, then we have to
2487                          * check that the extent to be split wasn't
2488                          * just moved over. If it was, then we can
2489                          * exit here, passing left_path back -
2490                          * ocfs2_split_extent() is smart enough to
2491                          * search both leaves.
2492                          */
2493                         *ret_left_path = left_path;
2494                         goto out_ret_path;
2495                 }
2496
2497                 /*
2498                  * There is no need to re-read the next right path
2499                  * as we know that it'll be our current left
2500                  * path. Optimize by copying values instead.
2501                  */
2502                 ocfs2_mv_path(right_path, left_path);
2503
2504                 ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2505                 if (ret) {
2506                         mlog_errno(ret);
2507                         goto out;
2508                 }
2509         }
2510
2511 out:
2512         ocfs2_free_path(left_path);
2513
2514 out_ret_path:
2515         return ret;
2516 }
2517
2518 static int ocfs2_update_edge_lengths(handle_t *handle,
2519                                      struct ocfs2_extent_tree *et,
2520                                      int subtree_index, struct ocfs2_path *path)
2521 {
2522         int i, idx, ret;
2523         struct ocfs2_extent_rec *rec;
2524         struct ocfs2_extent_list *el;
2525         struct ocfs2_extent_block *eb;
2526         u32 range;
2527
2528         /*
2529          * In normal tree rotation process, we will never touch the
2530          * tree branch above subtree_index and ocfs2_extend_rotate_transaction
2531          * doesn't reserve the credits for them either.
2532          *
2533          * But we do have a special case here which will update the rightmost
2534          * records for all the bh in the path.
2535          * So we have to allocate extra credits and access them.
2536          */
2537         ret = ocfs2_extend_trans(handle, subtree_index);
2538         if (ret) {
2539                 mlog_errno(ret);
2540                 goto out;
2541         }
2542
2543         ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2544         if (ret) {
2545                 mlog_errno(ret);
2546                 goto out;
2547         }
2548
2549         /* Path should always be rightmost. */
2550         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2551         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2552
2553         el = &eb->h_list;
2554         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2555         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2556         rec = &el->l_recs[idx];
2557         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2558
2559         for (i = 0; i < path->p_tree_depth; i++) {
2560                 el = path->p_node[i].el;
2561                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2562                 rec = &el->l_recs[idx];
2563
2564                 rec->e_int_clusters = cpu_to_le32(range);
2565                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2566
2567                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2568         }
2569 out:
2570         return ret;
2571 }
2572
2573 static void ocfs2_unlink_path(handle_t *handle,
2574                               struct ocfs2_extent_tree *et,
2575                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2576                               struct ocfs2_path *path, int unlink_start)
2577 {
2578         int ret, i;
2579         struct ocfs2_extent_block *eb;
2580         struct ocfs2_extent_list *el;
2581         struct buffer_head *bh;
2582
2583         for(i = unlink_start; i < path_num_items(path); i++) {
2584                 bh = path->p_node[i].bh;
2585
2586                 eb = (struct ocfs2_extent_block *)bh->b_data;
2587                 /*
2588                  * Not all nodes might have had their final count
2589                  * decremented by the caller - handle this here.
2590                  */
2591                 el = &eb->h_list;
2592                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2593                         mlog(ML_ERROR,
2594                              "Inode %llu, attempted to remove extent block "
2595                              "%llu with %u records\n",
2596                              (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2597                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2598                              le16_to_cpu(el->l_next_free_rec));
2599
2600                         ocfs2_journal_dirty(handle, bh);
2601                         ocfs2_remove_from_cache(et->et_ci, bh);
2602                         continue;
2603                 }
2604
2605                 el->l_next_free_rec = 0;
2606                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2607
2608                 ocfs2_journal_dirty(handle, bh);
2609
2610                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2611                 if (ret)
2612                         mlog_errno(ret);
2613
2614                 ocfs2_remove_from_cache(et->et_ci, bh);
2615         }
2616 }
2617
2618 static void ocfs2_unlink_subtree(handle_t *handle,
2619                                  struct ocfs2_extent_tree *et,
2620                                  struct ocfs2_path *left_path,
2621                                  struct ocfs2_path *right_path,
2622                                  int subtree_index,
2623                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2624 {
2625         int i;
2626         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2627         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2628         struct ocfs2_extent_list *el;
2629         struct ocfs2_extent_block *eb;
2630
2631         el = path_leaf_el(left_path);
2632
2633         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2634
2635         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2636                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2637                         break;
2638
2639         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2640
2641         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2642         le16_add_cpu(&root_el->l_next_free_rec, -1);
2643
2644         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2645         eb->h_next_leaf_blk = 0;
2646
2647         ocfs2_journal_dirty(handle, root_bh);
2648         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2649
2650         ocfs2_unlink_path(handle, et, dealloc, right_path,
2651                           subtree_index + 1);
2652 }
2653
2654 static int ocfs2_rotate_subtree_left(handle_t *handle,
2655                                      struct ocfs2_extent_tree *et,
2656                                      struct ocfs2_path *left_path,
2657                                      struct ocfs2_path *right_path,
2658                                      int subtree_index,
2659                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2660                                      int *deleted)
2661 {
2662         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2663         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2664         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2665         struct ocfs2_extent_block *eb;
2666
2667         *deleted = 0;
2668
2669         right_leaf_el = path_leaf_el(right_path);
2670         left_leaf_el = path_leaf_el(left_path);
2671         root_bh = left_path->p_node[subtree_index].bh;
2672         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2673
2674         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2675                 return 0;
2676
2677         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2678         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2679                 /*
2680                  * It's legal for us to proceed if the right leaf is
2681                  * the rightmost one and it has an empty extent. There
2682                  * are two cases to handle - whether the leaf will be
2683                  * empty after removal or not. If the leaf isn't empty
2684                  * then just remove the empty extent up front. The
2685                  * next block will handle empty leaves by flagging
2686                  * them for unlink.
2687                  *
2688                  * Non rightmost leaves will throw -EAGAIN and the
2689                  * caller can manually move the subtree and retry.
2690                  */
2691
2692                 if (eb->h_next_leaf_blk != 0ULL)
2693                         return -EAGAIN;
2694
2695                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2696                         ret = ocfs2_journal_access_eb(handle, et->et_ci,
2697                                                       path_leaf_bh(right_path),
2698                                                       OCFS2_JOURNAL_ACCESS_WRITE);
2699                         if (ret) {
2700                                 mlog_errno(ret);
2701                                 goto out;
2702                         }
2703
2704                         ocfs2_remove_empty_extent(right_leaf_el);
2705                 } else
2706                         right_has_empty = 1;
2707         }
2708
2709         if (eb->h_next_leaf_blk == 0ULL &&
2710             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2711                 /*
2712                  * We have to update i_last_eb_blk during the meta
2713                  * data delete.
2714                  */
2715                 ret = ocfs2_et_root_journal_access(handle, et,
2716                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2717                 if (ret) {
2718                         mlog_errno(ret);
2719                         goto out;
2720                 }
2721
2722                 del_right_subtree = 1;
2723         }
2724
2725         /*
2726          * Getting here with an empty extent in the right path implies
2727          * that it's the rightmost path and will be deleted.
2728          */
2729         BUG_ON(right_has_empty && !del_right_subtree);
2730
2731         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2732                                            subtree_index);
2733         if (ret) {
2734                 mlog_errno(ret);
2735                 goto out;
2736         }
2737
2738         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2739                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2740                                                    right_path, i);
2741                 if (ret) {
2742                         mlog_errno(ret);
2743                         goto out;
2744                 }
2745
2746                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2747                                                    left_path, i);
2748                 if (ret) {
2749                         mlog_errno(ret);
2750                         goto out;
2751                 }
2752         }
2753
2754         if (!right_has_empty) {
2755                 /*
2756                  * Only do this if we're moving a real
2757                  * record. Otherwise, the action is delayed until
2758                  * after removal of the right path in which case we
2759                  * can do a simple shift to remove the empty extent.
2760                  */
2761                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2762                 memset(&right_leaf_el->l_recs[0], 0,
2763                        sizeof(struct ocfs2_extent_rec));
2764         }
2765         if (eb->h_next_leaf_blk == 0ULL) {
2766                 /*
2767                  * Move recs over to get rid of empty extent, decrease
2768                  * next_free. This is allowed to remove the last
2769                  * extent in our leaf (setting l_next_free_rec to
2770                  * zero) - the delete code below won't care.
2771                  */
2772                 ocfs2_remove_empty_extent(right_leaf_el);
2773         }
2774
2775         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2776         ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2777
2778         if (del_right_subtree) {
2779                 ocfs2_unlink_subtree(handle, et, left_path, right_path,
2780                                      subtree_index, dealloc);
2781                 ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
2782                                                 left_path);
2783                 if (ret) {
2784                         mlog_errno(ret);
2785                         goto out;
2786                 }
2787
2788                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2789                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2790
2791                 /*
2792                  * Removal of the extent in the left leaf was skipped
2793                  * above so we could delete the right path
2794                  * 1st.
2795                  */
2796                 if (right_has_empty)
2797                         ocfs2_remove_empty_extent(left_leaf_el);
2798
2799                 ocfs2_journal_dirty(handle, et_root_bh);
2800
2801                 *deleted = 1;
2802         } else
2803                 ocfs2_complete_edge_insert(handle, left_path, right_path,
2804                                            subtree_index);
2805
2806 out:
2807         return ret;
2808 }
2809
2810 /*
2811  * Given a full path, determine what cpos value would return us a path
2812  * containing the leaf immediately to the right of the current one.
2813  *
2814  * Will return zero if the path passed in is already the rightmost path.
2815  *
2816  * This looks similar, but is subtly different to
2817  * ocfs2_find_cpos_for_left_leaf().
2818  */
2819 int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2820                                    struct ocfs2_path *path, u32 *cpos)
2821 {
2822         int i, j, ret = 0;
2823         u64 blkno;
2824         struct ocfs2_extent_list *el;
2825
2826         *cpos = 0;
2827
2828         if (path->p_tree_depth == 0)
2829                 return 0;
2830
2831         blkno = path_leaf_bh(path)->b_blocknr;
2832
2833         /* Start at the tree node just above the leaf and work our way up. */
2834         i = path->p_tree_depth - 1;
2835         while (i >= 0) {
2836                 int next_free;
2837
2838                 el = path->p_node[i].el;
2839
2840                 /*
2841                  * Find the extent record just after the one in our
2842                  * path.
2843                  */
2844                 next_free = le16_to_cpu(el->l_next_free_rec);
2845                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2846                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2847                                 if (j == (next_free - 1)) {
2848                                         if (i == 0) {
2849                                                 /*
2850                                                  * We've determined that the
2851                                                  * path specified is already
2852                                                  * the rightmost one - return a
2853                                                  * cpos of zero.
2854                                                  */
2855                                                 goto out;
2856                                         }
2857                                         /*
2858                                          * The rightmost record points to our
2859                                          * leaf - we need to travel up the
2860                                          * tree one level.
2861                                          */
2862                                         goto next_node;
2863                                 }
2864
2865                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2866                                 goto out;
2867                         }
2868                 }
2869
2870                 /*
2871                  * If we got here, we never found a valid node where
2872                  * the tree indicated one should be.
2873                  */
2874                 ocfs2_error(sb,
2875                             "Invalid extent tree at extent block %llu\n",
2876                             (unsigned long long)blkno);
2877                 ret = -EROFS;
2878                 goto out;
2879
2880 next_node:
2881                 blkno = path->p_node[i].bh->b_blocknr;
2882                 i--;
2883         }
2884
2885 out:
2886         return ret;
2887 }
2888
2889 static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2890                                             struct ocfs2_extent_tree *et,
2891                                             struct ocfs2_path *path)
2892 {
2893         int ret;
2894         struct buffer_head *bh = path_leaf_bh(path);
2895         struct ocfs2_extent_list *el = path_leaf_el(path);
2896
2897         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2898                 return 0;
2899
2900         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
2901                                            path_num_items(path) - 1);
2902         if (ret) {
2903                 mlog_errno(ret);
2904                 goto out;
2905         }
2906
2907         ocfs2_remove_empty_extent(el);
2908         ocfs2_journal_dirty(handle, bh);
2909
2910 out:
2911         return ret;
2912 }
2913
2914 static int __ocfs2_rotate_tree_left(handle_t *handle,
2915                                     struct ocfs2_extent_tree *et,
2916                                     int orig_credits,
2917                                     struct ocfs2_path *path,
2918                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2919                                     struct ocfs2_path **empty_extent_path)
2920 {
2921         int ret, subtree_root, deleted;
2922         u32 right_cpos;
2923         struct ocfs2_path *left_path = NULL;
2924         struct ocfs2_path *right_path = NULL;
2925         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2926
2927         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2928
2929         *empty_extent_path = NULL;
2930
2931         ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
2932         if (ret) {
2933                 mlog_errno(ret);
2934                 goto out;
2935         }
2936
2937         left_path = ocfs2_new_path_from_path(path);
2938         if (!left_path) {
2939                 ret = -ENOMEM;
2940                 mlog_errno(ret);
2941                 goto out;
2942         }
2943
2944         ocfs2_cp_path(left_path, path);
2945
2946         right_path = ocfs2_new_path_from_path(path);
2947         if (!right_path) {
2948                 ret = -ENOMEM;
2949                 mlog_errno(ret);
2950                 goto out;
2951         }
2952
2953         while (right_cpos) {
2954                 ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
2955                 if (ret) {
2956                         mlog_errno(ret);
2957                         goto out;
2958                 }
2959
2960                 subtree_root = ocfs2_find_subtree_root(et, left_path,
2961                                                        right_path);
2962
2963                 trace_ocfs2_rotate_subtree(subtree_root,
2964                      (unsigned long long)
2965                      right_path->p_node[subtree_root].bh->b_blocknr,
2966                      right_path->p_tree_depth);
2967
2968                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2969                                                       orig_credits, left_path);
2970                 if (ret) {
2971                         mlog_errno(ret);
2972                         goto out;
2973                 }
2974
2975                 /*
2976                  * Caller might still want to make changes to the
2977                  * tree root, so re-add it to the journal here.
2978                  */
2979                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2980                                                    left_path, 0);
2981                 if (ret) {
2982                         mlog_errno(ret);
2983                         goto out;
2984                 }
2985
2986                 ret = ocfs2_rotate_subtree_left(handle, et, left_path,
2987                                                 right_path, subtree_root,
2988                                                 dealloc, &deleted);
2989                 if (ret == -EAGAIN) {
2990                         /*
2991                          * The rotation has to temporarily stop due to
2992                          * the right subtree having an empty
2993                          * extent. Pass it back to the caller for a
2994                          * fixup.
2995                          */
2996                         *empty_extent_path = right_path;
2997                         right_path = NULL;
2998                         goto out;
2999                 }
3000                 if (ret) {
3001                         mlog_errno(ret);
3002                         goto out;
3003                 }
3004
3005                 /*
3006                  * The subtree rotate might have removed records on
3007                  * the rightmost edge. If so, then rotation is
3008                  * complete.
3009                  */
3010                 if (deleted)
3011                         break;
3012
3013                 ocfs2_mv_path(left_path, right_path);
3014
3015                 ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
3016                                                      &right_cpos);
3017                 if (ret) {
3018                         mlog_errno(ret);
3019                         goto out;
3020                 }
3021         }
3022
3023 out:
3024         ocfs2_free_path(right_path);
3025         ocfs2_free_path(left_path);
3026
3027         return ret;
3028 }
3029
3030 static int ocfs2_remove_rightmost_path(handle_t *handle,
3031                                 struct ocfs2_extent_tree *et,
3032                                 struct ocfs2_path *path,
3033                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
3034 {
3035         int ret, subtree_index;
3036         u32 cpos;
3037         struct ocfs2_path *left_path = NULL;
3038         struct ocfs2_extent_block *eb;
3039         struct ocfs2_extent_list *el;
3040
3041
3042         ret = ocfs2_et_sanity_check(et);
3043         if (ret)
3044                 goto out;
3045         /*
3046          * There's two ways we handle this depending on
3047          * whether path is the only existing one.
3048          */
3049         ret = ocfs2_extend_rotate_transaction(handle, 0,
3050                                               handle->h_buffer_credits,
3051                                               path);
3052         if (ret) {
3053                 mlog_errno(ret);
3054                 goto out;
3055         }
3056
3057         ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3058         if (ret) {
3059                 mlog_errno(ret);
3060                 goto out;
3061         }
3062
3063         ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3064                                             path, &cpos);
3065         if (ret) {
3066                 mlog_errno(ret);
3067                 goto out;
3068         }
3069
3070         if (cpos) {
3071                 /*
3072                  * We have a path to the left of this one - it needs
3073                  * an update too.
3074                  */
3075                 left_path = ocfs2_new_path_from_path(path);
3076                 if (!left_path) {
3077                         ret = -ENOMEM;
3078                         mlog_errno(ret);
3079                         goto out;
3080                 }
3081
3082                 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
3083                 if (ret) {
3084                         mlog_errno(ret);
3085                         goto out;
3086                 }
3087
3088                 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3089                 if (ret) {
3090                         mlog_errno(ret);
3091                         goto out;
3092                 }
3093
3094                 subtree_index = ocfs2_find_subtree_root(et, left_path, path);
3095
3096                 ocfs2_unlink_subtree(handle, et, left_path, path,
3097                                      subtree_index, dealloc);
3098                 ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
3099                                                 left_path);
3100                 if (ret) {
3101                         mlog_errno(ret);
3102                         goto out;
3103                 }
3104
3105                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
3106                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
3107         } else {
3108                 /*
3109                  * 'path' is also the leftmost path which
3110                  * means it must be the only one. This gets
3111                  * handled differently because we want to
3112                  * revert the root back to having extents
3113                  * in-line.
3114                  */
3115                 ocfs2_unlink_path(handle, et, dealloc, path, 1);
3116
3117                 el = et->et_root_el;
3118                 el->l_tree_depth = 0;
3119                 el->l_next_free_rec = 0;
3120                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3121
3122                 ocfs2_et_set_last_eb_blk(et, 0);
3123         }
3124
3125         ocfs2_journal_dirty(handle, path_root_bh(path));
3126
3127 out:
3128         ocfs2_free_path(left_path);
3129         return ret;
3130 }
3131
3132 /*
3133  * Left rotation of btree records.
3134  *
3135  * In many ways, this is (unsurprisingly) the opposite of right
3136  * rotation. We start at some non-rightmost path containing an empty
3137  * extent in the leaf block. The code works its way to the rightmost
3138  * path by rotating records to the left in every subtree.
3139  *
3140  * This is used by any code which reduces the number of extent records
3141  * in a leaf. After removal, an empty record should be placed in the
3142  * leftmost list position.
3143  *
3144  * This won't handle a length update of the rightmost path records if
3145  * the rightmost tree leaf record is removed so the caller is
3146  * responsible for detecting and correcting that.
3147  */
3148 static int ocfs2_rotate_tree_left(handle_t *handle,
3149                                   struct ocfs2_extent_tree *et,
3150                                   struct ocfs2_path *path,
3151                                   struct ocfs2_cached_dealloc_ctxt *dealloc)
3152 {
3153         int ret, orig_credits = handle->h_buffer_credits;
3154         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
3155         struct ocfs2_extent_block *eb;
3156         struct ocfs2_extent_list *el;
3157
3158         el = path_leaf_el(path);
3159         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
3160                 return 0;
3161
3162         if (path->p_tree_depth == 0) {
3163 rightmost_no_delete:
3164                 /*
3165                  * Inline extents. This is trivially handled, so do
3166                  * it up front.
3167                  */
3168                 ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
3169                 if (ret)
3170                         mlog_errno(ret);
3171                 goto out;
3172         }
3173
3174         /*
3175          * Handle rightmost branch now. There's several cases:
3176          *  1) simple rotation leaving records in there. That's trivial.
3177          *  2) rotation requiring a branch delete - there's no more
3178          *     records left. Two cases of this:
3179          *     a) There are branches to the left.
3180          *     b) This is also the leftmost (the only) branch.
3181          *
3182          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
3183          *  2a) we need the left branch so that we can update it with the unlink
3184          *  2b) we need to bring the root back to inline extents.
3185          */
3186
3187         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
3188         el = &eb->h_list;
3189         if (eb->h_next_leaf_blk == 0) {
3190                 /*
3191                  * This gets a bit tricky if we're going to delete the
3192                  * rightmost path. Get the other cases out of the way
3193                  * 1st.
3194                  */
3195                 if (le16_to_cpu(el->l_next_free_rec) > 1)
3196                         goto rightmost_no_delete;
3197
3198                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3199                         ret = -EIO;
3200                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3201                                     "Owner %llu has empty extent block at %llu",
3202                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3203                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
3204                         goto out;
3205                 }
3206
3207                 /*
3208                  * XXX: The caller can not trust "path" any more after
3209                  * this as it will have been deleted. What do we do?
3210                  *
3211                  * In theory the rotate-for-merge code will never get
3212                  * here because it'll always ask for a rotate in a
3213                  * nonempty list.
3214                  */
3215
3216                 ret = ocfs2_remove_rightmost_path(handle, et, path,
3217                                                   dealloc);
3218                 if (ret)
3219                         mlog_errno(ret);
3220                 goto out;
3221         }
3222
3223         /*
3224          * Now we can loop, remembering the path we get from -EAGAIN
3225          * and restarting from there.
3226          */
3227 try_rotate:
3228         ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
3229                                        dealloc, &restart_path);
3230         if (ret && ret != -EAGAIN) {
3231                 mlog_errno(ret);
3232                 goto out;
3233         }
3234
3235         while (ret == -EAGAIN) {
3236                 tmp_path = restart_path;
3237                 restart_path = NULL;
3238
3239                 ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
3240                                                tmp_path, dealloc,
3241                                                &restart_path);
3242                 if (ret && ret != -EAGAIN) {
3243                         mlog_errno(ret);
3244                         goto out;
3245                 }
3246
3247                 ocfs2_free_path(tmp_path);
3248                 tmp_path = NULL;
3249
3250                 if (ret == 0)
3251                         goto try_rotate;
3252         }
3253
3254 out:
3255         ocfs2_free_path(tmp_path);
3256         ocfs2_free_path(restart_path);
3257         return ret;
3258 }
3259
3260 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3261                                 int index)
3262 {
3263         struct ocfs2_extent_rec *rec = &el->l_recs[index];
3264         unsigned int size;
3265
3266         if (rec->e_leaf_clusters == 0) {
3267                 /*
3268                  * We consumed all of the merged-from record. An empty
3269                  * extent cannot exist anywhere but the 1st array
3270                  * position, so move things over if the merged-from
3271                  * record doesn't occupy that position.
3272                  *
3273                  * This creates a new empty extent so the caller
3274                  * should be smart enough to have removed any existing
3275                  * ones.
3276                  */
3277                 if (index > 0) {
3278                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3279                         size = index * sizeof(struct ocfs2_extent_rec);
3280                         memmove(&el->l_recs[1], &el->l_recs[0], size);
3281                 }
3282
3283                 /*
3284                  * Always memset - the caller doesn't check whether it
3285                  * created an empty extent, so there could be junk in
3286                  * the other fields.
3287                  */
3288                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3289         }
3290 }
3291
3292 static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
3293                                 struct ocfs2_path *left_path,
3294                                 struct ocfs2_path **ret_right_path)
3295 {
3296         int ret;
3297         u32 right_cpos;
3298         struct ocfs2_path *right_path = NULL;
3299         struct ocfs2_extent_list *left_el;
3300
3301         *ret_right_path = NULL;
3302
3303         /* This function shouldn't be called for non-trees. */
3304         BUG_ON(left_path->p_tree_depth == 0);
3305
3306         left_el = path_leaf_el(left_path);
3307         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3308
3309         ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3310                                              left_path, &right_cpos);
3311         if (ret) {
3312                 mlog_errno(ret);
3313                 goto out;
3314         }
3315
3316         /* This function shouldn't be called for the rightmost leaf. */
3317         BUG_ON(right_cpos == 0);
3318
3319         right_path = ocfs2_new_path_from_path(left_path);
3320         if (!right_path) {
3321                 ret = -ENOMEM;
3322                 mlog_errno(ret);
3323                 goto out;
3324         }
3325
3326         ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3327         if (ret) {
3328                 mlog_errno(ret);
3329                 goto out;
3330         }
3331
3332         *ret_right_path = right_path;
3333 out:
3334         if (ret)
3335                 ocfs2_free_path(right_path);
3336         return ret;
3337 }
3338
3339 /*
3340  * Remove split_rec clusters from the record at index and merge them
3341  * onto the beginning of the record "next" to it.
3342  * For index < l_count - 1, the next means the extent rec at index + 1.
3343  * For index == l_count - 1, the "next" means the 1st extent rec of the
3344  * next extent block.
3345  */
3346 static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3347                                  handle_t *handle,
3348                                  struct ocfs2_extent_tree *et,
3349                                  struct ocfs2_extent_rec *split_rec,
3350                                  int index)
3351 {
3352         int ret, next_free, i;
3353         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3354         struct ocfs2_extent_rec *left_rec;
3355         struct ocfs2_extent_rec *right_rec;
3356         struct ocfs2_extent_list *right_el;
3357         struct ocfs2_path *right_path = NULL;
3358         int subtree_index = 0;
3359         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3360         struct buffer_head *bh = path_leaf_bh(left_path);
3361         struct buffer_head *root_bh = NULL;
3362
3363         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3364         left_rec = &el->l_recs[index];
3365
3366         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3367             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3368                 /* we meet with a cross extent block merge. */
3369                 ret = ocfs2_get_right_path(et, left_path, &right_path);
3370                 if (ret) {
3371                         mlog_errno(ret);
3372                         goto out;
3373                 }
3374
3375                 right_el = path_leaf_el(right_path);
3376                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3377                 BUG_ON(next_free <= 0);
3378                 right_rec = &right_el->l_recs[0];
3379                 if (ocfs2_is_empty_extent(right_rec)) {
3380                         BUG_ON(next_free <= 1);
3381                         right_rec = &right_el->l_recs[1];
3382                 }
3383
3384                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3385                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3386                        le32_to_cpu(right_rec->e_cpos));
3387
3388                 subtree_index = ocfs2_find_subtree_root(et, left_path,
3389                                                         right_path);
3390
3391                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3392                                                       handle->h_buffer_credits,
3393                                                       right_path);
3394                 if (ret) {
3395                         mlog_errno(ret);
3396                         goto out;
3397                 }
3398
3399                 root_bh = left_path->p_node[subtree_index].bh;
3400                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3401
3402                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3403                                                    subtree_index);
3404                 if (ret) {
3405                         mlog_errno(ret);
3406                         goto out;
3407                 }
3408
3409                 for (i = subtree_index + 1;
3410                      i < path_num_items(right_path); i++) {
3411                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3412                                                            right_path, i);
3413                         if (ret) {
3414                                 mlog_errno(ret);
3415                                 goto out;
3416                         }
3417
3418                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3419                                                            left_path, i);
3420                         if (ret) {
3421                                 mlog_errno(ret);
3422                                 goto out;
3423                         }
3424                 }
3425
3426         } else {
3427                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3428                 right_rec = &el->l_recs[index + 1];
3429         }
3430
3431         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
3432                                            path_num_items(left_path) - 1);
3433         if (ret) {
3434                 mlog_errno(ret);
3435                 goto out;
3436         }
3437
3438         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3439
3440         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3441         le64_add_cpu(&right_rec->e_blkno,
3442                      -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3443                                                split_clusters));
3444         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3445
3446         ocfs2_cleanup_merge(el, index);
3447
3448         ocfs2_journal_dirty(handle, bh);
3449         if (right_path) {
3450                 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3451                 ocfs2_complete_edge_insert(handle, left_path, right_path,
3452                                            subtree_index);
3453         }
3454 out:
3455         if (right_path)
3456                 ocfs2_free_path(right_path);
3457         return ret;
3458 }
3459
3460 static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
3461                                struct ocfs2_path *right_path,
3462                                struct ocfs2_path **ret_left_path)
3463 {
3464         int ret;
3465         u32 left_cpos;
3466         struct ocfs2_path *left_path = NULL;
3467
3468         *ret_left_path = NULL;
3469
3470         /* This function shouldn't be called for non-trees. */
3471         BUG_ON(right_path->p_tree_depth == 0);
3472
3473         ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3474                                             right_path, &left_cpos);
3475         if (ret) {
3476                 mlog_errno(ret);
3477                 goto out;
3478         }
3479
3480         /* This function shouldn't be called for the leftmost leaf. */
3481         BUG_ON(left_cpos == 0);
3482
3483         left_path = ocfs2_new_path_from_path(right_path);
3484         if (!left_path) {
3485                 ret = -ENOMEM;
3486                 mlog_errno(ret);
3487                 goto out;
3488         }
3489
3490         ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
3491         if (ret) {
3492                 mlog_errno(ret);
3493                 goto out;
3494         }
3495
3496         *ret_left_path = left_path;
3497 out:
3498         if (ret)
3499                 ocfs2_free_path(left_path);
3500         return ret;
3501 }
3502
3503 /*
3504  * Remove split_rec clusters from the record at index and merge them
3505  * onto the tail of the record "before" it.
3506  * For index > 0, the "before" means the extent rec at index - 1.
3507  *
3508  * For index == 0, the "before" means the last record of the previous
3509  * extent block. And there is also a situation that we may need to
3510  * remove the rightmost leaf extent block in the right_path and change
3511  * the right path to indicate the new rightmost path.
3512  */
3513 static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3514                                 handle_t *handle,
3515                                 struct ocfs2_extent_tree *et,
3516                                 struct ocfs2_extent_rec *split_rec,
3517                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3518                                 int index)
3519 {
3520         int ret, i, subtree_index = 0, has_empty_extent = 0;
3521         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3522         struct ocfs2_extent_rec *left_rec;
3523         struct ocfs2_extent_rec *right_rec;
3524         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3525         struct buffer_head *bh = path_leaf_bh(right_path);
3526         struct buffer_head *root_bh = NULL;
3527         struct ocfs2_path *left_path = NULL;
3528         struct ocfs2_extent_list *left_el;
3529
3530         BUG_ON(index < 0);
3531
3532         right_rec = &el->l_recs[index];
3533         if (index == 0) {
3534                 /* we meet with a cross extent block merge. */
3535                 ret = ocfs2_get_left_path(et, right_path, &left_path);
3536                 if (ret) {
3537                         mlog_errno(ret);
3538                         goto out;
3539                 }
3540
3541                 left_el = path_leaf_el(left_path);
3542                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3543                        le16_to_cpu(left_el->l_count));
3544
3545                 left_rec = &left_el->l_recs[
3546                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3547                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3548                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3549                        le32_to_cpu(split_rec->e_cpos));
3550
3551                 subtree_index = ocfs2_find_subtree_root(et, left_path,
3552                                                         right_path);
3553
3554                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3555                                                       handle->h_buffer_credits,
3556                                                       left_path);
3557                 if (ret) {
3558                         mlog_errno(ret);
3559                         goto out;
3560                 }
3561
3562                 root_bh = left_path->p_node[subtree_index].bh;
3563                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3564
3565                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3566                                                    subtree_index);
3567                 if (ret) {
3568                         mlog_errno(ret);
3569                         goto out;
3570                 }
3571
3572                 for (i = subtree_index + 1;
3573                      i < path_num_items(right_path); i++) {
3574                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3575                                                            right_path, i);
3576                         if (ret) {
3577                                 mlog_errno(ret);
3578                                 goto out;
3579                         }
3580
3581                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3582                                                            left_path, i);
3583                         if (ret) {
3584                                 mlog_errno(ret);
3585                                 goto out;
3586                         }
3587                 }
3588         } else {
3589                 left_rec = &el->l_recs[index - 1];
3590                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3591                         has_empty_extent = 1;
3592         }
3593
3594         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3595                                            path_num_items(right_path) - 1);
3596         if (ret) {
3597                 mlog_errno(ret);
3598                 goto out;
3599         }
3600
3601         if (has_empty_extent && index == 1) {
3602                 /*
3603                  * The easy case - we can just plop the record right in.
3604                  */
3605                 *left_rec = *split_rec;
3606
3607                 has_empty_extent = 0;
3608         } else
3609                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3610
3611         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3612         le64_add_cpu(&right_rec->e_blkno,
3613                      ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3614                                               split_clusters));
3615         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3616
3617         ocfs2_cleanup_merge(el, index);
3618
3619         ocfs2_journal_dirty(handle, bh);
3620         if (left_path) {
3621                 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3622
3623                 /*
3624                  * In the situation that the right_rec is empty and the extent
3625                  * block is empty also,  ocfs2_complete_edge_insert can't handle
3626                  * it and we need to delete the right extent block.
3627                  */
3628                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3629                     le16_to_cpu(el->l_next_free_rec) == 1) {
3630
3631                         ret = ocfs2_remove_rightmost_path(handle, et,
3632                                                           right_path,
3633                                                           dealloc);
3634                         if (ret) {
3635                                 mlog_errno(ret);
3636                                 goto out;
3637                         }
3638
3639                         /* Now the rightmost extent block has been deleted.
3640                          * So we use the new rightmost path.
3641                          */
3642                         ocfs2_mv_path(right_path, left_path);
3643                         left_path = NULL;
3644                 } else
3645                         ocfs2_complete_edge_insert(handle, left_path,
3646                                                    right_path, subtree_index);
3647         }
3648 out:
3649         if (left_path)
3650                 ocfs2_free_path(left_path);
3651         return ret;
3652 }
3653
3654 static int ocfs2_try_to_merge_extent(handle_t *handle,
3655                                      struct ocfs2_extent_tree *et,
3656                                      struct ocfs2_path *path,
3657                                      int split_index,
3658                                      struct ocfs2_extent_rec *split_rec,
3659                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3660                                      struct ocfs2_merge_ctxt *ctxt)
3661 {
3662         int ret = 0;
3663         struct ocfs2_extent_list *el = path_leaf_el(path);
3664         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3665
3666         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3667
3668         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3669                 /*
3670                  * The merge code will need to create an empty
3671                  * extent to take the place of the newly
3672                  * emptied slot. Remove any pre-existing empty
3673                  * extents - having more than one in a leaf is
3674                  * illegal.
3675                  */
3676                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3677                 if (ret) {
3678                         mlog_errno(ret);
3679                         goto out;
3680                 }
3681                 split_index--;
3682                 rec = &el->l_recs[split_index];
3683         }
3684
3685         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3686                 /*
3687                  * Left-right contig implies this.
3688                  */
3689                 BUG_ON(!ctxt->c_split_covers_rec);
3690
3691                 /*
3692                  * Since the leftright insert always covers the entire
3693                  * extent, this call will delete the insert record
3694                  * entirely, resulting in an empty extent record added to
3695                  * the extent block.
3696                  *
3697                  * Since the adding of an empty extent shifts
3698                  * everything back to the right, there's no need to
3699                  * update split_index here.
3700                  *
3701                  * When the split_index is zero, we need to merge it to the
3702                  * prevoius extent block. It is more efficient and easier
3703                  * if we do merge_right first and merge_left later.
3704                  */
3705                 ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
3706                                             split_index);
3707                 if (ret) {
3708                         mlog_errno(ret);
3709                         goto out;
3710                 }
3711
3712                 /*
3713                  * We can only get this from logic error above.
3714                  */
3715                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3716
3717                 /* The merge left us with an empty extent, remove it. */
3718                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3719                 if (ret) {
3720                         mlog_errno(ret);
3721                         goto out;
3722                 }
3723
3724                 rec = &el->l_recs[split_index];
3725
3726                 /*
3727                  * Note that we don't pass split_rec here on purpose -
3728                  * we've merged it into the rec already.
3729                  */
3730                 ret = ocfs2_merge_rec_left(path, handle, et, rec,
3731                                            dealloc, split_index);
3732
3733                 if (ret) {
3734                         mlog_errno(ret);
3735                         goto out;
3736                 }
3737
3738                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3739                 /*
3740                  * Error from this last rotate is not critical, so
3741                  * print but don't bubble it up.
3742                  */
3743                 if (ret)
3744                         mlog_errno(ret);
3745                 ret = 0;
3746         } else {
3747                 /*
3748                  * Merge a record to the left or right.
3749                  *
3750                  * 'contig_type' is relative to the existing record,
3751                  * so for example, if we're "right contig", it's to
3752                  * the record on the left (hence the left merge).
3753                  */
3754                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3755                         ret = ocfs2_merge_rec_left(path, handle, et,
3756                                                    split_rec, dealloc,
3757                                                    split_index);
3758                         if (ret) {
3759                                 mlog_errno(ret);
3760                                 goto out;
3761                         }
3762                 } else {
3763                         ret = ocfs2_merge_rec_right(path, handle,
3764                                                     et, split_rec,
3765                                                     split_index);
3766                         if (ret) {
3767                                 mlog_errno(ret);
3768                                 goto out;
3769                         }
3770                 }
3771
3772                 if (ctxt->c_split_covers_rec) {
3773                         /*
3774                          * The merge may have left an empty extent in
3775                          * our leaf. Try to rotate it away.
3776                          */
3777                         ret = ocfs2_rotate_tree_left(handle, et, path,
3778                                                      dealloc);
3779                         if (ret)
3780                                 mlog_errno(ret);
3781                         ret = 0;
3782                 }
3783         }
3784
3785 out:
3786         return ret;
3787 }
3788
3789 static void ocfs2_subtract_from_rec(struct super_block *sb,
3790                                     enum ocfs2_split_type split,
3791                                     struct ocfs2_extent_rec *rec,
3792                                     struct ocfs2_extent_rec *split_rec)
3793 {
3794         u64 len_blocks;
3795
3796         len_blocks = ocfs2_clusters_to_blocks(sb,
3797                                 le16_to_cpu(split_rec->e_leaf_clusters));
3798
3799         if (split == SPLIT_LEFT) {
3800                 /*
3801                  * Region is on the left edge of the existing
3802                  * record.
3803                  */
3804                 le32_add_cpu(&rec->e_cpos,
3805                              le16_to_cpu(split_rec->e_leaf_clusters));
3806                 le64_add_cpu(&rec->e_blkno, len_blocks);
3807                 le16_add_cpu(&rec->e_leaf_clusters,
3808                              -le16_to_cpu(split_rec->e_leaf_clusters));
3809         } else {
3810                 /*
3811                  * Region is on the right edge of the existing
3812                  * record.
3813                  */
3814                 le16_add_cpu(&rec->e_leaf_clusters,
3815                              -le16_to_cpu(split_rec->e_leaf_clusters));
3816         }
3817 }
3818
3819 /*
3820  * Do the final bits of extent record insertion at the target leaf
3821  * list. If this leaf is part of an allocation tree, it is assumed
3822  * that the tree above has been prepared.
3823  */
3824 static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
3825                                  struct ocfs2_extent_rec *insert_rec,
3826                                  struct ocfs2_extent_list *el,
3827                                  struct ocfs2_insert_type *insert)
3828 {
3829         int i = insert->ins_contig_index;
3830         unsigned int range;
3831         struct ocfs2_extent_rec *rec;
3832
3833         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3834
3835         if (insert->ins_split != SPLIT_NONE) {
3836                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3837                 BUG_ON(i == -1);
3838                 rec = &el->l_recs[i];
3839                 ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
3840                                         insert->ins_split, rec,
3841                                         insert_rec);
3842                 goto rotate;
3843         }
3844
3845         /*
3846          * Contiguous insert - either left or right.
3847          */
3848         if (insert->ins_contig != CONTIG_NONE) {
3849                 rec = &el->l_recs[i];
3850                 if (insert->ins_contig == CONTIG_LEFT) {
3851                         rec->e_blkno = insert_rec->e_blkno;
3852                         rec->e_cpos = insert_rec->e_cpos;
3853                 }
3854                 le16_add_cpu(&rec->e_leaf_clusters,
3855                              le16_to_cpu(insert_rec->e_leaf_clusters));
3856                 return;
3857         }
3858
3859         /*
3860          * Handle insert into an empty leaf.
3861          */
3862         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3863             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3864              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3865                 el->l_recs[0] = *insert_rec;
3866                 el->l_next_free_rec = cpu_to_le16(1);
3867                 return;
3868         }
3869
3870         /*
3871          * Appending insert.
3872          */
3873         if (insert->ins_appending == APPEND_TAIL) {
3874                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3875                 rec = &el->l_recs[i];
3876                 range = le32_to_cpu(rec->e_cpos)
3877                         + le16_to_cpu(rec->e_leaf_clusters);
3878                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3879
3880                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3881                                 le16_to_cpu(el->l_count),
3882                                 "owner %llu, depth %u, count %u, next free %u, "
3883                                 "rec.cpos %u, rec.clusters %u, "
3884                                 "insert.cpos %u, insert.clusters %u\n",
3885                                 ocfs2_metadata_cache_owner(et->et_ci),
3886                                 le16_to_cpu(el->l_tree_depth),
3887                                 le16_to_cpu(el->l_count),
3888                                 le16_to_cpu(el->l_next_free_rec),
3889                                 le32_to_cpu(el->l_recs[i].e_cpos),
3890                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3891                                 le32_to_cpu(insert_rec->e_cpos),
3892                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3893                 i++;
3894                 el->l_recs[i] = *insert_rec;
3895                 le16_add_cpu(&el->l_next_free_rec, 1);
3896                 return;
3897         }
3898
3899 rotate:
3900         /*
3901          * Ok, we have to rotate.
3902          *
3903          * At this point, it is safe to assume that inserting into an
3904          * empty leaf and appending to a leaf have both been handled
3905          * above.
3906          *
3907          * This leaf needs to have space, either by the empty 1st
3908          * extent record, or by virtue of an l_next_rec < l_count.
3909          */
3910         ocfs2_rotate_leaf(el, insert_rec);
3911 }
3912
3913 static void ocfs2_adjust_rightmost_records(handle_t *handle,
3914                                            struct ocfs2_extent_tree *et,
3915                                            struct ocfs2_path *path,
3916                                            struct ocfs2_extent_rec *insert_rec)
3917 {
3918         int ret, i, next_free;
3919         struct buffer_head *bh;
3920         struct ocfs2_extent_list *el;
3921         struct ocfs2_extent_rec *rec;
3922
3923         /*
3924          * Update everything except the leaf block.
3925          */
3926         for (i = 0; i < path->p_tree_depth; i++) {
3927                 bh = path->p_node[i].bh;
3928                 el = path->p_node[i].el;
3929
3930                 next_free = le16_to_cpu(el->l_next_free_rec);
3931                 if (next_free == 0) {
3932                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3933                                     "Owner %llu has a bad extent list",
3934                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3935                         ret = -EIO;
3936                         return;
3937                 }
3938
3939                 rec = &el->l_recs[next_free - 1];
3940
3941                 rec->e_int_clusters = insert_rec->e_cpos;
3942                 le32_add_cpu(&rec->e_int_clusters,
3943                              le16_to_cpu(insert_rec->e_leaf_clusters));
3944                 le32_add_cpu(&rec->e_int_clusters,
3945                              -le32_to_cpu(rec->e_cpos));
3946
3947                 ocfs2_journal_dirty(handle, bh);
3948         }
3949 }
3950
3951 static int ocfs2_append_rec_to_path(handle_t *handle,
3952                                     struct ocfs2_extent_tree *et,
3953                                     struct ocfs2_extent_rec *insert_rec,
3954                                     struct ocfs2_path *right_path,
3955                                     struct ocfs2_path **ret_left_path)
3956 {
3957         int ret, next_free;
3958         struct ocfs2_extent_list *el;
3959         struct ocfs2_path *left_path = NULL;
3960
3961         *ret_left_path = NULL;
3962
3963         /*
3964          * This shouldn't happen for non-trees. The extent rec cluster
3965          * count manipulation below only works for interior nodes.
3966          */
3967         BUG_ON(right_path->p_tree_depth == 0);
3968
3969         /*
3970          * If our appending insert is at the leftmost edge of a leaf,
3971          * then we might need to update the rightmost records of the
3972          * neighboring path.
3973          */
3974         el = path_leaf_el(right_path);
3975         next_free = le16_to_cpu(el->l_next_free_rec);
3976         if (next_free == 0 ||
3977             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3978                 u32 left_cpos;
3979
3980                 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3981                                                     right_path, &left_cpos);
3982                 if (ret) {
3983                         mlog_errno(ret);
3984                         goto out;
3985                 }
3986
3987                 trace_ocfs2_append_rec_to_path(
3988                         (unsigned long long)
3989                         ocfs2_metadata_cache_owner(et->et_ci),
3990                         le32_to_cpu(insert_rec->e_cpos),
3991                         left_cpos);
3992
3993                 /*
3994                  * No need to worry if the append is already in the
3995                  * leftmost leaf.
3996                  */
3997                 if (left_cpos) {
3998                         left_path = ocfs2_new_path_from_path(right_path);
3999                         if (!left_path) {
4000                                 ret = -ENOMEM;
4001                                 mlog_errno(ret);
4002                                 goto out;
4003                         }
4004
4005                         ret = ocfs2_find_path(et->et_ci, left_path,
4006                                               left_cpos);
4007                         if (ret) {
4008                                 mlog_errno(ret);
4009                                 goto out;
4010                         }
4011
4012                         /*
4013                          * ocfs2_insert_path() will pass the left_path to the
4014                          * journal for us.
4015                          */
4016                 }
4017         }
4018
4019         ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4020         if (ret) {
4021                 mlog_errno(ret);
4022                 goto out;
4023         }
4024
4025         ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
4026
4027         *ret_left_path = left_path;
4028         ret = 0;
4029 out:
4030         if (ret != 0)
4031                 ocfs2_free_path(left_path);
4032
4033         return ret;
4034 }
4035
4036 static void ocfs2_split_record(struct ocfs2_extent_tree *et,
4037                                struct ocfs2_path *left_path,
4038                                struct ocfs2_path *right_path,
4039                                struct ocfs2_extent_rec *split_rec,
4040                                enum ocfs2_split_type split)
4041 {
4042         int index;
4043         u32 cpos = le32_to_cpu(split_rec->e_cpos);
4044         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
4045         struct ocfs2_extent_rec *rec, *tmprec;
4046
4047         right_el = path_leaf_el(right_path);
4048         if (left_path)
4049                 left_el = path_leaf_el(left_path);
4050
4051         el = right_el;
4052         insert_el = right_el;
4053         index = ocfs2_search_extent_list(el, cpos);
4054         if (index != -1) {
4055                 if (index == 0 && left_path) {
4056                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
4057
4058                         /*
4059                          * This typically means that the record
4060                          * started in the left path but moved to the
4061                          * right as a result of rotation. We either
4062                          * move the existing record to the left, or we
4063                          * do the later insert there.
4064                          *
4065                          * In this case, the left path should always
4066                          * exist as the rotate code will have passed
4067                          * it back for a post-insert update.
4068                          */
4069
4070                         if (split == SPLIT_LEFT) {
4071                                 /*
4072                                  * It's a left split. Since we know
4073                                  * that the rotate code gave us an
4074                                  * empty extent in the left path, we
4075                                  * can just do the insert there.
4076                                  */
4077                                 insert_el = left_el;
4078                         } else {
4079                                 /*
4080                                  * Right split - we have to move the
4081                                  * existing record over to the left
4082                                  * leaf. The insert will be into the
4083                                  * newly created empty extent in the
4084                                  * right leaf.
4085                                  */
4086                                 tmprec = &right_el->l_recs[index];
4087                                 ocfs2_rotate_leaf(left_el, tmprec);
4088                                 el = left_el;
4089
4090                                 memset(tmprec, 0, sizeof(*tmprec));
4091                                 index = ocfs2_search_extent_list(left_el, cpos);
4092                                 BUG_ON(index == -1);
4093                         }
4094                 }
4095         } else {
4096                 BUG_ON(!left_path);
4097                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
4098                 /*
4099                  * Left path is easy - we can just allow the insert to
4100                  * happen.
4101                  */
4102                 el = left_el;
4103                 insert_el = left_el;
4104                 index = ocfs2_search_extent_list(el, cpos);
4105                 BUG_ON(index == -1);
4106         }
4107
4108         rec = &el->l_recs[index];
4109         ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4110                                 split, rec, split_rec);
4111         ocfs2_rotate_leaf(insert_el, split_rec);
4112 }
4113
4114 /*
4115  * This function only does inserts on an allocation b-tree. For tree
4116  * depth = 0, ocfs2_insert_at_leaf() is called directly.
4117  *
4118  * right_path is the path we want to do the actual insert
4119  * in. left_path should only be passed in if we need to update that
4120  * portion of the tree after an edge insert.
4121  */
4122 static int ocfs2_insert_path(handle_t *handle,
4123                              struct ocfs2_extent_tree *et,
4124                              struct ocfs2_path *left_path,
4125                              struct ocfs2_path *right_path,
4126                              struct ocfs2_extent_rec *insert_rec,
4127                              struct ocfs2_insert_type *insert)
4128 {
4129         int ret, subtree_index;
4130         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4131
4132         if (left_path) {
4133                 /*
4134                  * There's a chance that left_path got passed back to
4135                  * us without being accounted for in the
4136                  * journal. Extend our transaction here to be sure we
4137                  * can change those blocks.
4138                  */
4139                 ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4140                 if (ret < 0) {
4141                         mlog_errno(ret);
4142                         goto out;
4143                 }
4144
4145                 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
4146                 if (ret < 0) {
4147                         mlog_errno(ret);
4148                         goto out;
4149                 }
4150         }
4151
4152         /*
4153          * Pass both paths to the journal. The majority of inserts
4154          * will be touching all components anyway.
4155          */
4156         ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4157         if (ret < 0) {
4158                 mlog_errno(ret);
4159                 goto out;
4160         }
4161
4162         if (insert->ins_split != SPLIT_NONE) {
4163                 /*
4164                  * We could call ocfs2_insert_at_leaf() for some types
4165                  * of splits, but it's easier to just let one separate
4166                  * function sort it all out.
4167                  */
4168                 ocfs2_split_record(et, left_path, right_path,
4169                                    insert_rec, insert->ins_split);
4170
4171                 /*
4172                  * Split might have modified either leaf and we don't
4173                  * have a guarantee that the later edge insert will
4174                  * dirty this for us.
4175                  */
4176                 if (left_path)
4177                         ocfs2_journal_dirty(handle,
4178                                             path_leaf_bh(left_path));
4179         } else
4180                 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4181                                      insert);
4182
4183         ocfs2_journal_dirty(handle, leaf_bh);
4184
4185         if (left_path) {
4186                 /*
4187                  * The rotate code has indicated that we need to fix
4188                  * up portions of the tree after the insert.
4189                  *
4190                  * XXX: Should we extend the transaction here?
4191                  */
4192                 subtree_index = ocfs2_find_subtree_root(et, left_path,
4193                                                         right_path);
4194                 ocfs2_complete_edge_insert(handle, left_path, right_path,
4195                                            subtree_index);
4196         }
4197
4198         ret = 0;
4199 out:
4200         return ret;
4201 }
4202
4203 static int ocfs2_do_insert_extent(handle_t *handle,
4204                                   struct ocfs2_extent_tree *et,
4205                                   struct ocfs2_extent_rec *insert_rec,
4206                                   struct ocfs2_insert_type *type)
4207 {
4208         int ret, rotate = 0;
4209         u32 cpos;
4210         struct ocfs2_path *right_path = NULL;
4211         struct ocfs2_path *left_path = NULL;
4212         struct ocfs2_extent_list *el;
4213
4214         el = et->et_root_el;
4215
4216         ret = ocfs2_et_root_journal_access(handle, et,
4217                                            OCFS2_JOURNAL_ACCESS_WRITE);
4218         if (ret) {
4219                 mlog_errno(ret);
4220                 goto out;
4221         }
4222
4223         if (le16_to_cpu(el->l_tree_depth) == 0) {
4224                 ocfs2_insert_at_leaf(et, insert_rec, el, type);
4225                 goto out_update_clusters;
4226         }
4227
4228         right_path = ocfs2_new_path_from_et(et);
4229         if (!right_path) {
4230                 ret = -ENOMEM;
4231                 mlog_errno(ret);
4232                 goto out;
4233         }
4234
4235         /*
4236          * Determine the path to start with. Rotations need the
4237          * rightmost path, everything else can go directly to the
4238          * target leaf.
4239          */
4240         cpos = le32_to_cpu(insert_rec->e_cpos);
4241         if (type->ins_appending == APPEND_NONE &&
4242             type->ins_contig == CONTIG_NONE) {
4243                 rotate = 1;
4244                 cpos = UINT_MAX;
4245         }
4246
4247         ret = ocfs2_find_path(et->et_ci, right_path, cpos);
4248         if (ret) {
4249                 mlog_errno(ret);
4250                 goto out;
4251         }
4252
4253         /*
4254          * Rotations and appends need special treatment - they modify
4255          * parts of the tree's above them.
4256          *
4257          * Both might pass back a path immediate to the left of the
4258          * one being inserted to. This will be cause
4259          * ocfs2_insert_path() to modify the rightmost records of
4260          * left_path to account for an edge insert.
4261          *
4262          * XXX: When modifying this code, keep in mind that an insert
4263          * can wind up skipping both of these two special cases...
4264          */
4265         if (rotate) {
4266                 ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
4267                                               le32_to_cpu(insert_rec->e_cpos),
4268                                               right_path, &left_path);
4269                 if (ret) {
4270                         mlog_errno(ret);
4271                         goto out;
4272                 }
4273
4274                 /*
4275                  * ocfs2_rotate_tree_right() might have extended the
4276                  * transaction without re-journaling our tree root.
4277                  */
4278                 ret = ocfs2_et_root_journal_access(handle, et,
4279                                                    OCFS2_JOURNAL_ACCESS_WRITE);
4280                 if (ret) {
4281                         mlog_errno(ret);
4282                         goto out;
4283                 }
4284         } else if (type->ins_appending == APPEND_TAIL
4285                    && type->ins_contig != CONTIG_LEFT) {
4286                 ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
4287                                                right_path, &left_path);
4288                 if (ret) {
4289                         mlog_errno(ret);
4290                         goto out;
4291                 }
4292         }
4293
4294         ret = ocfs2_insert_path(handle, et, left_path, right_path,
4295                                 insert_rec, type);
4296         if (ret) {
4297                 mlog_errno(ret);
4298                 goto out;
4299         }
4300
4301 out_update_clusters:
4302         if (type->ins_split == SPLIT_NONE)
4303                 ocfs2_et_update_clusters(et,
4304                                          le16_to_cpu(insert_rec->e_leaf_clusters));
4305
4306         ocfs2_journal_dirty(handle, et->et_root_bh);
4307
4308 out:
4309         ocfs2_free_path(left_path);
4310         ocfs2_free_path(right_path);
4311
4312         return ret;
4313 }
4314
4315 static enum ocfs2_contig_type
4316 ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4317                                struct ocfs2_path *path,
4318                                struct ocfs2_extent_list *el, int index,
4319                                struct ocfs2_extent_rec *split_rec)
4320 {
4321         int status;
4322         enum ocfs2_contig_type ret = CONTIG_NONE;
4323         u32 left_cpos, right_cpos;
4324         struct ocfs2_extent_rec *rec = NULL;
4325         struct ocfs2_extent_list *new_el;
4326         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4327         struct buffer_head *bh;
4328         struct ocfs2_extent_block *eb;
4329         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
4330
4331         if (index > 0) {
4332                 rec = &el->l_recs[index - 1];
4333         } else if (path->p_tree_depth > 0) {
4334                 status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4335                 if (status)
4336                         goto out;
4337
4338                 if (left_cpos != 0) {
4339                         left_path = ocfs2_new_path_from_path(path);
4340                         if (!left_path)
4341                                 goto out;
4342
4343                         status = ocfs2_find_path(et->et_ci, left_path,
4344                                                  left_cpos);
4345                         if (status)
4346                                 goto out;
4347
4348                         new_el = path_leaf_el(left_path);
4349
4350                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4351                             le16_to_cpu(new_el->l_count)) {
4352                                 bh = path_leaf_bh(left_path);
4353                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4354                                 ocfs2_error(sb,
4355                                             "Extent block #%llu has an "
4356                                             "invalid l_next_free_rec of "
4357                                             "%d.  It should have "
4358                                             "matched the l_count of %d",
4359                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4360                                             le16_to_cpu(new_el->l_next_free_rec),
4361                                             le16_to_cpu(new_el->l_count));
4362                                 status = -EINVAL;
4363                                 goto out;
4364                         }
4365                         rec = &new_el->l_recs[
4366                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4367                 }
4368         }
4369
4370         /*
4371          * We're careful to check for an empty extent record here -
4372          * the merge code will know what to do if it sees one.
4373          */
4374         if (rec) {
4375                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4376                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4377                                 ret = CONTIG_RIGHT;
4378                 } else {
4379                         ret = ocfs2_et_extent_contig(et, rec, split_rec);
4380                 }
4381         }
4382
4383         rec = NULL;
4384         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4385                 rec = &el->l_recs[index + 1];
4386         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4387                  path->p_tree_depth > 0) {
4388                 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4389                 if (status)
4390                         goto out;
4391
4392                 if (right_cpos == 0)
4393                         goto out;
4394
4395                 right_path = ocfs2_new_path_from_path(path);
4396                 if (!right_path)
4397                         goto out;
4398
4399                 status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4400                 if (status)
4401                         goto out;
4402
4403                 new_el = path_leaf_el(right_path);
4404                 rec = &new_el->l_recs[0];
4405                 if (ocfs2_is_empty_extent(rec)) {
4406                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4407                                 bh = path_leaf_bh(right_path);
4408                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4409                                 ocfs2_error(sb,
4410                                             "Extent block #%llu has an "
4411                                             "invalid l_next_free_rec of %d",
4412                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4413                                             le16_to_cpu(new_el->l_next_free_rec));
4414                                 status = -EINVAL;
4415                                 goto out;
4416                         }
4417                         rec = &new_el->l_recs[1];
4418                 }
4419         }
4420
4421         if (rec) {
4422                 enum ocfs2_contig_type contig_type;
4423
4424                 contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
4425
4426                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4427                         ret = CONTIG_LEFTRIGHT;
4428                 else if (ret == CONTIG_NONE)
4429                         ret = contig_type;
4430         }
4431
4432 out:
4433         if (left_path)
4434                 ocfs2_free_path(left_path);
4435         if (right_path)
4436                 ocfs2_free_path(right_path);
4437
4438         return ret;
4439 }
4440
4441 static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4442                                      struct ocfs2_insert_type *insert,
4443                                      struct ocfs2_extent_list *el,
4444                                      struct ocfs2_extent_rec *insert_rec)
4445 {
4446         int i;
4447         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4448
4449         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4450
4451         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4452                 contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
4453                                                      insert_rec);
4454                 if (contig_type != CONTIG_NONE) {
4455                         insert->ins_contig_index = i;
4456                         break;
4457                 }
4458         }
4459         insert->ins_contig = contig_type;
4460
4461         if (insert->ins_contig != CONTIG_NONE) {
4462                 struct ocfs2_extent_rec *rec =
4463                                 &el->l_recs[insert->ins_contig_index];
4464                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4465                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4466
4467                 /*
4468                  * Caller might want us to limit the size of extents, don't
4469                  * calculate contiguousness if we might exceed that limit.
4470                  */
4471                 if (et->et_max_leaf_clusters &&
4472                     (len > et->et_max_leaf_clusters))
4473                         insert->ins_contig = CONTIG_NONE;
4474         }
4475 }
4476
4477 /*
4478  * This should only be called against the righmost leaf extent list.
4479  *
4480  * ocfs2_figure_appending_type() will figure out whether we'll have to
4481  * insert at the tail of the rightmost leaf.
4482  *
4483  * This should also work against the root extent list for tree's with 0
4484  * depth. If we consider the root extent list to be the rightmost leaf node
4485  * then the logic here makes sense.
4486  */
4487 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4488                                         struct ocfs2_extent_list *el,
4489                                         struct ocfs2_extent_rec *insert_rec)
4490 {
4491         int i;
4492         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4493         struct ocfs2_extent_rec *rec;
4494
4495         insert->ins_appending = APPEND_NONE;
4496
4497         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4498
4499         if (!el->l_next_free_rec)
4500                 goto set_tail_append;
4501
4502         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4503                 /* Were all records empty? */
4504                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4505                         goto set_tail_append;
4506         }
4507
4508         i = le16_to_cpu(el->l_next_free_rec) - 1;
4509         rec = &el->l_recs[i];
4510
4511         if (cpos >=
4512             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4513                 goto set_tail_append;
4514
4515         return;
4516
4517 set_tail_append:
4518         insert->ins_appending = APPEND_TAIL;
4519 }
4520
4521 /*
4522  * Helper function called at the begining of an insert.
4523  *
4524  * This computes a few things that are commonly used in the process of
4525  * inserting into the btree:
4526  *   - Whether the new extent is contiguous with an existing one.
4527  *   - The current tree depth.
4528  *   - Whether the insert is an appending one.
4529  *   - The total # of free records in the tree.
4530  *
4531  * All of the information is stored on the ocfs2_insert_type
4532  * structure.
4533  */
4534 static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4535                                     struct buffer_head **last_eb_bh,
4536                                     struct ocfs2_extent_rec *insert_rec,
4537                                     int *free_records,
4538                                     struct ocfs2_insert_type *insert)
4539 {
4540         int ret;
4541         struct ocfs2_extent_block *eb;
4542         struct ocfs2_extent_list *el;
4543         struct ocfs2_path *path = NULL;
4544         struct buffer_head *bh = NULL;
4545
4546         insert->ins_split = SPLIT_NONE;
4547
4548         el = et->et_root_el;
4549         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4550
4551         if (el->l_tree_depth) {
4552                 /*
4553                  * If we have tree depth, we read in the
4554                  * rightmost extent block ahead of time as
4555                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4556                  * may want it later.
4557                  */
4558                 ret = ocfs2_read_extent_block(et->et_ci,
4559                                               ocfs2_et_get_last_eb_blk(et),
4560                                               &bh);
4561                 if (ret) {
4562                         mlog_errno(ret);
4563                         goto out;
4564                 }
4565                 eb = (struct ocfs2_extent_block *) bh->b_data;
4566                 el = &eb->h_list;
4567         }
4568
4569         /*
4570          * Unless we have a contiguous insert, we'll need to know if
4571          * there is room left in our allocation tree for another
4572          * extent record.
4573          *
4574          * XXX: This test is simplistic, we can search for empty
4575          * extent records too.
4576          */
4577         *free_records = le16_to_cpu(el->l_count) -
4578                 le16_to_cpu(el->l_next_free_rec);
4579
4580         if (!insert->ins_tree_depth) {
4581                 ocfs2_figure_contig_type(et, insert, el, insert_rec);
4582                 ocfs2_figure_appending_type(insert, el, insert_rec);
4583                 return 0;
4584         }
4585
4586         path = ocfs2_new_path_from_et(et);
4587         if (!path) {
4588                 ret = -ENOMEM;
4589                 mlog_errno(ret);
4590                 goto out;
4591         }
4592
4593         /*
4594          * In the case that we're inserting past what the tree
4595          * currently accounts for, ocfs2_find_path() will return for
4596          * us the rightmost tree path. This is accounted for below in
4597          * the appending code.
4598          */
4599         ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4600         if (ret) {
4601                 mlog_errno(ret);
4602                 goto out;
4603         }
4604
4605         el = path_leaf_el(path);
4606
4607         /*
4608          * Now that we have the path, there's two things we want to determine:
4609          * 1) Contiguousness (also set contig_index if this is so)
4610          *
4611          * 2) Are we doing an append? We can trivially break this up
4612          *     into two types of appends: simple record append, or a
4613          *     rotate inside the tail leaf.
4614          */
4615         ocfs2_figure_contig_type(et, insert, el, insert_rec);
4616
4617         /*
4618          * The insert code isn't quite ready to deal with all cases of
4619          * left contiguousness. Specifically, if it's an insert into
4620          * the 1st record in a leaf, it will require the adjustment of
4621          * cluster count on the last record of the path directly to it's
4622          * left. For now, just catch that case and fool the layers
4623          * above us. This works just fine for tree_depth == 0, which
4624          * is why we allow that above.
4625          */
4626         if (insert->ins_contig == CONTIG_LEFT &&
4627             insert->ins_contig_index == 0)
4628                 insert->ins_contig = CONTIG_NONE;
4629
4630         /*
4631          * Ok, so we can simply compare against last_eb to figure out
4632          * whether the path doesn't exist. This will only happen in
4633          * the case that we're doing a tail append, so maybe we can
4634          * take advantage of that information somehow.
4635          */
4636         if (ocfs2_et_get_last_eb_blk(et) ==
4637             path_leaf_bh(path)->b_blocknr) {
4638                 /*
4639                  * Ok, ocfs2_find_path() returned us the rightmost
4640                  * tree path. This might be an appending insert. There are
4641                  * two cases:
4642                  *    1) We're doing a true append at the tail:
4643                  *      -This might even be off the end of the leaf
4644                  *    2) We're "appending" by rotating in the tail
4645                  */
4646                 ocfs2_figure_appending_type(insert, el, insert_rec);
4647         }
4648
4649 out:
4650         ocfs2_free_path(path);
4651
4652         if (ret == 0)
4653                 *last_eb_bh = bh;
4654         else
4655                 brelse(bh);
4656         return ret;
4657 }
4658
4659 /*
4660  * Insert an extent into a btree.
4661  *
4662  * The caller needs to update the owning btree's cluster count.
4663  */
4664 int ocfs2_insert_extent(handle_t *handle,
4665                         struct ocfs2_extent_tree *et,
4666                         u32 cpos,
4667                         u64 start_blk,
4668                         u32 new_clusters,
4669                         u8 flags,
4670                         struct ocfs2_alloc_context *meta_ac)
4671 {
4672         int status;
4673         int uninitialized_var(free_records);
4674         struct buffer_head *last_eb_bh = NULL;
4675         struct ocfs2_insert_type insert = {0, };
4676         struct ocfs2_extent_rec rec;
4677
4678         trace_ocfs2_insert_extent_start(
4679                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4680                 cpos, new_clusters);
4681
4682         memset(&rec, 0, sizeof(rec));
4683         rec.e_cpos = cpu_to_le32(cpos);
4684         rec.e_blkno = cpu_to_le64(start_blk);
4685         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4686         rec.e_flags = flags;
4687         status = ocfs2_et_insert_check(et, &rec);
4688         if (status) {
4689                 mlog_errno(status);
4690                 goto bail;
4691         }
4692
4693         status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4694                                           &free_records, &insert);
4695         if (status < 0) {
4696                 mlog_errno(status);
4697                 goto bail;
4698         }
4699
4700         trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
4701                                   insert.ins_contig_index, free_records,
4702                                   insert.ins_tree_depth);
4703
4704         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4705                 status = ocfs2_grow_tree(handle, et,
4706                                          &insert.ins_tree_depth, &last_eb_bh,
4707                                          meta_ac);
4708                 if (status) {
4709                         mlog_errno(status);
4710                         goto bail;
4711                 }
4712         }
4713
4714         /* Finally, we can add clusters. This might rotate the tree for us. */
4715         status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4716         if (status < 0)
4717                 mlog_errno(status);
4718         else
4719                 ocfs2_et_extent_map_insert(et, &rec);
4720
4721 bail:
4722         brelse(last_eb_bh);
4723
4724         return status;
4725 }
4726
4727 /*
4728  * Allcate and add clusters into the extent b-tree.
4729  * The new clusters(clusters_to_add) will be inserted at logical_offset.
4730  * The extent b-tree's root is specified by et, and
4731  * it is not limited to the file storage. Any extent tree can use this
4732  * function if it implements the proper ocfs2_extent_tree.
4733  */
4734 int ocfs2_add_clusters_in_btree(handle_t *handle,
4735                                 struct ocfs2_extent_tree *et,
4736                                 u32 *logical_offset,
4737                                 u32 clusters_to_add,
4738                                 int mark_unwritten,
4739                                 struct ocfs2_alloc_context *data_ac,
4740                                 struct ocfs2_alloc_context *meta_ac,
4741                                 enum ocfs2_alloc_restarted *reason_ret)
4742 {
4743         int status = 0, err = 0;
4744         int free_extents;
4745         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4746         u32 bit_off, num_bits;
4747         u64 block;
4748         u8 flags = 0;
4749         struct ocfs2_super *osb =
4750                 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
4751
4752         BUG_ON(!clusters_to_add);
4753
4754         if (mark_unwritten)
4755                 flags = OCFS2_EXT_UNWRITTEN;
4756
4757         free_extents = ocfs2_num_free_extents(osb, et);
4758         if (free_extents < 0) {
4759                 status = free_extents;
4760                 mlog_errno(status);
4761                 goto leave;
4762         }
4763
4764         /* there are two cases which could cause us to EAGAIN in the
4765          * we-need-more-metadata case:
4766          * 1) we haven't reserved *any*
4767          * 2) we are so fragmented, we've needed to add metadata too
4768          *    many times. */
4769         if (!free_extents && !meta_ac) {
4770                 err = -1;
4771                 status = -EAGAIN;
4772                 reason = RESTART_META;
4773                 goto leave;
4774         } else if ((!free_extents)
4775                    && (ocfs2_alloc_context_bits_left(meta_ac)
4776                        < ocfs2_extend_meta_needed(et->et_root_el))) {
4777                 err = -2;
4778                 status = -EAGAIN;
4779                 reason = RESTART_META;
4780                 goto leave;
4781         }
4782
4783         status = __ocfs2_claim_clusters(handle, data_ac, 1,
4784                                         clusters_to_add, &bit_off, &num_bits);
4785         if (status < 0) {
4786                 if (status != -ENOSPC)
4787                         mlog_errno(status);
4788                 goto leave;
4789         }
4790
4791         BUG_ON(num_bits > clusters_to_add);
4792
4793         /* reserve our write early -- insert_extent may update the tree root */
4794         status = ocfs2_et_root_journal_access(handle, et,
4795                                               OCFS2_JOURNAL_ACCESS_WRITE);
4796         if (status < 0) {
4797                 mlog_errno(status);
4798                 goto leave;
4799         }
4800
4801         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4802         trace_ocfs2_add_clusters_in_btree(
4803              (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4804              bit_off, num_bits);
4805         status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4806                                      num_bits, flags, meta_ac);
4807         if (status < 0) {
4808                 mlog_errno(status);
4809                 goto leave;
4810         }
4811
4812         ocfs2_journal_dirty(handle, et->et_root_bh);
4813
4814         clusters_to_add -= num_bits;
4815         *logical_offset += num_bits;
4816
4817         if (clusters_to_add) {
4818                 err = clusters_to_add;
4819                 status = -EAGAIN;
4820                 reason = RESTART_TRANS;
4821         }
4822
4823 leave:
4824         if (reason_ret)
4825                 *reason_ret = reason;
4826         trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
4827         return status;
4828 }
4829
4830 static void ocfs2_make_right_split_rec(struct super_block *sb,
4831                                        struct ocfs2_extent_rec *split_rec,
4832                                        u32 cpos,
4833                                        struct ocfs2_extent_rec *rec)
4834 {
4835         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4836         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4837
4838         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4839
4840         split_rec->e_cpos = cpu_to_le32(cpos);
4841         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4842
4843         split_rec->e_blkno = rec->e_blkno;
4844         le64_add_cpu(&split_rec->e_blkno,
4845                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4846
4847         split_rec->e_flags = rec->e_flags;
4848 }
4849
4850 static int ocfs2_split_and_insert(handle_t *handle,
4851                                   struct ocfs2_extent_tree *et,
4852                                   struct ocfs2_path *path,
4853                                   struct buffer_head **last_eb_bh,
4854                                   int split_index,
4855                                   struct ocfs2_extent_rec *orig_split_rec,
4856                                   struct ocfs2_alloc_context *meta_ac)
4857 {
4858         int ret = 0, depth;
4859         unsigned int insert_range, rec_range, do_leftright = 0;
4860         struct ocfs2_extent_rec tmprec;
4861         struct ocfs2_extent_list *rightmost_el;
4862         struct ocfs2_extent_rec rec;
4863         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4864         struct ocfs2_insert_type insert;
4865         struct ocfs2_extent_block *eb;
4866
4867 leftright:
4868         /*
4869          * Store a copy of the record on the stack - it might move
4870          * around as the tree is manipulated below.
4871          */
4872         rec = path_leaf_el(path)->l_recs[split_index];
4873
4874         rightmost_el = et->et_root_el;
4875
4876         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4877         if (depth) {
4878                 BUG_ON(!(*last_eb_bh));
4879                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4880                 rightmost_el = &eb->h_list;
4881         }
4882
4883         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4884             le16_to_cpu(rightmost_el->l_count)) {
4885                 ret = ocfs2_grow_tree(handle, et,
4886                                       &depth, last_eb_bh, meta_ac);
4887                 if (ret) {
4888                         mlog_errno(ret);
4889                         goto out;
4890                 }
4891         }
4892
4893         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4894         insert.ins_appending = APPEND_NONE;
4895         insert.ins_contig = CONTIG_NONE;
4896         insert.ins_tree_depth = depth;
4897
4898         insert_range = le32_to_cpu(split_rec.e_cpos) +
4899                 le16_to_cpu(split_rec.e_leaf_clusters);
4900         rec_range = le32_to_cpu(rec.e_cpos) +
4901                 le16_to_cpu(rec.e_leaf_clusters);
4902
4903         if (split_rec.e_cpos == rec.e_cpos) {
4904                 insert.ins_split = SPLIT_LEFT;
4905         } else if (insert_range == rec_range) {
4906                 insert.ins_split = SPLIT_RIGHT;
4907         } else {
4908                 /*
4909                  * Left/right split. We fake this as a right split
4910                  * first and then make a second pass as a left split.
4911                  */
4912                 insert.ins_split = SPLIT_RIGHT;
4913
4914                 ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4915                                            &tmprec, insert_range, &rec);
4916
4917                 split_rec = tmprec;
4918
4919                 BUG_ON(do_leftright);
4920                 do_leftright = 1;
4921         }
4922
4923         ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
4924         if (ret) {
4925                 mlog_errno(ret);
4926                 goto out;
4927         }
4928
4929         if (do_leftright == 1) {
4930                 u32 cpos;
4931                 struct ocfs2_extent_list *el;
4932
4933                 do_leftright++;
4934                 split_rec = *orig_split_rec;
4935
4936                 ocfs2_reinit_path(path, 1);
4937
4938                 cpos = le32_to_cpu(split_rec.e_cpos);
4939                 ret = ocfs2_find_path(et->et_ci, path, cpos);
4940                 if (ret) {
4941                         mlog_errno(ret);
4942                         goto out;
4943                 }
4944
4945                 el = path_leaf_el(path);
4946                 split_index = ocfs2_search_extent_list(el, cpos);
4947                 goto leftright;
4948         }
4949 out:
4950
4951         return ret;
4952 }
4953
4954 static int ocfs2_replace_extent_rec(handle_t *handle,
4955                                     struct ocfs2_extent_tree *et,
4956                                     struct ocfs2_path *path,
4957                                     struct ocfs2_extent_list *el,
4958                                     int split_index,
4959                                     struct ocfs2_extent_rec *split_rec)
4960 {
4961         int ret;
4962
4963         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
4964                                            path_num_items(path) - 1);
4965         if (ret) {
4966                 mlog_errno(ret);
4967                 goto out;
4968         }
4969
4970         el->l_recs[split_index] = *split_rec;
4971
4972         ocfs2_journal_dirty(handle, path_leaf_bh(path));
4973 out:
4974         return ret;
4975 }
4976
4977 /*
4978  * Split part or all of the extent record at split_index in the leaf
4979  * pointed to by path. Merge with the contiguous extent record if needed.
4980  *
4981  * Care is taken to handle contiguousness so as to not grow the tree.
4982  *
4983  * meta_ac is not strictly necessary - we only truly need it if growth
4984  * of the tree is required. All other cases will degrade into a less
4985  * optimal tree layout.
4986  *
4987  * last_eb_bh should be the rightmost leaf block for any extent
4988  * btree. Since a split may grow the tree or a merge might shrink it,
4989  * the caller cannot trust the contents of that buffer after this call.
4990  *
4991  * This code is optimized for readability - several passes might be
4992  * made over certain portions of the tree. All of those blocks will
4993  * have been brought into cache (and pinned via the journal), so the
4994  * extra overhead is not expressed in terms of disk reads.
4995  */
4996 int ocfs2_split_extent(handle_t *handle,
4997                        struct ocfs2_extent_tree *et,
4998                        struct ocfs2_path *path,
4999                        int split_index,
5000                        struct ocfs2_extent_rec *split_rec,
5001                        struct ocfs2_alloc_context *meta_ac,
5002                        struct ocfs2_cached_dealloc_ctxt *dealloc)
5003 {
5004         int ret = 0;
5005         struct ocfs2_extent_list *el = path_leaf_el(path);
5006         struct buffer_head *last_eb_bh = NULL;
5007         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
5008         struct ocfs2_merge_ctxt ctxt;
5009         struct ocfs2_extent_list *rightmost_el;
5010
5011         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5012             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5013              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
5014                 ret = -EIO;
5015                 mlog_errno(ret);
5016                 goto out;
5017         }
5018
5019         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
5020                                                             split_index,
5021                                                             split_rec);
5022
5023         /*
5024          * The core merge / split code wants to know how much room is
5025          * left in this allocation tree, so we pass the
5026          * rightmost extent list.
5027          */
5028         if (path->p_tree_depth) {
5029                 struct ocfs2_extent_block *eb;
5030
5031                 ret = ocfs2_read_extent_block(et->et_ci,
5032                                               ocfs2_et_get_last_eb_blk(et),
5033                                               &last_eb_bh);
5034                 if (ret) {
5035                         mlog_errno(ret);
5036                         goto out;
5037                 }
5038
5039                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5040                 rightmost_el = &eb->h_list;
5041         } else
5042                 rightmost_el = path_root_el(path);
5043
5044         if (rec->e_cpos == split_rec->e_cpos &&
5045             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
5046                 ctxt.c_split_covers_rec = 1;
5047         else
5048                 ctxt.c_split_covers_rec = 0;
5049
5050         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5051
5052         trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
5053                                  ctxt.c_has_empty_extent,
5054                                  ctxt.c_split_covers_rec);
5055
5056         if (ctxt.c_contig_type == CONTIG_NONE) {
5057                 if (ctxt.c_split_covers_rec)
5058                         ret = ocfs2_replace_extent_rec(handle, et, path, el,
5059                                                        split_index, split_rec);
5060                 else
5061                         ret = ocfs2_split_and_insert(handle, et, path,
5062                                                      &last_eb_bh, split_index,
5063                                                      split_rec, meta_ac);
5064                 if (ret)
5065                         mlog_errno(ret);
5066         } else {
5067                 ret = ocfs2_try_to_merge_extent(handle, et, path,
5068                                                 split_index, split_rec,
5069                                                 dealloc, &ctxt);
5070                 if (ret)
5071                         mlog_errno(ret);
5072         }
5073
5074 out:
5075         brelse(last_eb_bh);
5076         return ret;
5077 }
5078
5079 /*
5080  * Change the flags of the already-existing extent at cpos for len clusters.
5081  *
5082  * new_flags: the flags we want to set.
5083  * clear_flags: the flags we want to clear.
5084  * phys: the new physical offset we want this new extent starts from.
5085  *
5086  * If the existing extent is larger than the request, initiate a
5087  * split. An attempt will be made at merging with adjacent extents.
5088  *
5089  * The caller is responsible for passing down meta_ac if we'll need it.
5090  */
5091 int ocfs2_change_extent_flag(handle_t *handle,
5092                              struct ocfs2_extent_tree *et,
5093                              u32 cpos, u32 len, u32 phys,
5094                              struct ocfs2_alloc_context *meta_ac,
5095                              struct ocfs2_cached_dealloc_ctxt *dealloc,
5096                              int new_flags, int clear_flags)
5097 {
5098         int ret, index;
5099         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5100         u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
5101         struct ocfs2_extent_rec split_rec;
5102         struct ocfs2_path *left_path = NULL;
5103         struct ocfs2_extent_list *el;
5104         struct ocfs2_extent_rec *rec;
5105
5106         left_path = ocfs2_new_path_from_et(et);
5107         if (!left_path) {
5108                 ret = -ENOMEM;
5109                 mlog_errno(ret);
5110                 goto out;
5111         }
5112
5113         ret = ocfs2_find_path(et->et_ci, left_path, cpos);
5114         if (ret) {
5115                 mlog_errno(ret);
5116                 goto out;
5117         }
5118         el = path_leaf_el(left_path);
5119
5120         index = ocfs2_search_extent_list(el, cpos);
5121         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5122                 ocfs2_error(sb,
5123                             "Owner %llu has an extent at cpos %u which can no "
5124                             "longer be found.\n",
5125                              (unsigned long long)
5126                              ocfs2_metadata_cache_owner(et->et_ci), cpos);
5127                 ret = -EROFS;
5128                 goto out;
5129         }
5130
5131         ret = -EIO;
5132         rec = &el->l_recs[index];
5133         if (new_flags && (rec->e_flags & new_flags)) {
5134                 mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
5135                      "extent that already had them",
5136                      (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5137                      new_flags);
5138                 goto out;
5139         }
5140
5141         if (clear_flags && !(rec->e_flags & clear_flags)) {
5142                 mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
5143                      "extent that didn't have them",
5144                      (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5145                      clear_flags);
5146                 goto out;
5147         }
5148
5149         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5150         split_rec.e_cpos = cpu_to_le32(cpos);
5151         split_rec.e_leaf_clusters = cpu_to_le16(len);
5152         split_rec.e_blkno = cpu_to_le64(start_blkno);
5153         split_rec.e_flags = rec->e_flags;
5154         if (new_flags)
5155                 split_rec.e_flags |= new_flags;
5156         if (clear_flags)
5157                 split_rec.e_flags &= ~clear_flags;
5158
5159         ret = ocfs2_split_extent(handle, et, left_path,
5160                                  index, &split_rec, meta_ac,
5161                                  dealloc);
5162         if (ret)
5163                 mlog_errno(ret);
5164
5165 out:
5166         ocfs2_free_path(left_path);
5167         return ret;
5168
5169 }
5170
5171 /*
5172  * Mark the already-existing extent at cpos as written for len clusters.
5173  * This removes the unwritten extent flag.
5174  *
5175  * If the existing extent is larger than the request, initiate a
5176  * split. An attempt will be made at merging with adjacent extents.
5177  *
5178  * The caller is responsible for passing down meta_ac if we'll need it.
5179  */
5180 int ocfs2_mark_extent_written(struct inode *inode,
5181                               struct ocfs2_extent_tree *et,
5182                               handle_t *handle, u32 cpos, u32 len, u32 phys,
5183                               struct ocfs2_alloc_context *meta_ac,
5184                               struct ocfs2_cached_dealloc_ctxt *dealloc)
5185 {
5186         int ret;
5187
5188         trace_ocfs2_mark_extent_written(
5189                 (unsigned long long)OCFS2_I(inode)->ip_blkno,
5190                 cpos, len, phys);
5191
5192         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5193                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5194                             "that are being written to, but the feature bit "
5195                             "is not set in the super block.",
5196                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
5197                 ret = -EROFS;
5198                 goto out;
5199         }
5200
5201         /*
5202          * XXX: This should be fixed up so that we just re-insert the
5203          * next extent records.
5204          */
5205         ocfs2_et_extent_map_truncate(et, 0);
5206
5207         ret = ocfs2_change_extent_flag(handle, et, cpos,
5208                                        len, phys, meta_ac, dealloc,
5209                                        0, OCFS2_EXT_UNWRITTEN);
5210         if (ret)
5211                 mlog_errno(ret);
5212
5213 out:
5214         return ret;
5215 }
5216
5217 static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5218                             struct ocfs2_path *path,
5219                             int index, u32 new_range,
5220                             struct ocfs2_alloc_context *meta_ac)
5221 {
5222         int ret, depth, credits;
5223         struct buffer_head *last_eb_bh = NULL;
5224         struct ocfs2_extent_block *eb;
5225         struct ocfs2_extent_list *rightmost_el, *el;
5226         struct ocfs2_extent_rec split_rec;
5227         struct ocfs2_extent_rec *rec;
5228         struct ocfs2_insert_type insert;
5229
5230         /*
5231          * Setup the record to split before we grow the tree.
5232          */
5233         el = path_leaf_el(path);
5234         rec = &el->l_recs[index];
5235         ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5236                                    &split_rec, new_range, rec);
5237
5238         depth = path->p_tree_depth;
5239         if (depth > 0) {
5240                 ret = ocfs2_read_extent_block(et->et_ci,
5241                                               ocfs2_et_get_last_eb_blk(et),
5242                                               &last_eb_bh);
5243                 if (ret < 0) {
5244                         mlog_errno(ret);
5245                         goto out;
5246                 }
5247
5248                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5249                 rightmost_el = &eb->h_list;
5250         } else
5251                 rightmost_el = path_leaf_el(path);
5252
5253         credits = path->p_tree_depth +
5254                   ocfs2_extend_meta_needed(et->et_root_el);
5255         ret = ocfs2_extend_trans(handle, credits);
5256         if (ret) {
5257                 mlog_errno(ret);
5258                 goto out;
5259         }
5260
5261         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5262             le16_to_cpu(rightmost_el->l_count)) {
5263                 ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5264                                       meta_ac);
5265                 if (ret) {
5266                         mlog_errno(ret);
5267                         goto out;
5268                 }
5269         }
5270
5271         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5272         insert.ins_appending = APPEND_NONE;
5273         insert.ins_contig = CONTIG_NONE;
5274         insert.ins_split = SPLIT_RIGHT;
5275         insert.ins_tree_depth = depth;
5276
5277         ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5278         if (ret)
5279                 mlog_errno(ret);
5280
5281 out:
5282         brelse(last_eb_bh);
5283         return ret;
5284 }
5285
5286 static int ocfs2_truncate_rec(handle_t *handle,
5287                               struct ocfs2_extent_tree *et,
5288                               struct ocfs2_path *path, int index,
5289                               struct ocfs2_cached_dealloc_ctxt *dealloc,
5290                               u32 cpos, u32 len)
5291 {
5292         int ret;
5293         u32 left_cpos, rec_range, trunc_range;
5294         int wants_rotate = 0, is_rightmost_tree_rec = 0;
5295         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5296         struct ocfs2_path *left_path = NULL;
5297         struct ocfs2_extent_list *el = path_leaf_el(path);
5298         struct ocfs2_extent_rec *rec;
5299         struct ocfs2_extent_block *eb;
5300
5301         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5302                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5303                 if (ret) {
5304                         mlog_errno(ret);
5305                         goto out;
5306                 }
5307
5308                 index--;
5309         }
5310
5311         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5312             path->p_tree_depth) {
5313                 /*
5314                  * Check whether this is the rightmost tree record. If
5315                  * we remove all of this record or part of its right
5316                  * edge then an update of the record lengths above it
5317                  * will be required.
5318                  */
5319                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5320                 if (eb->h_next_leaf_blk == 0)
5321                         is_rightmost_tree_rec = 1;
5322         }
5323
5324         rec = &el->l_recs[index];
5325         if (index == 0 && path->p_tree_depth &&
5326             le32_to_cpu(rec->e_cpos) == cpos) {
5327                 /*
5328                  * Changing the leftmost offset (via partial or whole
5329                  * record truncate) of an interior (or rightmost) path
5330                  * means we have to update the subtree that is formed
5331                  * by this leaf and the one to it's left.
5332                  *
5333                  * There are two cases we can skip:
5334                  *   1) Path is the leftmost one in our btree.
5335                  *   2) The leaf is rightmost and will be empty after
5336                  *      we remove the extent record - the rotate code
5337                  *      knows how to update the newly formed edge.
5338                  */
5339
5340                 ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
5341                 if (ret) {
5342                         mlog_errno(ret);
5343                         goto out;
5344                 }
5345
5346                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5347                         left_path = ocfs2_new_path_from_path(path);
5348                         if (!left_path) {
5349                                 ret = -ENOMEM;
5350                                 mlog_errno(ret);
5351                                 goto out;
5352                         }
5353
5354                         ret = ocfs2_find_path(et->et_ci, left_path,
5355                                               left_cpos);
5356                         if (ret) {
5357                                 mlog_errno(ret);
5358                                 goto out;
5359                         }
5360                 }
5361         }
5362
5363         ret = ocfs2_extend_rotate_transaction(handle, 0,
5364                                               handle->h_buffer_credits,
5365                                               path);
5366         if (ret) {
5367                 mlog_errno(ret);
5368                 goto out;
5369         }
5370
5371         ret = ocfs2_journal_access_path(et->et_ci, handle, path);
5372         if (ret) {
5373                 mlog_errno(ret);
5374                 goto out;
5375         }
5376
5377         ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
5378         if (ret) {
5379                 mlog_errno(ret);
5380                 goto out;
5381         }
5382
5383         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5384         trunc_range = cpos + len;
5385
5386         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5387                 int next_free;
5388
5389                 memset(rec, 0, sizeof(*rec));
5390                 ocfs2_cleanup_merge(el, index);
5391                 wants_rotate = 1;
5392
5393                 next_free = le16_to_cpu(el->l_next_free_rec);
5394                 if (is_rightmost_tree_rec && next_free > 1) {
5395                         /*
5396                          * We skip the edge update if this path will
5397                          * be deleted by the rotate code.
5398                          */
5399                         rec = &el->l_recs[next_free - 1];
5400                         ocfs2_adjust_rightmost_records(handle, et, path,
5401                                                        rec);
5402                 }
5403         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5404                 /* Remove leftmost portion of the record. */
5405                 le32_add_cpu(&rec->e_cpos, len);
5406                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5407                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5408         } else if (rec_range == trunc_range) {
5409                 /* Remove rightmost portion of the record */
5410                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5411                 if (is_rightmost_tree_rec)
5412                         ocfs2_adjust_rightmost_records(handle, et, path, rec);
5413         } else {
5414                 /* Caller should have trapped this. */
5415                 mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
5416                      "(%u, %u)\n",
5417                      (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5418                      le32_to_cpu(rec->e_cpos),
5419                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5420                 BUG();
5421         }
5422
5423         if (left_path) {
5424                 int subtree_index;
5425
5426                 subtree_index = ocfs2_find_subtree_root(et, left_path, path);
5427                 ocfs2_complete_edge_insert(handle, left_path, path,
5428                                            subtree_index);
5429         }
5430
5431         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5432
5433         ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5434         if (ret) {
5435                 mlog_errno(ret);
5436                 goto out;
5437         }
5438
5439 out:
5440         ocfs2_free_path(left_path);
5441         return ret;
5442 }
5443
5444 int ocfs2_remove_extent(handle_t *handle,
5445                         struct ocfs2_extent_tree *et,
5446                         u32 cpos, u32 len,
5447                         struct ocfs2_alloc_context *meta_ac,
5448                         struct ocfs2_cached_dealloc_ctxt *dealloc)
5449 {
5450         int ret, index;
5451         u32 rec_range, trunc_range;
5452         struct ocfs2_extent_rec *rec;
5453         struct ocfs2_extent_list *el;
5454         struct ocfs2_path *path = NULL;
5455
5456         /*
5457          * XXX: Why are we truncating to 0 instead of wherever this
5458          * affects us?
5459          */
5460         ocfs2_et_extent_map_truncate(et, 0);
5461
5462         path = ocfs2_new_path_from_et(et);
5463         if (!path) {
5464                 ret = -ENOMEM;
5465                 mlog_errno(ret);
5466                 goto out;
5467         }
5468
5469         ret = ocfs2_find_path(et->et_ci, path, cpos);
5470         if (ret) {
5471                 mlog_errno(ret);
5472                 goto out;
5473         }
5474
5475         el = path_leaf_el(path);
5476         index = ocfs2_search_extent_list(el, cpos);
5477         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5478                 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5479                             "Owner %llu has an extent at cpos %u which can no "
5480                             "longer be found.\n",
5481                             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5482                             cpos);
5483                 ret = -EROFS;
5484                 goto out;
5485         }
5486
5487         /*
5488          * We have 3 cases of extent removal:
5489          *   1) Range covers the entire extent rec
5490          *   2) Range begins or ends on one edge of the extent rec
5491          *   3) Range is in the middle of the extent rec (no shared edges)
5492          *
5493          * For case 1 we remove the extent rec and left rotate to
5494          * fill the hole.
5495          *
5496          * For case 2 we just shrink the existing extent rec, with a
5497          * tree update if the shrinking edge is also the edge of an
5498          * extent block.
5499          *
5500          * For case 3 we do a right split to turn the extent rec into
5501          * something case 2 can handle.
5502          */
5503         rec = &el->l_recs[index];
5504         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5505         trunc_range = cpos + len;
5506
5507         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5508
5509         trace_ocfs2_remove_extent(
5510                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5511                 cpos, len, index, le32_to_cpu(rec->e_cpos),
5512                 ocfs2_rec_clusters(el, rec));
5513
5514         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5515                 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5516                                          cpos, len);
5517                 if (ret) {
5518                         mlog_errno(ret);
5519                         goto out;
5520                 }
5521         } else {
5522                 ret = ocfs2_split_tree(handle, et, path, index,
5523                                        trunc_range, meta_ac);
5524                 if (ret) {
5525                         mlog_errno(ret);
5526                         goto out;
5527                 }
5528
5529                 /*
5530                  * The split could have manipulated the tree enough to
5531                  * move the record location, so we have to look for it again.
5532                  */
5533                 ocfs2_reinit_path(path, 1);
5534
5535                 ret = ocfs2_find_path(et->et_ci, path, cpos);
5536                 if (ret) {
5537                         mlog_errno(ret);
5538                         goto out;
5539                 }
5540
5541                 el = path_leaf_el(path);
5542                 index = ocfs2_search_extent_list(el, cpos);
5543                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5544                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5545                                     "Owner %llu: split at cpos %u lost record.",
5546                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5547                                     cpos);
5548                         ret = -EROFS;
5549                         goto out;
5550                 }
5551
5552                 /*
5553                  * Double check our values here. If anything is fishy,
5554                  * it's easier to catch it at the top level.
5555                  */
5556                 rec = &el->l_recs[index];
5557                 rec_range = le32_to_cpu(rec->e_cpos) +
5558                         ocfs2_rec_clusters(el, rec);
5559                 if (rec_range != trunc_range) {
5560                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5561                                     "Owner %llu: error after split at cpos %u"
5562                                     "trunc len %u, existing record is (%u,%u)",
5563                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5564                                     cpos, len, le32_to_cpu(rec->e_cpos),
5565                                     ocfs2_rec_clusters(el, rec));
5566                         ret = -EROFS;
5567                         goto out;
5568                 }
5569
5570                 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5571                                          cpos, len);
5572                 if (ret) {
5573                         mlog_errno(ret);
5574                         goto out;
5575                 }
5576         }
5577
5578 out:
5579         ocfs2_free_path(path);
5580         return ret;
5581 }
5582
5583 /*
5584  * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
5585  * same as ocfs2_lock_alloctors(), except for it accepts a blocks
5586  * number to reserve some extra blocks, and it only handles meta
5587  * data allocations.
5588  *
5589  * Currently, only ocfs2_remove_btree_range() uses it for truncating
5590  * and punching holes.
5591  */
5592 static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5593                                               struct ocfs2_extent_tree *et,
5594                                               u32 extents_to_split,
5595                                               struct ocfs2_alloc_context **ac,
5596                                               int extra_blocks)
5597 {
5598         int ret = 0, num_free_extents;
5599         unsigned int max_recs_needed = 2 * extents_to_split;
5600         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5601
5602         *ac = NULL;
5603
5604         num_free_extents = ocfs2_num_free_extents(osb, et);
5605         if (num_free_extents < 0) {
5606                 ret = num_free_extents;
5607                 mlog_errno(ret);
5608                 goto out;
5609         }
5610
5611         if (!num_free_extents ||
5612             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5613                 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5614
5615         if (extra_blocks) {
5616                 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5617                 if (ret < 0) {
5618                         if (ret != -ENOSPC)
5619                                 mlog_errno(ret);
5620                         goto out;
5621                 }
5622         }
5623
5624 out:
5625         if (ret) {
5626                 if (*ac) {
5627                         ocfs2_free_alloc_context(*ac);
5628                         *ac = NULL;
5629                 }
5630         }
5631
5632         return ret;
5633 }
5634
5635 int ocfs2_remove_btree_range(struct inode *inode,
5636                              struct ocfs2_extent_tree *et,
5637                              u32 cpos, u32 phys_cpos, u32 len, int flags,
5638                              struct ocfs2_cached_dealloc_ctxt *dealloc,
5639                              u64 refcount_loc)
5640 {
5641         int ret, credits = 0, extra_blocks = 0;
5642         u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5643         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5644         struct inode *tl_inode = osb->osb_tl_inode;
5645         handle_t *handle;
5646         struct ocfs2_alloc_context *meta_ac = NULL;
5647         struct ocfs2_refcount_tree *ref_tree = NULL;
5648
5649         if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5650                 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
5651                          OCFS2_HAS_REFCOUNT_FL));
5652
5653                 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5654                                                &ref_tree, NULL);
5655                 if (ret) {
5656                         mlog_errno(ret);
5657                         goto out;
5658                 }
5659
5660                 ret = ocfs2_prepare_refcount_change_for_del(inode,
5661                                                             refcount_loc,
5662                                                             phys_blkno,
5663                                                             len,
5664                                                             &credits,
5665                                                             &extra_blocks);
5666                 if (ret < 0) {
5667                         mlog_errno(ret);
5668                         goto out;
5669                 }
5670         }
5671
5672         ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5673                                                  extra_blocks);
5674         if (ret) {
5675                 mlog_errno(ret);
5676                 return ret;
5677         }
5678
5679         mutex_lock(&tl_inode->i_mutex);
5680
5681         if (ocfs2_truncate_log_needs_flush(osb)) {
5682                 ret = __ocfs2_flush_truncate_log(osb);
5683                 if (ret < 0) {
5684                         mlog_errno(ret);
5685                         goto out;
5686                 }
5687         }
5688
5689         handle = ocfs2_start_trans(osb,
5690                         ocfs2_remove_extent_credits(osb->sb) + credits);
5691         if (IS_ERR(handle)) {
5692                 ret = PTR_ERR(handle);
5693                 mlog_errno(ret);
5694                 goto out;
5695         }
5696
5697         ret = ocfs2_et_root_journal_access(handle, et,
5698                                            OCFS2_JOURNAL_ACCESS_WRITE);
5699         if (ret) {
5700                 mlog_errno(ret);
5701                 goto out;
5702         }
5703
5704         dquot_free_space_nodirty(inode,
5705                                   ocfs2_clusters_to_bytes(inode->i_sb, len));
5706
5707         ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
5708         if (ret) {
5709                 mlog_errno(ret);
5710                 goto out_commit;
5711         }
5712
5713         ocfs2_et_update_clusters(et, -len);
5714
5715         ocfs2_journal_dirty(handle, et->et_root_bh);
5716
5717         if (phys_blkno) {
5718                 if (flags & OCFS2_EXT_REFCOUNTED)
5719                         ret = ocfs2_decrease_refcount(inode, handle,
5720                                         ocfs2_blocks_to_clusters(osb->sb,
5721                                                                  phys_blkno),
5722                                         len, meta_ac,
5723                                         dealloc, 1);
5724                 else
5725                         ret = ocfs2_truncate_log_append(osb, handle,
5726                                                         phys_blkno, len);
5727                 if (ret)
5728                         mlog_errno(ret);
5729
5730         }
5731
5732 out_commit:
5733         ocfs2_commit_trans(osb, handle);
5734 out:
5735         mutex_unlock(&tl_inode->i_mutex);
5736
5737         if (meta_ac)
5738                 ocfs2_free_alloc_context(meta_ac);
5739
5740         if (ref_tree)
5741                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5742
5743         return ret;
5744 }
5745
5746 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5747 {
5748         struct buffer_head *tl_bh = osb->osb_tl_bh;
5749         struct ocfs2_dinode *di;
5750         struct ocfs2_truncate_log *tl;
5751
5752         di = (struct ocfs2_dinode *) tl_bh->b_data;
5753         tl = &di->id2.i_dealloc;
5754
5755         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5756                         "slot %d, invalid truncate log parameters: used = "
5757                         "%u, count = %u\n", osb->slot_num,
5758                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5759         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5760 }
5761
5762 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5763                                            unsigned int new_start)
5764 {
5765         unsigned int tail_index;
5766         unsigned int current_tail;
5767
5768         /* No records, nothing to coalesce */
5769         if (!le16_to_cpu(tl->tl_used))
5770                 return 0;
5771
5772         tail_index = le16_to_cpu(tl->tl_used) - 1;
5773         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5774         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5775
5776         return current_tail == new_start;
5777 }
5778
5779 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5780                               handle_t *handle,
5781                               u64 start_blk,
5782                               unsigned int num_clusters)
5783 {
5784         int status, index;
5785         unsigned int start_cluster, tl_count;
5786         struct inode *tl_inode = osb->osb_tl_inode;
5787         struct buffer_head *tl_bh = osb->osb_tl_bh;
5788         struct ocfs2_dinode *di;
5789         struct ocfs2_truncate_log *tl;
5790
5791         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5792
5793         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5794
5795         di = (struct ocfs2_dinode *) tl_bh->b_data;
5796
5797         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5798          * by the underlying call to ocfs2_read_inode_block(), so any
5799          * corruption is a code bug */
5800         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5801
5802         tl = &di->id2.i_dealloc;
5803         tl_count = le16_to_cpu(tl->tl_count);
5804         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5805                         tl_count == 0,
5806                         "Truncate record count on #%llu invalid "
5807                         "wanted %u, actual %u\n",
5808                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5809                         ocfs2_truncate_recs_per_inode(osb->sb),
5810                         le16_to_cpu(tl->tl_count));
5811
5812         /* Caller should have known to flush before calling us. */
5813         index = le16_to_cpu(tl->tl_used);
5814         if (index >= tl_count) {
5815                 status = -ENOSPC;
5816                 mlog_errno(status);
5817                 goto bail;
5818         }
5819
5820         status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5821                                          OCFS2_JOURNAL_ACCESS_WRITE);
5822         if (status < 0) {
5823                 mlog_errno(status);
5824                 goto bail;
5825         }
5826
5827         trace_ocfs2_truncate_log_append(
5828                 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
5829                 start_cluster, num_clusters);
5830         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5831                 /*
5832                  * Move index back to the record we are coalescing with.
5833                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5834                  */
5835                 index--;
5836
5837                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5838                 trace_ocfs2_truncate_log_append(
5839                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5840                         index, le32_to_cpu(tl->tl_recs[index].t_start),
5841                         num_clusters);
5842         } else {
5843                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5844                 tl->tl_used = cpu_to_le16(index + 1);
5845         }
5846         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5847
5848         ocfs2_journal_dirty(handle, tl_bh);
5849
5850         osb->truncated_clusters += num_clusters;
5851 bail:
5852         return status;
5853 }
5854
5855 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5856                                          handle_t *handle,
5857                                          struct inode *data_alloc_inode,
5858                                          struct buffer_head *data_alloc_bh)
5859 {
5860         int status = 0;
5861         int i;
5862         unsigned int num_clusters;
5863         u64 start_blk;
5864         struct ocfs2_truncate_rec rec;
5865         struct ocfs2_dinode *di;
5866         struct ocfs2_truncate_log *tl;
5867         struct inode *tl_inode = osb->osb_tl_inode;
5868         struct buffer_head *tl_bh = osb->osb_tl_bh;
5869
5870         di = (struct ocfs2_dinode *) tl_bh->b_data;
5871         tl = &di->id2.i_dealloc;
5872         i = le16_to_cpu(tl->tl_used) - 1;
5873         while (i >= 0) {
5874                 /* Caller has given us at least enough credits to
5875                  * update the truncate log dinode */
5876                 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5877                                                  OCFS2_JOURNAL_ACCESS_WRITE);
5878                 if (status < 0) {
5879                         mlog_errno(status);
5880                         goto bail;
5881                 }
5882
5883                 tl->tl_used = cpu_to_le16(i);
5884
5885                 ocfs2_journal_dirty(handle, tl_bh);
5886
5887                 /* TODO: Perhaps we can calculate the bulk of the
5888                  * credits up front rather than extending like
5889                  * this. */
5890                 status = ocfs2_extend_trans(handle,
5891                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5892                 if (status < 0) {
5893                         mlog_errno(status);
5894                         goto bail;
5895                 }
5896
5897                 rec = tl->tl_recs[i];
5898                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5899                                                     le32_to_cpu(rec.t_start));
5900                 num_clusters = le32_to_cpu(rec.t_clusters);
5901
5902                 /* if start_blk is not set, we ignore the record as
5903                  * invalid. */
5904                 if (start_blk) {
5905                         trace_ocfs2_replay_truncate_records(
5906                                 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5907                                 i, le32_to_cpu(rec.t_start), num_clusters);
5908
5909                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5910                                                      data_alloc_bh, start_blk,
5911                                                      num_clusters);
5912                         if (status < 0) {
5913                                 mlog_errno(status);
5914                                 goto bail;
5915                         }
5916                 }
5917                 i--;
5918         }
5919
5920         osb->truncated_clusters = 0;
5921
5922 bail:
5923         return status;
5924 }
5925
5926 /* Expects you to already be holding tl_inode->i_mutex */
5927 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5928 {
5929         int status;
5930         unsigned int num_to_flush;
5931         handle_t *handle;
5932         struct inode *tl_inode = osb->osb_tl_inode;
5933         struct inode *data_alloc_inode = NULL;
5934         struct buffer_head *tl_bh = osb->osb_tl_bh;
5935         struct buffer_head *data_alloc_bh = NULL;
5936         struct ocfs2_dinode *di;
5937         struct ocfs2_truncate_log *tl;
5938
5939         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5940
5941         di = (struct ocfs2_dinode *) tl_bh->b_data;
5942
5943         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5944          * by the underlying call to ocfs2_read_inode_block(), so any
5945          * corruption is a code bug */
5946         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5947
5948         tl = &di->id2.i_dealloc;
5949         num_to_flush = le16_to_cpu(tl->tl_used);
5950         trace_ocfs2_flush_truncate_log(
5951                 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5952                 num_to_flush);
5953         if (!num_to_flush) {
5954                 status = 0;
5955                 goto out;
5956         }
5957
5958         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5959                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5960                                                        OCFS2_INVALID_SLOT);
5961         if (!data_alloc_inode) {
5962                 status = -EINVAL;
5963                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5964                 goto out;
5965         }
5966
5967         mutex_lock(&data_alloc_inode->i_mutex);
5968
5969         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5970         if (status < 0) {
5971                 mlog_errno(status);
5972                 goto out_mutex;
5973         }
5974
5975         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5976         if (IS_ERR(handle)) {
5977                 status = PTR_ERR(handle);
5978                 mlog_errno(status);
5979                 goto out_unlock;
5980         }
5981
5982         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5983                                                data_alloc_bh);
5984         if (status < 0)
5985                 mlog_errno(status);
5986
5987         ocfs2_commit_trans(osb, handle);
5988
5989 out_unlock:
5990         brelse(data_alloc_bh);
5991         ocfs2_inode_unlock(data_alloc_inode, 1);
5992
5993 out_mutex:
5994         mutex_unlock(&data_alloc_inode->i_mutex);
5995         iput(data_alloc_inode);
5996
5997 out:
5998         return status;
5999 }
6000
6001 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6002 {
6003         int status;
6004         struct inode *tl_inode = osb->osb_tl_inode;
6005
6006         mutex_lock(&tl_inode->i_mutex);
6007         status = __ocfs2_flush_truncate_log(osb);
6008         mutex_unlock(&tl_inode->i_mutex);
6009
6010         return status;
6011 }
6012
6013 static void ocfs2_truncate_log_worker(struct work_struct *work)
6014 {
6015         int status;
6016         struct ocfs2_super *osb =
6017                 container_of(work, struct ocfs2_super,
6018                              osb_truncate_log_wq.work);
6019
6020         status = ocfs2_flush_truncate_log(osb);
6021         if (status < 0)
6022                 mlog_errno(status);
6023         else
6024                 ocfs2_init_steal_slots(osb);
6025 }
6026
6027 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
6028 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
6029                                        int cancel)
6030 {
6031         if (osb->osb_tl_inode) {
6032                 /* We want to push off log flushes while truncates are
6033                  * still running. */
6034                 if (cancel)
6035                         cancel_delayed_work(&osb->osb_truncate_log_wq);
6036
6037                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
6038                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
6039         }
6040 }
6041
6042 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
6043                                        int slot_num,
6044                                        struct inode **tl_inode,
6045                                        struct buffer_head **tl_bh)
6046 {
6047         int status;
6048         struct inode *inode = NULL;
6049         struct buffer_head *bh = NULL;
6050
6051         inode = ocfs2_get_system_file_inode(osb,
6052                                            TRUNCATE_LOG_SYSTEM_INODE,
6053                                            slot_num);
6054         if (!inode) {
6055                 status = -EINVAL;
6056                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
6057                 goto bail;
6058         }
6059
6060         status = ocfs2_read_inode_block(inode, &bh);
6061         if (status < 0) {
6062                 iput(inode);
6063                 mlog_errno(status);
6064                 goto bail;
6065         }
6066
6067         *tl_inode = inode;
6068         *tl_bh    = bh;
6069 bail:
6070         return status;
6071 }
6072
6073 /* called during the 1st stage of node recovery. we stamp a clean
6074  * truncate log and pass back a copy for processing later. if the
6075  * truncate log does not require processing, a *tl_copy is set to
6076  * NULL. */
6077 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6078                                       int slot_num,
6079                                       struct ocfs2_dinode **tl_copy)
6080 {
6081         int status;
6082         struct inode *tl_inode = NULL;
6083         struct buffer_head *tl_bh = NULL;
6084         struct ocfs2_dinode *di;
6085         struct ocfs2_truncate_log *tl;
6086
6087         *tl_copy = NULL;
6088
6089         trace_ocfs2_begin_truncate_log_recovery(slot_num);
6090
6091         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
6092         if (status < 0) {
6093                 mlog_errno(status);
6094                 goto bail;
6095         }
6096
6097         di = (struct ocfs2_dinode *) tl_bh->b_data;
6098
6099         /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
6100          * validated by the underlying call to ocfs2_read_inode_block(),
6101          * so any corruption is a code bug */
6102         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
6103
6104         tl = &di->id2.i_dealloc;
6105         if (le16_to_cpu(tl->tl_used)) {
6106                 trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
6107
6108                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
6109                 if (!(*tl_copy)) {
6110                         status = -ENOMEM;
6111                         mlog_errno(status);
6112                         goto bail;
6113                 }
6114
6115                 /* Assuming the write-out below goes well, this copy
6116                  * will be passed back to recovery for processing. */
6117                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
6118
6119                 /* All we need to do to clear the truncate log is set
6120                  * tl_used. */
6121                 tl->tl_used = 0;
6122
6123                 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
6124                 status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
6125                 if (status < 0) {
6126                         mlog_errno(status);
6127                         goto bail;
6128                 }
6129         }
6130
6131 bail:
6132         if (tl_inode)
6133                 iput(tl_inode);
6134         brelse(tl_bh);
6135
6136         if (status < 0 && (*tl_copy)) {
6137                 kfree(*tl_copy);
6138                 *tl_copy = NULL;
6139                 mlog_errno(status);
6140         }
6141
6142         return status;
6143 }
6144
6145 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6146                                          struct ocfs2_dinode *tl_copy)
6147 {
6148         int status = 0;
6149         int i;
6150         unsigned int clusters, num_recs, start_cluster;
6151         u64 start_blk;
6152         handle_t *handle;
6153         struct inode *tl_inode = osb->osb_tl_inode;
6154         struct ocfs2_truncate_log *tl;
6155
6156         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
6157                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
6158                 return -EINVAL;
6159         }
6160
6161         tl = &tl_copy->id2.i_dealloc;
6162         num_recs = le16_to_cpu(tl->tl_used);
6163         trace_ocfs2_complete_truncate_log_recovery(
6164                 (unsigned long long)le64_to_cpu(tl_copy->i_blkno),
6165                 num_recs);
6166
6167         mutex_lock(&tl_inode->i_mutex);
6168         for(i = 0; i < num_recs; i++) {
6169                 if (ocfs2_truncate_log_needs_flush(osb)) {
6170                         status = __ocfs2_flush_truncate_log(osb);
6171                         if (status < 0) {
6172                                 mlog_errno(status);
6173                                 goto bail_up;
6174                         }
6175                 }
6176
6177                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6178                 if (IS_ERR(handle)) {
6179                         status = PTR_ERR(handle);
6180                         mlog_errno(status);
6181                         goto bail_up;
6182                 }
6183
6184                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
6185                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
6186                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
6187
6188                 status = ocfs2_truncate_log_append(osb, handle,
6189                                                    start_blk, clusters);
6190                 ocfs2_commit_trans(osb, handle);
6191                 if (status < 0) {
6192                         mlog_errno(status);
6193                         goto bail_up;
6194                 }
6195         }
6196
6197 bail_up:
6198         mutex_unlock(&tl_inode->i_mutex);
6199
6200         return status;
6201 }
6202
6203 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6204 {
6205         int status;
6206         struct inode *tl_inode = osb->osb_tl_inode;
6207
6208         if (tl_inode) {
6209                 cancel_delayed_work(&osb->osb_truncate_log_wq);
6210                 flush_workqueue(ocfs2_wq);
6211
6212                 status = ocfs2_flush_truncate_log(osb);
6213                 if (status < 0)
6214                         mlog_errno(status);
6215
6216                 brelse(osb->osb_tl_bh);
6217                 iput(osb->osb_tl_inode);
6218         }
6219 }
6220
6221 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6222 {
6223         int status;
6224         struct inode *tl_inode = NULL;
6225         struct buffer_head *tl_bh = NULL;
6226
6227         status = ocfs2_get_truncate_log_info(osb,
6228                                              osb->slot_num,
6229                                              &tl_inode,
6230                                              &tl_bh);
6231         if (status < 0)
6232                 mlog_errno(status);
6233
6234         /* ocfs2_truncate_log_shutdown keys on the existence of
6235          * osb->osb_tl_inode so we don't set any of the osb variables
6236          * until we're sure all is well. */
6237         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
6238                           ocfs2_truncate_log_worker);
6239         osb->osb_tl_bh    = tl_bh;
6240         osb->osb_tl_inode = tl_inode;
6241
6242         return status;
6243 }
6244
6245 /*
6246  * Delayed de-allocation of suballocator blocks.
6247  *
6248  * Some sets of block de-allocations might involve multiple suballocator inodes.
6249  *
6250  * The locking for this can get extremely complicated, especially when
6251  * the suballocator inodes to delete from aren't known until deep
6252  * within an unrelated codepath.
6253  *
6254  * ocfs2_extent_block structures are a good example of this - an inode
6255  * btree could have been grown by any number of nodes each allocating
6256  * out of their own suballoc inode.
6257  *
6258  * These structures allow the delay of block de-allocation until a
6259  * later time, when locking of multiple cluster inodes won't cause
6260  * deadlock.
6261  */
6262
6263 /*
6264  * Describe a single bit freed from a suballocator.  For the block
6265  * suballocators, it represents one block.  For the global cluster
6266  * allocator, it represents some clusters and free_bit indicates
6267  * clusters number.
6268  */
6269 struct ocfs2_cached_block_free {
6270         struct ocfs2_cached_block_free          *free_next;
6271         u64                                     free_bg;
6272         u64                                     free_blk;
6273         unsigned int                            free_bit;
6274 };
6275
/*
 * All cached free requests that target one suballocator, identified by
 * its system inode type and slot.  The lists themselves are chained
 * off the dealloc context's c_first_suballocator.
 */
struct ocfs2_per_slot_free_list {
	/* Next per-suballocator list in the context, or NULL. */
	struct ocfs2_per_slot_free_list		*f_next_suballocator;
	/* System inode type handed to ocfs2_get_system_file_inode(). */
	int					f_inode_type;
	/* Slot handed to ocfs2_get_system_file_inode(). */
	int					f_slot;
	/* Head of the chain of requests for this suballocator. */
	struct ocfs2_cached_block_free		*f_first;
};
6282
6283 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6284                                     int sysfile_type,
6285                                     int slot,
6286                                     struct ocfs2_cached_block_free *head)
6287 {
6288         int ret;
6289         u64 bg_blkno;
6290         handle_t *handle;
6291         struct inode *inode;
6292         struct buffer_head *di_bh = NULL;
6293         struct ocfs2_cached_block_free *tmp;
6294
6295         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
6296         if (!inode) {
6297                 ret = -EINVAL;
6298                 mlog_errno(ret);
6299                 goto out;
6300         }
6301
6302         mutex_lock(&inode->i_mutex);
6303
6304         ret = ocfs2_inode_lock(inode, &di_bh, 1);
6305         if (ret) {
6306                 mlog_errno(ret);
6307                 goto out_mutex;
6308         }
6309
6310         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6311         if (IS_ERR(handle)) {
6312                 ret = PTR_ERR(handle);
6313                 mlog_errno(ret);
6314                 goto out_unlock;
6315         }
6316
6317         while (head) {
6318                 if (head->free_bg)
6319                         bg_blkno = head->free_bg;
6320                 else
6321                         bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6322                                                               head->free_bit);
6323                 trace_ocfs2_free_cached_blocks(
6324                      (unsigned long long)head->free_blk, head->free_bit);
6325
6326                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6327                                                head->free_bit, bg_blkno, 1);
6328                 if (ret) {
6329                         mlog_errno(ret);
6330                         goto out_journal;
6331                 }
6332
6333                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
6334                 if (ret) {
6335                         mlog_errno(ret);
6336                         goto out_journal;
6337                 }
6338
6339                 tmp = head;
6340                 head = head->free_next;
6341                 kfree(tmp);
6342         }
6343
6344 out_journal:
6345         ocfs2_commit_trans(osb, handle);
6346
6347 out_unlock:
6348         ocfs2_inode_unlock(inode, 1);
6349         brelse(di_bh);
6350 out_mutex:
6351         mutex_unlock(&inode->i_mutex);
6352         iput(inode);
6353 out:
6354         while(head) {
6355                 /* Premature exit may have left some dangling items. */
6356                 tmp = head;
6357                 head = head->free_next;
6358                 kfree(tmp);
6359         }
6360
6361         return ret;
6362 }
6363
6364 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6365                                 u64 blkno, unsigned int bit)
6366 {
6367         int ret = 0;
6368         struct ocfs2_cached_block_free *item;
6369
6370         item = kzalloc(sizeof(*item), GFP_NOFS);
6371         if (item == NULL) {
6372                 ret = -ENOMEM;
6373                 mlog_errno(ret);
6374                 return ret;
6375         }
6376
6377         trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
6378
6379         item->free_blk = blkno;
6380         item->free_bit = bit;
6381         item->free_next = ctxt->c_global_allocator;
6382
6383         ctxt->c_global_allocator = item;
6384         return ret;
6385 }
6386
6387 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6388                                       struct ocfs2_cached_block_free *head)
6389 {
6390         struct ocfs2_cached_block_free *tmp;
6391         struct inode *tl_inode = osb->osb_tl_inode;
6392         handle_t *handle;
6393         int ret = 0;
6394
6395         mutex_lock(&tl_inode->i_mutex);
6396
6397         while (head) {
6398                 if (ocfs2_truncate_log_needs_flush(osb)) {
6399                         ret = __ocfs2_flush_truncate_log(osb);
6400                         if (ret < 0) {
6401                                 mlog_errno(ret);
6402                                 break;
6403                         }
6404                 }
6405
6406                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6407                 if (IS_ERR(handle)) {
6408                         ret = PTR_ERR(handle);
6409                         mlog_errno(ret);
6410                         break;
6411                 }
6412
6413                 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6414                                                 head->free_bit);
6415
6416                 ocfs2_commit_trans(osb, handle);
6417                 tmp = head;
6418                 head = head->free_next;
6419                 kfree(tmp);
6420
6421                 if (ret < 0) {
6422                         mlog_errno(ret);
6423                         break;
6424                 }
6425         }
6426
6427         mutex_unlock(&tl_inode->i_mutex);
6428
6429         while (head) {
6430                 /* Premature exit may have left some dangling items. */
6431                 tmp = head;
6432                 head = head->free_next;
6433                 kfree(tmp);
6434         }
6435
6436         return ret;
6437 }
6438
6439 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6440                        struct ocfs2_cached_dealloc_ctxt *ctxt)
6441 {
6442         int ret = 0, ret2;
6443         struct ocfs2_per_slot_free_list *fl;
6444
6445         if (!ctxt)
6446                 return 0;
6447
6448         while (ctxt->c_first_suballocator) {
6449                 fl = ctxt->c_first_suballocator;
6450
6451                 if (fl->f_first) {
6452                         trace_ocfs2_run_deallocs(fl->f_inode_type,
6453                                                  fl->f_slot);
6454                         ret2 = ocfs2_free_cached_blocks(osb,
6455                                                         fl->f_inode_type,
6456                                                         fl->f_slot,
6457                                                         fl->f_first);
6458                         if (ret2)
6459                                 mlog_errno(ret2);
6460                         if (!ret)
6461                                 ret = ret2;
6462                 }
6463
6464                 ctxt->c_first_suballocator = fl->f_next_suballocator;
6465                 kfree(fl);
6466         }
6467
6468         if (ctxt->c_global_allocator) {
6469                 ret2 = ocfs2_free_cached_clusters(osb,
6470                                                   ctxt->c_global_allocator);
6471                 if (ret2)
6472                         mlog_errno(ret2);
6473                 if (!ret)
6474                         ret = ret2;
6475
6476                 ctxt->c_global_allocator = NULL;
6477         }
6478
6479         return ret;
6480 }
6481
6482 static struct ocfs2_per_slot_free_list *
6483 ocfs2_find_per_slot_free_list(int type,
6484                               int slot,
6485                               struct ocfs2_cached_dealloc_ctxt *ctxt)
6486 {
6487         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6488
6489         while (fl) {
6490                 if (fl->f_inode_type == type && fl->f_slot == slot)
6491                         return fl;
6492
6493                 fl = fl->f_next_suballocator;
6494         }
6495
6496         fl = kmalloc(sizeof(*fl), GFP_NOFS);
6497         if (fl) {
6498                 fl->f_inode_type = type;
6499                 fl->f_slot = slot;
6500                 fl->f_first = NULL;
6501                 fl->f_next_suballocator = ctxt->c_first_suballocator;
6502
6503                 ctxt->c_first_suballocator = fl;
6504         }
6505         return fl;
6506 }
6507
6508 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6509                               int type, int slot, u64 suballoc,
6510                               u64 blkno, unsigned int bit)
6511 {
6512         int ret;
6513         struct ocfs2_per_slot_free_list *fl;
6514         struct ocfs2_cached_block_free *item;
6515
6516         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6517         if (fl == NULL) {
6518                 ret = -ENOMEM;
6519                 mlog_errno(ret);
6520                 goto out;
6521         }
6522
6523         item = kzalloc(sizeof(*item), GFP_NOFS);
6524         if (item == NULL) {
6525                 ret = -ENOMEM;
6526                 mlog_errno(ret);
6527                 goto out;
6528         }
6529
6530         trace_ocfs2_cache_block_dealloc(type, slot,
6531                                         (unsigned long long)suballoc,
6532                                         (unsigned long long)blkno, bit);
6533
6534         item->free_bg = suballoc;
6535         item->free_blk = blkno;
6536         item->free_bit = bit;
6537         item->free_next = fl->f_first;
6538
6539         fl->f_first = item;
6540
6541         ret = 0;
6542 out:
6543         return ret;
6544 }
6545
6546 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6547                                          struct ocfs2_extent_block *eb)
6548 {
6549         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6550                                          le16_to_cpu(eb->h_suballoc_slot),
6551                                          le64_to_cpu(eb->h_suballoc_loc),
6552                                          le64_to_cpu(eb->h_blkno),
6553                                          le16_to_cpu(eb->h_suballoc_bit));
6554 }
6555
6556 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6557 {
6558         set_buffer_uptodate(bh);
6559         mark_buffer_dirty(bh);
6560         return 0;
6561 }
6562
6563 void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6564                               unsigned int from, unsigned int to,
6565                               struct page *page, int zero, u64 *phys)
6566 {
6567         int ret, partial = 0;
6568
6569         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6570         if (ret)
6571                 mlog_errno(ret);
6572
6573         if (zero)
6574                 zero_user_segment(page, from, to);
6575
6576         /*
6577          * Need to set the buffers we zero'd into uptodate
6578          * here if they aren't - ocfs2_map_page_blocks()
6579          * might've skipped some
6580          */
6581         ret = walk_page_buffers(handle, page_buffers(page),
6582                                 from, to, &partial,
6583                                 ocfs2_zero_func);
6584         if (ret < 0)
6585                 mlog_errno(ret);
6586         else if (ocfs2_should_order_data(inode)) {
6587                 ret = ocfs2_jbd2_file_inode(handle, inode);
6588                 if (ret < 0)
6589                         mlog_errno(ret);
6590         }
6591
6592         if (!partial)
6593                 SetPageUptodate(page);
6594
6595         flush_dcache_page(page);
6596 }
6597
6598 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6599                                      loff_t end, struct page **pages,
6600                                      int numpages, u64 phys, handle_t *handle)
6601 {
6602         int i;
6603         struct page *page;
6604         unsigned int from, to = PAGE_CACHE_SIZE;
6605         struct super_block *sb = inode->i_sb;
6606
6607         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6608
6609         if (numpages == 0)
6610                 goto out;
6611
6612         to = PAGE_CACHE_SIZE;
6613         for(i = 0; i < numpages; i++) {
6614                 page = pages[i];
6615
6616                 from = start & (PAGE_CACHE_SIZE - 1);
6617                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6618                         to = end & (PAGE_CACHE_SIZE - 1);
6619
6620                 BUG_ON(from > PAGE_CACHE_SIZE);
6621                 BUG_ON(to > PAGE_CACHE_SIZE);
6622
6623                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6624                                          &phys);
6625
6626                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6627         }
6628 out:
6629         if (pages)
6630                 ocfs2_unlock_and_free_pages(pages, numpages);
6631 }
6632
6633 int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
6634                      struct page **pages, int *num)
6635 {
6636         int numpages, ret = 0;
6637         struct address_space *mapping = inode->i_mapping;
6638         unsigned long index;
6639         loff_t last_page_bytes;
6640
6641         BUG_ON(start > end);
6642
6643         numpages = 0;
6644         last_page_bytes = PAGE_ALIGN(end);
6645         index = start >> PAGE_CACHE_SHIFT;
6646         do {
6647                 pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
6648                 if (!pages[numpages]) {
6649                         ret = -ENOMEM;
6650                         mlog_errno(ret);
6651                         goto out;
6652                 }
6653
6654                 numpages++;
6655                 index++;
6656         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6657
6658 out:
6659         if (ret != 0) {
6660                 if (pages)
6661                         ocfs2_unlock_and_free_pages(pages, numpages);
6662                 numpages = 0;
6663         }
6664
6665         *num = numpages;
6666
6667         return ret;
6668 }
6669
6670 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6671                                 struct page **pages, int *num)
6672 {
6673         struct super_block *sb = inode->i_sb;
6674
6675         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6676                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6677
6678         return ocfs2_grab_pages(inode, start, end, pages, num);
6679 }
6680
6681 /*
6682  * Zero the area past i_size but still within an allocated
6683  * cluster. This avoids exposing nonzero data on subsequent file
6684  * extends.
6685  *
6686  * We need to call this before i_size is updated on the inode because
6687  * otherwise block_write_full_page() will skip writeout of pages past
6688  * i_size. The new_i_size parameter is passed for this reason.
6689  */
6690 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6691                                   u64 range_start, u64 range_end)
6692 {
6693         int ret = 0, numpages;
6694         struct page **pages = NULL;
6695         u64 phys;
6696         unsigned int ext_flags;
6697         struct super_block *sb = inode->i_sb;
6698
6699         /*
6700          * File systems which don't support sparse files zero on every
6701          * extend.
6702          */
6703         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6704                 return 0;
6705
6706         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6707                         sizeof(struct page *), GFP_NOFS);
6708         if (pages == NULL) {
6709                 ret = -ENOMEM;
6710                 mlog_errno(ret);
6711                 goto out;
6712         }
6713
6714         if (range_start == range_end)
6715                 goto out;
6716
6717         ret = ocfs2_extent_map_get_blocks(inode,
6718                                           range_start >> sb->s_blocksize_bits,
6719                                           &phys, NULL, &ext_flags);
6720         if (ret) {
6721                 mlog_errno(ret);
6722                 goto out;
6723         }
6724
6725         /*
6726          * Tail is a hole, or is marked unwritten. In either case, we
6727          * can count on read and write to return/push zero's.
6728          */
6729         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6730                 goto out;
6731
6732         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6733                                    &numpages);
6734         if (ret) {
6735                 mlog_errno(ret);
6736                 goto out;
6737         }
6738
6739         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6740                                  numpages, phys, handle);
6741
6742         /*
6743          * Initiate writeout of the pages we zero'd here. We don't
6744          * wait on them - the truncate_inode_pages() call later will
6745          * do that for us.
6746          */
6747         ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
6748                                        range_end - 1);
6749         if (ret)
6750                 mlog_errno(ret);
6751
6752 out:
6753         if (pages)
6754                 kfree(pages);
6755
6756         return ret;
6757 }
6758
6759 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6760                                              struct ocfs2_dinode *di)
6761 {
6762         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6763         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6764
6765         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6766                 memset(&di->id2, 0, blocksize -
6767                                     offsetof(struct ocfs2_dinode, id2) -
6768                                     xattrsize);
6769         else
6770                 memset(&di->id2, 0, blocksize -
6771                                     offsetof(struct ocfs2_dinode, id2));
6772 }
6773
6774 void ocfs2_dinode_new_extent_list(struct inode *inode,
6775                                   struct ocfs2_dinode *di)
6776 {
6777         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6778         di->id2.i_list.l_tree_depth = 0;
6779         di->id2.i_list.l_next_free_rec = 0;
6780         di->id2.i_list.l_count = cpu_to_le16(
6781                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6782 }
6783
6784 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6785 {
6786         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6787         struct ocfs2_inline_data *idata = &di->id2.i_data;
6788
6789         spin_lock(&oi->ip_lock);
6790         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6791         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6792         spin_unlock(&oi->ip_lock);
6793
6794         /*
6795          * We clear the entire i_data structure here so that all
6796          * fields can be properly initialized.
6797          */
6798         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6799
6800         idata->id_count = cpu_to_le16(
6801                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6802 }
6803
6804 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6805                                          struct buffer_head *di_bh)
6806 {
6807         int ret, i, has_data, num_pages = 0;
6808         handle_t *handle;
6809         u64 uninitialized_var(block);
6810         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6811         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6812         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6813         struct ocfs2_alloc_context *data_ac = NULL;
6814         struct page **pages = NULL;
6815         loff_t end = osb->s_clustersize;
6816         struct ocfs2_extent_tree et;
6817         int did_quota = 0;
6818
6819         has_data = i_size_read(inode) ? 1 : 0;
6820
6821         if (has_data) {
6822                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6823                                 sizeof(struct page *), GFP_NOFS);
6824                 if (pages == NULL) {
6825                         ret = -ENOMEM;
6826                         mlog_errno(ret);
6827                         goto out;
6828                 }
6829
6830                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6831                 if (ret) {
6832                         mlog_errno(ret);
6833                         goto out;
6834                 }
6835         }
6836
6837         handle = ocfs2_start_trans(osb,
6838                                    ocfs2_inline_to_extents_credits(osb->sb));
6839         if (IS_ERR(handle)) {
6840                 ret = PTR_ERR(handle);
6841                 mlog_errno(ret);
6842                 goto out_unlock;
6843         }
6844
6845         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
6846                                       OCFS2_JOURNAL_ACCESS_WRITE);
6847         if (ret) {
6848                 mlog_errno(ret);
6849                 goto out_commit;
6850         }
6851
6852         if (has_data) {
6853                 u32 bit_off, num;
6854                 unsigned int page_end;
6855                 u64 phys;
6856
6857                 ret = dquot_alloc_space_nodirty(inode,
6858                                        ocfs2_clusters_to_bytes(osb->sb, 1));
6859                 if (ret)
6860                         goto out_commit;
6861                 did_quota = 1;
6862
6863                 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
6864
6865                 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
6866                                            &num);
6867                 if (ret) {
6868                         mlog_errno(ret);
6869                         goto out_commit;
6870                 }
6871
6872                 /*
6873                  * Save two copies, one for insert, and one that can
6874                  * be changed by ocfs2_map_and_dirty_page() below.
6875                  */
6876                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6877
6878                 /*
6879                  * Non sparse file systems zero on extend, so no need
6880                  * to do that now.
6881                  */
6882                 if (!ocfs2_sparse_alloc(osb) &&
6883                     PAGE_CACHE_SIZE < osb->s_clustersize)
6884                         end = PAGE_CACHE_SIZE;
6885
6886                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6887                 if (ret) {
6888                         mlog_errno(ret);
6889                         goto out_commit;
6890                 }
6891
6892                 /*
6893                  * This should populate the 1st page for us and mark
6894                  * it up to date.
6895                  */
6896                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6897                 if (ret) {
6898                         mlog_errno(ret);
6899                         goto out_commit;
6900                 }
6901
6902                 page_end = PAGE_CACHE_SIZE;
6903                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6904                         page_end = osb->s_clustersize;
6905
6906                 for (i = 0; i < num_pages; i++)
6907                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6908                                                  pages[i], i > 0, &phys);
6909         }
6910
6911         spin_lock(&oi->ip_lock);
6912         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6913         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6914         spin_unlock(&oi->ip_lock);
6915
6916         ocfs2_dinode_new_extent_list(inode, di);
6917
6918         ocfs2_journal_dirty(handle, di_bh);
6919
6920         if (has_data) {
6921                 /*
6922                  * An error at this point should be extremely rare. If
6923                  * this proves to be false, we could always re-build
6924                  * the in-inode data from our pages.
6925                  */
6926                 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
6927                 ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
6928                 if (ret) {
6929                         mlog_errno(ret);
6930                         goto out_commit;
6931                 }
6932
6933                 inode->i_blocks = ocfs2_inode_sector_count(inode);
6934         }
6935
6936 out_commit:
6937         if (ret < 0 && did_quota)
6938                 dquot_free_space_nodirty(inode,
6939                                           ocfs2_clusters_to_bytes(osb->sb, 1));
6940
6941         ocfs2_commit_trans(osb, handle);
6942
6943 out_unlock:
6944         if (data_ac)
6945                 ocfs2_free_alloc_context(data_ac);
6946
6947 out:
6948         if (pages) {
6949                 ocfs2_unlock_and_free_pages(pages, num_pages);
6950                 kfree(pages);
6951         }
6952
6953         return ret;
6954 }
6955
6956 /*
6957  * It is expected, that by the time you call this function,
6958  * inode->i_size and fe->i_size have been adjusted.
6959  *
6960  * WARNING: This will kfree the truncate context
6961  */
6962 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6963                           struct inode *inode,
6964                           struct buffer_head *di_bh)
6965 {
6966         int status = 0, i, flags = 0;
6967         u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
6968         u64 blkno = 0;
6969         struct ocfs2_extent_list *el;
6970         struct ocfs2_extent_rec *rec;
6971         struct ocfs2_path *path = NULL;
6972         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6973         struct ocfs2_extent_list *root_el = &(di->id2.i_list);
6974         u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
6975         struct ocfs2_extent_tree et;
6976         struct ocfs2_cached_dealloc_ctxt dealloc;
6977
6978         ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
6979         ocfs2_init_dealloc_ctxt(&dealloc);
6980
6981         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6982                                                      i_size_read(inode));
6983
6984         path = ocfs2_new_path(di_bh, &di->id2.i_list,
6985                               ocfs2_journal_access_di);
6986         if (!path) {
6987                 status = -ENOMEM;
6988                 mlog_errno(status);
6989                 goto bail;
6990         }
6991
6992         ocfs2_extent_map_trunc(inode, new_highest_cpos);
6993
6994 start:
6995         /*
6996          * Check that we still have allocation to delete.
6997          */
6998         if (OCFS2_I(inode)->ip_clusters == 0) {
6999                 status = 0;
7000                 goto bail;
7001         }
7002
7003         /*
7004          * Truncate always works against the rightmost tree branch.
7005          */
7006         status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
7007         if (status) {
7008                 mlog_errno(status);
7009                 goto bail;
7010         }
7011
7012         trace_ocfs2_commit_truncate(
7013                 (unsigned long long)OCFS2_I(inode)->ip_blkno,
7014                 new_highest_cpos,
7015                 OCFS2_I(inode)->ip_clusters,
7016                 path->p_tree_depth);
7017
7018         /*
7019          * By now, el will point to the extent list on the bottom most
7020          * portion of this tree. Only the tail record is considered in
7021          * each pass.
7022          *
7023          * We handle the following cases, in order:
7024          * - empty extent: delete the remaining branch
7025          * - remove the entire record
7026          * - remove a partial record
7027          * - no record needs to be removed (truncate has completed)
7028          */
7029         el = path_leaf_el(path);
7030         if (le16_to_cpu(el->l_next_free_rec) == 0) {
7031                 ocfs2_error(inode->i_sb,
7032                             "Inode %llu has empty extent block at %llu\n",
7033                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7034                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
7035                 status = -EROFS;
7036                 goto bail;
7037         }
7038
7039         i = le16_to_cpu(el->l_next_free_rec) - 1;
7040         rec = &el->l_recs[i];
7041         flags = rec->e_flags;
7042         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
7043
7044         if (i == 0 && ocfs2_is_empty_extent(rec)) {
7045                 /*
7046                  * Lower levels depend on this never happening, but it's best
7047                  * to check it up here before changing the tree.
7048                 */
7049                 if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7050                         ocfs2_error(inode->i_sb, "Inode %lu has an empty "
7051                                     "extent record, depth %u\n", inode->i_ino,
7052                                     le16_to_cpu(root_el->l_tree_depth));
7053                         status = -EROFS;
7054                         goto bail;
7055                 }
7056                 trunc_cpos = le32_to_cpu(rec->e_cpos);
7057                 trunc_len = 0;
7058                 blkno = 0;
7059         } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7060                 /*
7061                  * Truncate entire record.
7062                  */
7063                 trunc_cpos = le32_to_cpu(rec->e_cpos);
7064                 trunc_len = ocfs2_rec_clusters(el, rec);
7065                 blkno = le64_to_cpu(rec->e_blkno);
7066         } else if (range > new_highest_cpos) {
7067                 /*
7068                  * Partial truncate. it also should be
7069                  * the last truncate we're doing.
7070                  */
7071                 trunc_cpos = new_highest_cpos;
7072                 trunc_len = range - new_highest_cpos;
7073                 coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
7074                 blkno = le64_to_cpu(rec->e_blkno) +
7075                                 ocfs2_clusters_to_blocks(inode->i_sb, coff);
7076         } else {
7077                 /*
7078                  * Truncate completed, leave happily.
7079                  */
7080                 status = 0;
7081                 goto bail;
7082         }
7083
7084         phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
7085
7086         status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
7087                                           phys_cpos, trunc_len, flags, &dealloc,
7088                                           refcount_loc);
7089         if (status < 0) {
7090                 mlog_errno(status);
7091                 goto bail;
7092         }
7093
7094         ocfs2_reinit_path(path, 1);
7095
7096         /*
7097          * The check above will catch the case where we've truncated
7098          * away all allocation.
7099          */
7100         goto start;
7101
7102 bail:
7103
7104         ocfs2_schedule_truncate_log_flush(osb, 1);
7105
7106         ocfs2_run_deallocs(osb, &dealloc);
7107
7108         ocfs2_free_path(path);
7109
7110         return status;
7111 }
7112
7113 /*
7114  * 'start' is inclusive, 'end' is not.
7115  */
7116 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7117                           unsigned int start, unsigned int end, int trunc)
7118 {
7119         int ret;
7120         unsigned int numbytes;
7121         handle_t *handle;
7122         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7123         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7124         struct ocfs2_inline_data *idata = &di->id2.i_data;
7125
7126         if (end > i_size_read(inode))
7127                 end = i_size_read(inode);
7128
7129         BUG_ON(start >= end);
7130
7131         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7132             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7133             !ocfs2_supports_inline_data(osb)) {
7134                 ocfs2_error(inode->i_sb,
7135                             "Inline data flags for inode %llu don't agree! "
7136                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7137                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7138                             le16_to_cpu(di->i_dyn_features),
7139                             OCFS2_I(inode)->ip_dyn_features,
7140                             osb->s_feature_incompat);
7141                 ret = -EROFS;
7142                 goto out;
7143         }
7144
7145         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7146         if (IS_ERR(handle)) {
7147                 ret = PTR_ERR(handle);
7148                 mlog_errno(ret);
7149                 goto out;
7150         }
7151
7152         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7153                                       OCFS2_JOURNAL_ACCESS_WRITE);
7154         if (ret) {
7155                 mlog_errno(ret);
7156                 goto out_commit;
7157         }
7158
7159         numbytes = end - start;
7160         memset(idata->id_data + start, 0, numbytes);
7161
7162         /*
7163          * No need to worry about the data page here - it's been
7164          * truncated already and inline data doesn't need it for
7165          * pushing zero's to disk, so we'll let readpage pick it up
7166          * later.
7167          */
7168         if (trunc) {
7169                 i_size_write(inode, start);
7170                 di->i_size = cpu_to_le64(start);
7171         }
7172
7173         inode->i_blocks = ocfs2_inode_sector_count(inode);
7174         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7175
7176         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7177         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7178
7179         ocfs2_journal_dirty(handle, di_bh);
7180
7181 out_commit:
7182         ocfs2_commit_trans(osb, handle);
7183
7184 out:
7185         return ret;
7186 }