Merge tag 'char-misc-5.2-rc1-part2' of git://git.kernel.org/pub/scm/linux/kernel...
[sfrench/cifs-2.6.git] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 021110-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #include <cluster/masklog.h>
33
34 #include "ocfs2.h"
35
36 #include "alloc.h"
37 #include "blockcheck.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
46 #include "ocfs2_trace.h"
47
48 #include "buffer_head_io.h"
49
50 #define NOT_ALLOC_NEW_GROUP             0
51 #define ALLOC_NEW_GROUP                 0x1
52 #define ALLOC_GROUPS_FROM_GLOBAL        0x2
53
54 #define OCFS2_MAX_TO_STEAL              1024
55
56 struct ocfs2_suballoc_result {
57         u64             sr_bg_blkno;    /* The bg we allocated from.  Set
58                                            to 0 when a block group is
59                                            contiguous. */
60         u64             sr_bg_stable_blkno; /*
61                                              * Doesn't change, always
62                                              * set to target block
63                                              * group descriptor
64                                              * block.
65                                              */
66         u64             sr_blkno;       /* The first allocated block */
67         unsigned int    sr_bit_offset;  /* The bit in the bg */
68         unsigned int    sr_bits;        /* How many bits we claimed */
69 };
70
71 static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
72 {
73         if (res->sr_blkno == 0)
74                 return 0;
75
76         if (res->sr_bg_blkno)
77                 return res->sr_bg_blkno;
78
79         return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80 }
81
82 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
83 static int ocfs2_block_group_fill(handle_t *handle,
84                                   struct inode *alloc_inode,
85                                   struct buffer_head *bg_bh,
86                                   u64 group_blkno,
87                                   unsigned int group_clusters,
88                                   u16 my_chain,
89                                   struct ocfs2_chain_list *cl);
90 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
91                                    struct inode *alloc_inode,
92                                    struct buffer_head *bh,
93                                    u64 max_block,
94                                    u64 *last_alloc_group,
95                                    int flags);
96
97 static int ocfs2_cluster_group_search(struct inode *inode,
98                                       struct buffer_head *group_bh,
99                                       u32 bits_wanted, u32 min_bits,
100                                       u64 max_block,
101                                       struct ocfs2_suballoc_result *res);
102 static int ocfs2_block_group_search(struct inode *inode,
103                                     struct buffer_head *group_bh,
104                                     u32 bits_wanted, u32 min_bits,
105                                     u64 max_block,
106                                     struct ocfs2_suballoc_result *res);
107 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
108                                      handle_t *handle,
109                                      u32 bits_wanted,
110                                      u32 min_bits,
111                                      struct ocfs2_suballoc_result *res);
112 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
113                                          int nr);
114 static int ocfs2_relink_block_group(handle_t *handle,
115                                     struct inode *alloc_inode,
116                                     struct buffer_head *fe_bh,
117                                     struct buffer_head *bg_bh,
118                                     struct buffer_head *prev_bg_bh,
119                                     u16 chain);
120 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
121                                                      u32 wanted);
122 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
123                                                    u64 bg_blkno,
124                                                    u16 bg_bit_off);
125 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
126                                                 u64 data_blkno,
127                                                 u64 *bg_blkno,
128                                                 u16 *bg_bit_off);
129 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
130                                              u32 bits_wanted, u64 max_block,
131                                              int flags,
132                                              struct ocfs2_alloc_context **ac);
133
134 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
135 {
136         struct inode *inode = ac->ac_inode;
137
138         if (inode) {
139                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
140                         ocfs2_inode_unlock(inode, 1);
141
142                 inode_unlock(inode);
143
144                 iput(inode);
145                 ac->ac_inode = NULL;
146         }
147         brelse(ac->ac_bh);
148         ac->ac_bh = NULL;
149         ac->ac_resv = NULL;
150         kfree(ac->ac_find_loc_priv);
151         ac->ac_find_loc_priv = NULL;
152 }
153
154 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
155 {
156         ocfs2_free_ac_resource(ac);
157         kfree(ac);
158 }
159
160 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
161 {
162         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
163 }
164
165 #define do_error(fmt, ...)                                              \
166 do {                                                                    \
167         if (resize)                                                     \
168                 mlog(ML_ERROR, fmt, ##__VA_ARGS__);                     \
169         else                                                            \
170                 return ocfs2_error(sb, fmt, ##__VA_ARGS__);             \
171 } while (0)
172
173 static int ocfs2_validate_gd_self(struct super_block *sb,
174                                   struct buffer_head *bh,
175                                   int resize)
176 {
177         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
178
179         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
180                 do_error("Group descriptor #%llu has bad signature %.*s\n",
181                          (unsigned long long)bh->b_blocknr, 7,
182                          gd->bg_signature);
183         }
184
185         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
186                 do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
187                          (unsigned long long)bh->b_blocknr,
188                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
189         }
190
191         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
192                 do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
193                          (unsigned long long)bh->b_blocknr,
194                          le32_to_cpu(gd->bg_generation));
195         }
196
197         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
198                 do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
199                          (unsigned long long)bh->b_blocknr,
200                          le16_to_cpu(gd->bg_bits),
201                          le16_to_cpu(gd->bg_free_bits_count));
202         }
203
204         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
205                 do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
206                          (unsigned long long)bh->b_blocknr,
207                          le16_to_cpu(gd->bg_bits),
208                          8 * le16_to_cpu(gd->bg_size));
209         }
210
211         return 0;
212 }
213
214 static int ocfs2_validate_gd_parent(struct super_block *sb,
215                                     struct ocfs2_dinode *di,
216                                     struct buffer_head *bh,
217                                     int resize)
218 {
219         unsigned int max_bits;
220         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
221
222         if (di->i_blkno != gd->bg_parent_dinode) {
223                 do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
224                          (unsigned long long)bh->b_blocknr,
225                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
226                          (unsigned long long)le64_to_cpu(di->i_blkno));
227         }
228
229         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
230         if (le16_to_cpu(gd->bg_bits) > max_bits) {
231                 do_error("Group descriptor #%llu has bit count of %u\n",
232                          (unsigned long long)bh->b_blocknr,
233                          le16_to_cpu(gd->bg_bits));
234         }
235
236         /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
237         if ((le16_to_cpu(gd->bg_chain) >
238              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
239             ((le16_to_cpu(gd->bg_chain) ==
240              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
241                 do_error("Group descriptor #%llu has bad chain %u\n",
242                          (unsigned long long)bh->b_blocknr,
243                          le16_to_cpu(gd->bg_chain));
244         }
245
246         return 0;
247 }
248
249 #undef do_error
250
251 /*
252  * This version only prints errors.  It does not fail the filesystem, and
253  * exists only for resize.
254  */
255 int ocfs2_check_group_descriptor(struct super_block *sb,
256                                  struct ocfs2_dinode *di,
257                                  struct buffer_head *bh)
258 {
259         int rc;
260         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
261
262         BUG_ON(!buffer_uptodate(bh));
263
264         /*
265          * If the ecc fails, we return the error but otherwise
266          * leave the filesystem running.  We know any error is
267          * local to this block.
268          */
269         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
270         if (rc) {
271                 mlog(ML_ERROR,
272                      "Checksum failed for group descriptor %llu\n",
273                      (unsigned long long)bh->b_blocknr);
274         } else
275                 rc = ocfs2_validate_gd_self(sb, bh, 1);
276         if (!rc)
277                 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
278
279         return rc;
280 }
281
282 static int ocfs2_validate_group_descriptor(struct super_block *sb,
283                                            struct buffer_head *bh)
284 {
285         int rc;
286         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
287
288         trace_ocfs2_validate_group_descriptor(
289                                         (unsigned long long)bh->b_blocknr);
290
291         BUG_ON(!buffer_uptodate(bh));
292
293         /*
294          * If the ecc fails, we return the error but otherwise
295          * leave the filesystem running.  We know any error is
296          * local to this block.
297          */
298         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
299         if (rc)
300                 return rc;
301
302         /*
303          * Errors after here are fatal.
304          */
305
306         return ocfs2_validate_gd_self(sb, bh, 0);
307 }
308
309 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
310                                 u64 gd_blkno, struct buffer_head **bh)
311 {
312         int rc;
313         struct buffer_head *tmp = *bh;
314
315         rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
316                               ocfs2_validate_group_descriptor);
317         if (rc)
318                 goto out;
319
320         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
321         if (rc) {
322                 brelse(tmp);
323                 goto out;
324         }
325
326         /* If ocfs2_read_block() got us a new bh, pass it up. */
327         if (!*bh)
328                 *bh = tmp;
329
330 out:
331         return rc;
332 }
333
334 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
335                                           struct ocfs2_group_desc *bg,
336                                           struct ocfs2_chain_list *cl,
337                                           u64 p_blkno, unsigned int clusters)
338 {
339         struct ocfs2_extent_list *el = &bg->bg_list;
340         struct ocfs2_extent_rec *rec;
341
342         BUG_ON(!ocfs2_supports_discontig_bg(osb));
343         if (!el->l_next_free_rec)
344                 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
345         rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
346         rec->e_blkno = cpu_to_le64(p_blkno);
347         rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
348                                   le16_to_cpu(cl->cl_bpc));
349         rec->e_leaf_clusters = cpu_to_le16(clusters);
350         le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
351         le16_add_cpu(&bg->bg_free_bits_count,
352                      clusters * le16_to_cpu(cl->cl_bpc));
353         le16_add_cpu(&el->l_next_free_rec, 1);
354 }
355
356 static int ocfs2_block_group_fill(handle_t *handle,
357                                   struct inode *alloc_inode,
358                                   struct buffer_head *bg_bh,
359                                   u64 group_blkno,
360                                   unsigned int group_clusters,
361                                   u16 my_chain,
362                                   struct ocfs2_chain_list *cl)
363 {
364         int status = 0;
365         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
366         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
367         struct super_block * sb = alloc_inode->i_sb;
368
369         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
370                 status = ocfs2_error(alloc_inode->i_sb,
371                                      "group block (%llu) != b_blocknr (%llu)\n",
372                                      (unsigned long long)group_blkno,
373                                      (unsigned long long) bg_bh->b_blocknr);
374                 goto bail;
375         }
376
377         status = ocfs2_journal_access_gd(handle,
378                                          INODE_CACHE(alloc_inode),
379                                          bg_bh,
380                                          OCFS2_JOURNAL_ACCESS_CREATE);
381         if (status < 0) {
382                 mlog_errno(status);
383                 goto bail;
384         }
385
386         memset(bg, 0, sb->s_blocksize);
387         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
388         bg->bg_generation = cpu_to_le32(osb->fs_generation);
389         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
390                                                 osb->s_feature_incompat));
391         bg->bg_chain = cpu_to_le16(my_chain);
392         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
393         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
394         bg->bg_blkno = cpu_to_le64(group_blkno);
395         if (group_clusters == le16_to_cpu(cl->cl_cpg))
396                 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
397         else
398                 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
399                                               group_clusters);
400
401         /* set the 1st bit in the bitmap to account for the descriptor block */
402         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
403         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
404
405         ocfs2_journal_dirty(handle, bg_bh);
406
407         /* There is no need to zero out or otherwise initialize the
408          * other blocks in a group - All valid FS metadata in a block
409          * group stores the superblock fs_generation value at
410          * allocation time. */
411
412 bail:
413         if (status)
414                 mlog_errno(status);
415         return status;
416 }
417
418 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
419 {
420         u16 curr, best;
421
422         best = curr = 0;
423         while (curr < le16_to_cpu(cl->cl_count)) {
424                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
425                     le32_to_cpu(cl->cl_recs[curr].c_total))
426                         best = curr;
427                 curr++;
428         }
429         return best;
430 }
431
432 static struct buffer_head *
433 ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
434                                struct inode *alloc_inode,
435                                struct ocfs2_alloc_context *ac,
436                                struct ocfs2_chain_list *cl)
437 {
438         int status;
439         u32 bit_off, num_bits;
440         u64 bg_blkno;
441         struct buffer_head *bg_bh;
442         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
443
444         status = ocfs2_claim_clusters(handle, ac,
445                                       le16_to_cpu(cl->cl_cpg), &bit_off,
446                                       &num_bits);
447         if (status < 0) {
448                 if (status != -ENOSPC)
449                         mlog_errno(status);
450                 goto bail;
451         }
452
453         /* setup the group */
454         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
455         trace_ocfs2_block_group_alloc_contig(
456              (unsigned long long)bg_blkno, alloc_rec);
457
458         bg_bh = sb_getblk(osb->sb, bg_blkno);
459         if (!bg_bh) {
460                 status = -ENOMEM;
461                 mlog_errno(status);
462                 goto bail;
463         }
464         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
465
466         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
467                                         bg_blkno, num_bits, alloc_rec, cl);
468         if (status < 0) {
469                 brelse(bg_bh);
470                 mlog_errno(status);
471         }
472
473 bail:
474         return status ? ERR_PTR(status) : bg_bh;
475 }
476
477 static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
478                                         handle_t *handle,
479                                         struct ocfs2_alloc_context *ac,
480                                         unsigned int min_bits,
481                                         u32 *bit_off, u32 *num_bits)
482 {
483         int status = 0;
484
485         while (min_bits) {
486                 status = ocfs2_claim_clusters(handle, ac, min_bits,
487                                               bit_off, num_bits);
488                 if (status != -ENOSPC)
489                         break;
490
491                 min_bits >>= 1;
492         }
493
494         return status;
495 }
496
497 static int ocfs2_block_group_grow_discontig(handle_t *handle,
498                                             struct inode *alloc_inode,
499                                             struct buffer_head *bg_bh,
500                                             struct ocfs2_alloc_context *ac,
501                                             struct ocfs2_chain_list *cl,
502                                             unsigned int min_bits)
503 {
504         int status;
505         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
506         struct ocfs2_group_desc *bg =
507                 (struct ocfs2_group_desc *)bg_bh->b_data;
508         unsigned int needed = le16_to_cpu(cl->cl_cpg) -
509                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
510         u32 p_cpos, clusters;
511         u64 p_blkno;
512         struct ocfs2_extent_list *el = &bg->bg_list;
513
514         status = ocfs2_journal_access_gd(handle,
515                                          INODE_CACHE(alloc_inode),
516                                          bg_bh,
517                                          OCFS2_JOURNAL_ACCESS_CREATE);
518         if (status < 0) {
519                 mlog_errno(status);
520                 goto bail;
521         }
522
523         while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
524                                 le16_to_cpu(el->l_count))) {
525                 if (min_bits > needed)
526                         min_bits = needed;
527                 status = ocfs2_block_group_claim_bits(osb, handle, ac,
528                                                       min_bits, &p_cpos,
529                                                       &clusters);
530                 if (status < 0) {
531                         if (status != -ENOSPC)
532                                 mlog_errno(status);
533                         goto bail;
534                 }
535                 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
536                 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
537                                               clusters);
538
539                 min_bits = clusters;
540                 needed = le16_to_cpu(cl->cl_cpg) -
541                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
542         }
543
544         if (needed > 0) {
545                 /*
546                  * We have used up all the extent rec but can't fill up
547                  * the cpg. So bail out.
548                  */
549                 status = -ENOSPC;
550                 goto bail;
551         }
552
553         ocfs2_journal_dirty(handle, bg_bh);
554
555 bail:
556         return status;
557 }
558
559 static void ocfs2_bg_alloc_cleanup(handle_t *handle,
560                                    struct ocfs2_alloc_context *cluster_ac,
561                                    struct inode *alloc_inode,
562                                    struct buffer_head *bg_bh)
563 {
564         int i, ret;
565         struct ocfs2_group_desc *bg;
566         struct ocfs2_extent_list *el;
567         struct ocfs2_extent_rec *rec;
568
569         if (!bg_bh)
570                 return;
571
572         bg = (struct ocfs2_group_desc *)bg_bh->b_data;
573         el = &bg->bg_list;
574         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
575                 rec = &el->l_recs[i];
576                 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
577                                           cluster_ac->ac_bh,
578                                           le64_to_cpu(rec->e_blkno),
579                                           le16_to_cpu(rec->e_leaf_clusters));
580                 if (ret)
581                         mlog_errno(ret);
582                 /* Try all the clusters to free */
583         }
584
585         ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
586         brelse(bg_bh);
587 }
588
589 static struct buffer_head *
590 ocfs2_block_group_alloc_discontig(handle_t *handle,
591                                   struct inode *alloc_inode,
592                                   struct ocfs2_alloc_context *ac,
593                                   struct ocfs2_chain_list *cl)
594 {
595         int status;
596         u32 bit_off, num_bits;
597         u64 bg_blkno;
598         unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
599         struct buffer_head *bg_bh = NULL;
600         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
601         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
602
603         if (!ocfs2_supports_discontig_bg(osb)) {
604                 status = -ENOSPC;
605                 goto bail;
606         }
607
608         status = ocfs2_extend_trans(handle,
609                                     ocfs2_calc_bg_discontig_credits(osb->sb));
610         if (status) {
611                 mlog_errno(status);
612                 goto bail;
613         }
614
615         /*
616          * We're going to be grabbing from multiple cluster groups.
617          * We don't have enough credits to relink them all, and the
618          * cluster groups will be staying in cache for the duration of
619          * this operation.
620          */
621         ac->ac_disable_chain_relink = 1;
622
623         /* Claim the first region */
624         status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
625                                               &bit_off, &num_bits);
626         if (status < 0) {
627                 if (status != -ENOSPC)
628                         mlog_errno(status);
629                 goto bail;
630         }
631         min_bits = num_bits;
632
633         /* setup the group */
634         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
635         trace_ocfs2_block_group_alloc_discontig(
636                                 (unsigned long long)bg_blkno, alloc_rec);
637
638         bg_bh = sb_getblk(osb->sb, bg_blkno);
639         if (!bg_bh) {
640                 status = -ENOMEM;
641                 mlog_errno(status);
642                 goto bail;
643         }
644         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
645
646         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
647                                         bg_blkno, num_bits, alloc_rec, cl);
648         if (status < 0) {
649                 mlog_errno(status);
650                 goto bail;
651         }
652
653         status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
654                                                   bg_bh, ac, cl, min_bits);
655         if (status)
656                 mlog_errno(status);
657
658 bail:
659         if (status)
660                 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
661         return status ? ERR_PTR(status) : bg_bh;
662 }
663
664 /*
665  * We expect the block group allocator to already be locked.
666  */
667 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
668                                    struct inode *alloc_inode,
669                                    struct buffer_head *bh,
670                                    u64 max_block,
671                                    u64 *last_alloc_group,
672                                    int flags)
673 {
674         int status, credits;
675         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
676         struct ocfs2_chain_list *cl;
677         struct ocfs2_alloc_context *ac = NULL;
678         handle_t *handle = NULL;
679         u16 alloc_rec;
680         struct buffer_head *bg_bh = NULL;
681         struct ocfs2_group_desc *bg;
682
683         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
684
685         cl = &fe->id2.i_chain;
686         status = ocfs2_reserve_clusters_with_limit(osb,
687                                                    le16_to_cpu(cl->cl_cpg),
688                                                    max_block, flags, &ac);
689         if (status < 0) {
690                 if (status != -ENOSPC)
691                         mlog_errno(status);
692                 goto bail;
693         }
694
695         credits = ocfs2_calc_group_alloc_credits(osb->sb,
696                                                  le16_to_cpu(cl->cl_cpg));
697         handle = ocfs2_start_trans(osb, credits);
698         if (IS_ERR(handle)) {
699                 status = PTR_ERR(handle);
700                 handle = NULL;
701                 mlog_errno(status);
702                 goto bail;
703         }
704
705         if (last_alloc_group && *last_alloc_group != 0) {
706                 trace_ocfs2_block_group_alloc(
707                                 (unsigned long long)*last_alloc_group);
708                 ac->ac_last_group = *last_alloc_group;
709         }
710
711         bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
712                                                ac, cl);
713         if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
714                 bg_bh = ocfs2_block_group_alloc_discontig(handle,
715                                                           alloc_inode,
716                                                           ac, cl);
717         if (IS_ERR(bg_bh)) {
718                 status = PTR_ERR(bg_bh);
719                 bg_bh = NULL;
720                 if (status != -ENOSPC)
721                         mlog_errno(status);
722                 goto bail;
723         }
724         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
725
726         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
727                                          bh, OCFS2_JOURNAL_ACCESS_WRITE);
728         if (status < 0) {
729                 mlog_errno(status);
730                 goto bail;
731         }
732
733         alloc_rec = le16_to_cpu(bg->bg_chain);
734         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
735                      le16_to_cpu(bg->bg_free_bits_count));
736         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
737                      le16_to_cpu(bg->bg_bits));
738         cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
739         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
740                 le16_add_cpu(&cl->cl_next_free_rec, 1);
741
742         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
743                                         le16_to_cpu(bg->bg_free_bits_count));
744         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
745         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
746
747         ocfs2_journal_dirty(handle, bh);
748
749         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
750         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
751         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
752                                              le32_to_cpu(fe->i_clusters)));
753         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
754         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
755         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
756         ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
757
758         status = 0;
759
760         /* save the new last alloc group so that the caller can cache it. */
761         if (last_alloc_group)
762                 *last_alloc_group = ac->ac_last_group;
763
764 bail:
765         if (handle)
766                 ocfs2_commit_trans(osb, handle);
767
768         if (ac)
769                 ocfs2_free_alloc_context(ac);
770
771         brelse(bg_bh);
772
773         if (status)
774                 mlog_errno(status);
775         return status;
776 }
777
778 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
779                                        struct ocfs2_alloc_context *ac,
780                                        int type,
781                                        u32 slot,
782                                        u64 *last_alloc_group,
783                                        int flags)
784 {
785         int status;
786         u32 bits_wanted = ac->ac_bits_wanted;
787         struct inode *alloc_inode;
788         struct buffer_head *bh = NULL;
789         struct ocfs2_dinode *fe;
790         u32 free_bits;
791
792         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
793         if (!alloc_inode) {
794                 mlog_errno(-EINVAL);
795                 return -EINVAL;
796         }
797
798         inode_lock(alloc_inode);
799
800         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
801         if (status < 0) {
802                 inode_unlock(alloc_inode);
803                 iput(alloc_inode);
804
805                 mlog_errno(status);
806                 return status;
807         }
808
809         ac->ac_inode = alloc_inode;
810         ac->ac_alloc_slot = slot;
811
812         fe = (struct ocfs2_dinode *) bh->b_data;
813
814         /* The bh was validated by the inode read inside
815          * ocfs2_inode_lock().  Any corruption is a code bug. */
816         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
817
818         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
819                 status = ocfs2_error(alloc_inode->i_sb,
820                                      "Invalid chain allocator %llu\n",
821                                      (unsigned long long)le64_to_cpu(fe->i_blkno));
822                 goto bail;
823         }
824
825         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
826                 le32_to_cpu(fe->id1.bitmap1.i_used);
827
828         if (bits_wanted > free_bits) {
829                 /* cluster bitmap never grows */
830                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
831                         trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
832                                                                 free_bits);
833                         status = -ENOSPC;
834                         goto bail;
835                 }
836
837                 if (!(flags & ALLOC_NEW_GROUP)) {
838                         trace_ocfs2_reserve_suballoc_bits_no_new_group(
839                                                 slot, bits_wanted, free_bits);
840                         status = -ENOSPC;
841                         goto bail;
842                 }
843
844                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
845                                                  ac->ac_max_block,
846                                                  last_alloc_group, flags);
847                 if (status < 0) {
848                         if (status != -ENOSPC)
849                                 mlog_errno(status);
850                         goto bail;
851                 }
852                 atomic_inc(&osb->alloc_stats.bg_extends);
853
854                 /* You should never ask for this much metadata */
855                 BUG_ON(bits_wanted >
856                        (le32_to_cpu(fe->id1.bitmap1.i_total)
857                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
858         }
859
860         get_bh(bh);
861         ac->ac_bh = bh;
862 bail:
863         brelse(bh);
864
865         if (status)
866                 mlog_errno(status);
867         return status;
868 }
869
870 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
871 {
872         spin_lock(&osb->osb_lock);
873         osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
874         spin_unlock(&osb->osb_lock);
875         atomic_set(&osb->s_num_inodes_stolen, 0);
876 }
877
878 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
879 {
880         spin_lock(&osb->osb_lock);
881         osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
882         spin_unlock(&osb->osb_lock);
883         atomic_set(&osb->s_num_meta_stolen, 0);
884 }
885
886 void ocfs2_init_steal_slots(struct ocfs2_super *osb)
887 {
888         ocfs2_init_inode_steal_slot(osb);
889         ocfs2_init_meta_steal_slot(osb);
890 }
891
892 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
893 {
894         spin_lock(&osb->osb_lock);
895         if (type == INODE_ALLOC_SYSTEM_INODE)
896                 osb->s_inode_steal_slot = slot;
897         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
898                 osb->s_meta_steal_slot = slot;
899         spin_unlock(&osb->osb_lock);
900 }
901
902 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
903 {
904         int slot = OCFS2_INVALID_SLOT;
905
906         spin_lock(&osb->osb_lock);
907         if (type == INODE_ALLOC_SYSTEM_INODE)
908                 slot = osb->s_inode_steal_slot;
909         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
910                 slot = osb->s_meta_steal_slot;
911         spin_unlock(&osb->osb_lock);
912
913         return slot;
914 }
915
916 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
917 {
918         return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
919 }
920
921 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
922 {
923         return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
924 }
925
926 static int ocfs2_steal_resource(struct ocfs2_super *osb,
927                                 struct ocfs2_alloc_context *ac,
928                                 int type)
929 {
930         int i, status = -ENOSPC;
931         int slot = __ocfs2_get_steal_slot(osb, type);
932
933         /* Start to steal resource from the first slot after ours. */
934         if (slot == OCFS2_INVALID_SLOT)
935                 slot = osb->slot_num + 1;
936
937         for (i = 0; i < osb->max_slots; i++, slot++) {
938                 if (slot == osb->max_slots)
939                         slot = 0;
940
941                 if (slot == osb->slot_num)
942                         continue;
943
944                 status = ocfs2_reserve_suballoc_bits(osb, ac,
945                                                      type,
946                                                      (u32)slot, NULL,
947                                                      NOT_ALLOC_NEW_GROUP);
948                 if (status >= 0) {
949                         __ocfs2_set_steal_slot(osb, slot, type);
950                         break;
951                 }
952
953                 ocfs2_free_ac_resource(ac);
954         }
955
956         return status;
957 }
958
959 static int ocfs2_steal_inode(struct ocfs2_super *osb,
960                              struct ocfs2_alloc_context *ac)
961 {
962         return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
963 }
964
965 static int ocfs2_steal_meta(struct ocfs2_super *osb,
966                             struct ocfs2_alloc_context *ac)
967 {
968         return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
969 }
970
971 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
972                                       int blocks,
973                                       struct ocfs2_alloc_context **ac)
974 {
975         int status;
976         int slot = ocfs2_get_meta_steal_slot(osb);
977
978         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
979         if (!(*ac)) {
980                 status = -ENOMEM;
981                 mlog_errno(status);
982                 goto bail;
983         }
984
985         (*ac)->ac_bits_wanted = blocks;
986         (*ac)->ac_which = OCFS2_AC_USE_META;
987         (*ac)->ac_group_search = ocfs2_block_group_search;
988
989         if (slot != OCFS2_INVALID_SLOT &&
990                 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
991                 goto extent_steal;
992
993         atomic_set(&osb->s_num_meta_stolen, 0);
994         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
995                                              EXTENT_ALLOC_SYSTEM_INODE,
996                                              (u32)osb->slot_num, NULL,
997                                              ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
998
999
1000         if (status >= 0) {
1001                 status = 0;
1002                 if (slot != OCFS2_INVALID_SLOT)
1003                         ocfs2_init_meta_steal_slot(osb);
1004                 goto bail;
1005         } else if (status < 0 && status != -ENOSPC) {
1006                 mlog_errno(status);
1007                 goto bail;
1008         }
1009
1010         ocfs2_free_ac_resource(*ac);
1011
1012 extent_steal:
1013         status = ocfs2_steal_meta(osb, *ac);
1014         atomic_inc(&osb->s_num_meta_stolen);
1015         if (status < 0) {
1016                 if (status != -ENOSPC)
1017                         mlog_errno(status);
1018                 goto bail;
1019         }
1020
1021         status = 0;
1022 bail:
1023         if ((status < 0) && *ac) {
1024                 ocfs2_free_alloc_context(*ac);
1025                 *ac = NULL;
1026         }
1027
1028         if (status)
1029                 mlog_errno(status);
1030         return status;
1031 }
1032
1033 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
1034                                struct ocfs2_extent_list *root_el,
1035                                struct ocfs2_alloc_context **ac)
1036 {
1037         return ocfs2_reserve_new_metadata_blocks(osb,
1038                                         ocfs2_extend_meta_needed(root_el),
1039                                         ac);
1040 }
1041
1042 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1043                             struct ocfs2_alloc_context **ac)
1044 {
1045         int status;
1046         int slot = ocfs2_get_inode_steal_slot(osb);
1047         u64 alloc_group;
1048
1049         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1050         if (!(*ac)) {
1051                 status = -ENOMEM;
1052                 mlog_errno(status);
1053                 goto bail;
1054         }
1055
1056         (*ac)->ac_bits_wanted = 1;
1057         (*ac)->ac_which = OCFS2_AC_USE_INODE;
1058
1059         (*ac)->ac_group_search = ocfs2_block_group_search;
1060
1061         /*
1062          * stat(2) can't handle i_ino > 32bits, so we tell the
1063          * lower levels not to allocate us a block group past that
1064          * limit.  The 'inode64' mount option avoids this behavior.
1065          */
1066         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
1067                 (*ac)->ac_max_block = (u32)~0U;
1068
1069         /*
1070          * slot is set when we successfully steal inode from other nodes.
1071          * It is reset in 3 places:
1072          * 1. when we flush the truncate log
1073          * 2. when we complete local alloc recovery.
1074          * 3. when we successfully allocate from our own slot.
1075          * After it is set, we will go on stealing inodes until we find the
1076          * need to check our slots to see whether there is some space for us.
1077          */
1078         if (slot != OCFS2_INVALID_SLOT &&
1079             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
1080                 goto inode_steal;
1081
1082         atomic_set(&osb->s_num_inodes_stolen, 0);
1083         alloc_group = osb->osb_inode_alloc_group;
1084         status = ocfs2_reserve_suballoc_bits(osb, *ac,
1085                                              INODE_ALLOC_SYSTEM_INODE,
1086                                              (u32)osb->slot_num,
1087                                              &alloc_group,
1088                                              ALLOC_NEW_GROUP |
1089                                              ALLOC_GROUPS_FROM_GLOBAL);
1090         if (status >= 0) {
1091                 status = 0;
1092
1093                 spin_lock(&osb->osb_lock);
1094                 osb->osb_inode_alloc_group = alloc_group;
1095                 spin_unlock(&osb->osb_lock);
1096                 trace_ocfs2_reserve_new_inode_new_group(
1097                         (unsigned long long)alloc_group);
1098
1099                 /*
1100                  * Some inodes must be freed by us, so try to allocate
1101                  * from our own next time.
1102                  */
1103                 if (slot != OCFS2_INVALID_SLOT)
1104                         ocfs2_init_inode_steal_slot(osb);
1105                 goto bail;
1106         } else if (status < 0 && status != -ENOSPC) {
1107                 mlog_errno(status);
1108                 goto bail;
1109         }
1110
1111         ocfs2_free_ac_resource(*ac);
1112
1113 inode_steal:
1114         status = ocfs2_steal_inode(osb, *ac);
1115         atomic_inc(&osb->s_num_inodes_stolen);
1116         if (status < 0) {
1117                 if (status != -ENOSPC)
1118                         mlog_errno(status);
1119                 goto bail;
1120         }
1121
1122         status = 0;
1123 bail:
1124         if ((status < 0) && *ac) {
1125                 ocfs2_free_alloc_context(*ac);
1126                 *ac = NULL;
1127         }
1128
1129         if (status)
1130                 mlog_errno(status);
1131         return status;
1132 }
1133
1134 /* local alloc code has to do the same thing, so rather than do this
1135  * twice.. */
1136 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1137                                       struct ocfs2_alloc_context *ac)
1138 {
1139         int status;
1140
1141         ac->ac_which = OCFS2_AC_USE_MAIN;
1142         ac->ac_group_search = ocfs2_cluster_group_search;
1143
1144         status = ocfs2_reserve_suballoc_bits(osb, ac,
1145                                              GLOBAL_BITMAP_SYSTEM_INODE,
1146                                              OCFS2_INVALID_SLOT, NULL,
1147                                              ALLOC_NEW_GROUP);
1148         if (status < 0 && status != -ENOSPC)
1149                 mlog_errno(status);
1150
1151         return status;
1152 }
1153
1154 /* Callers don't need to care which bitmap (local alloc or main) to
1155  * use so we figure it out for them, but unfortunately this clutters
1156  * things a bit. */
1157 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1158                                              u32 bits_wanted, u64 max_block,
1159                                              int flags,
1160                                              struct ocfs2_alloc_context **ac)
1161 {
1162         int status, ret = 0;
1163         int retried = 0;
1164
1165         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1166         if (!(*ac)) {
1167                 status = -ENOMEM;
1168                 mlog_errno(status);
1169                 goto bail;
1170         }
1171
1172         (*ac)->ac_bits_wanted = bits_wanted;
1173         (*ac)->ac_max_block = max_block;
1174
1175         status = -ENOSPC;
1176         if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
1177             ocfs2_alloc_should_use_local(osb, bits_wanted)) {
1178                 status = ocfs2_reserve_local_alloc_bits(osb,
1179                                                         bits_wanted,
1180                                                         *ac);
1181                 if ((status < 0) && (status != -ENOSPC)) {
1182                         mlog_errno(status);
1183                         goto bail;
1184                 }
1185         }
1186
1187         if (status == -ENOSPC) {
1188 retry:
1189                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1190                 /* Retry if there is sufficient space cached in truncate log */
1191                 if (status == -ENOSPC && !retried) {
1192                         retried = 1;
1193                         ocfs2_inode_unlock((*ac)->ac_inode, 1);
1194                         inode_unlock((*ac)->ac_inode);
1195
1196                         ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
1197                         if (ret == 1) {
1198                                 iput((*ac)->ac_inode);
1199                                 (*ac)->ac_inode = NULL;
1200                                 goto retry;
1201                         }
1202
1203                         if (ret < 0)
1204                                 mlog_errno(ret);
1205
1206                         inode_lock((*ac)->ac_inode);
1207                         ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
1208                         if (ret < 0) {
1209                                 mlog_errno(ret);
1210                                 inode_unlock((*ac)->ac_inode);
1211                                 iput((*ac)->ac_inode);
1212                                 (*ac)->ac_inode = NULL;
1213                                 goto bail;
1214                         }
1215                 }
1216                 if (status < 0) {
1217                         if (status != -ENOSPC)
1218                                 mlog_errno(status);
1219                         goto bail;
1220                 }
1221         }
1222
1223         status = 0;
1224 bail:
1225         if ((status < 0) && *ac) {
1226                 ocfs2_free_alloc_context(*ac);
1227                 *ac = NULL;
1228         }
1229
1230         if (status)
1231                 mlog_errno(status);
1232         return status;
1233 }
1234
1235 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
1236                            u32 bits_wanted,
1237                            struct ocfs2_alloc_context **ac)
1238 {
1239         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
1240                                                  ALLOC_NEW_GROUP, ac);
1241 }
1242
1243 /*
1244  * More or less lifted from ext3. I'll leave their description below:
1245  *
1246  * "For ext3 allocations, we must not reuse any blocks which are
1247  * allocated in the bitmap buffer's "last committed data" copy.  This
1248  * prevents deletes from freeing up the page for reuse until we have
1249  * committed the delete transaction.
1250  *
1251  * If we didn't do this, then deleting something and reallocating it as
1252  * data would allow the old block to be overwritten before the
1253  * transaction committed (because we force data to disk before commit).
1254  * This would lead to corruption if we crashed between overwriting the
1255  * data and committing the delete.
1256  *
1257  * @@@ We may want to make this allocation behaviour conditional on
1258  * data-writes at some point, and disable it for metadata allocations or
1259  * sync-data inodes."
1260  *
1261  * Note: OCFS2 already does this differently for metadata vs data
1262  * allocations, as those bitmaps are separate and undo access is never
1263  * called on a metadata group descriptor.
1264  */
1265 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1266                                          int nr)
1267 {
1268         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1269         int ret;
1270
1271         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1272                 return 0;
1273
1274         if (!buffer_jbd(bg_bh))
1275                 return 1;
1276
1277         jbd_lock_bh_state(bg_bh);
1278         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1279         if (bg)
1280                 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1281         else
1282                 ret = 1;
1283         jbd_unlock_bh_state(bg_bh);
1284
1285         return ret;
1286 }
1287
1288 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1289                                              struct buffer_head *bg_bh,
1290                                              unsigned int bits_wanted,
1291                                              unsigned int total_bits,
1292                                              struct ocfs2_suballoc_result *res)
1293 {
1294         void *bitmap;
1295         u16 best_offset, best_size;
1296         int offset, start, found, status = 0;
1297         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1298
1299         /* Callers got this descriptor from
1300          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1301         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1302
1303         found = start = best_offset = best_size = 0;
1304         bitmap = bg->bg_bitmap;
1305
1306         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1307                 if (offset == total_bits)
1308                         break;
1309
1310                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1311                         /* We found a zero, but we can't use it as it
1312                          * hasn't been put to disk yet! */
1313                         found = 0;
1314                         start = offset + 1;
1315                 } else if (offset == start) {
1316                         /* we found a zero */
1317                         found++;
1318                         /* move start to the next bit to test */
1319                         start++;
1320                 } else {
1321                         /* got a zero after some ones */
1322                         found = 1;
1323                         start = offset + 1;
1324                 }
1325                 if (found > best_size) {
1326                         best_size = found;
1327                         best_offset = start - found;
1328                 }
1329                 /* we got everything we needed */
1330                 if (found == bits_wanted) {
1331                         /* mlog(0, "Found it all!\n"); */
1332                         break;
1333                 }
1334         }
1335
1336         if (best_size) {
1337                 res->sr_bit_offset = best_offset;
1338                 res->sr_bits = best_size;
1339         } else {
1340                 status = -ENOSPC;
1341                 /* No error log here -- see the comment above
1342                  * ocfs2_test_bg_bit_allocatable */
1343         }
1344
1345         return status;
1346 }
1347
1348 int ocfs2_block_group_set_bits(handle_t *handle,
1349                                              struct inode *alloc_inode,
1350                                              struct ocfs2_group_desc *bg,
1351                                              struct buffer_head *group_bh,
1352                                              unsigned int bit_off,
1353                                              unsigned int num_bits)
1354 {
1355         int status;
1356         void *bitmap = bg->bg_bitmap;
1357         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1358
1359         /* All callers get the descriptor via
1360          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1361         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1362         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1363
1364         trace_ocfs2_block_group_set_bits(bit_off, num_bits);
1365
1366         if (ocfs2_is_cluster_bitmap(alloc_inode))
1367                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1368
1369         status = ocfs2_journal_access_gd(handle,
1370                                          INODE_CACHE(alloc_inode),
1371                                          group_bh,
1372                                          journal_type);
1373         if (status < 0) {
1374                 mlog_errno(status);
1375                 goto bail;
1376         }
1377
1378         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1379         if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1380                 return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
1381                                    (unsigned long long)le64_to_cpu(bg->bg_blkno),
1382                                    le16_to_cpu(bg->bg_bits),
1383                                    le16_to_cpu(bg->bg_free_bits_count),
1384                                    num_bits);
1385         }
1386         while(num_bits--)
1387                 ocfs2_set_bit(bit_off++, bitmap);
1388
1389         ocfs2_journal_dirty(handle, group_bh);
1390
1391 bail:
1392         return status;
1393 }
1394
1395 /* find the one with the most empty bits */
1396 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1397 {
1398         u16 curr, best;
1399
1400         BUG_ON(!cl->cl_next_free_rec);
1401
1402         best = curr = 0;
1403         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1404                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1405                     le32_to_cpu(cl->cl_recs[best].c_free))
1406                         best = curr;
1407                 curr++;
1408         }
1409
1410         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1411         return best;
1412 }
1413
1414 static int ocfs2_relink_block_group(handle_t *handle,
1415                                     struct inode *alloc_inode,
1416                                     struct buffer_head *fe_bh,
1417                                     struct buffer_head *bg_bh,
1418                                     struct buffer_head *prev_bg_bh,
1419                                     u16 chain)
1420 {
1421         int status;
1422         /* there is a really tiny chance the journal calls could fail,
1423          * but we wouldn't want inconsistent blocks in *any* case. */
1424         u64 bg_ptr, prev_bg_ptr;
1425         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1426         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1427         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1428
1429         /* The caller got these descriptors from
1430          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1431         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1432         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1433
1434         trace_ocfs2_relink_block_group(
1435                 (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1436                 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1437                 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1438
1439         bg_ptr = le64_to_cpu(bg->bg_next_group);
1440         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1441
1442         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1443                                          prev_bg_bh,
1444                                          OCFS2_JOURNAL_ACCESS_WRITE);
1445         if (status < 0)
1446                 goto out;
1447
1448         prev_bg->bg_next_group = bg->bg_next_group;
1449         ocfs2_journal_dirty(handle, prev_bg_bh);
1450
1451         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1452                                          bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1453         if (status < 0)
1454                 goto out_rollback_prev_bg;
1455
1456         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1457         ocfs2_journal_dirty(handle, bg_bh);
1458
1459         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1460                                          fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1461         if (status < 0)
1462                 goto out_rollback_bg;
1463
1464         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1465         ocfs2_journal_dirty(handle, fe_bh);
1466
1467 out:
1468         if (status < 0)
1469                 mlog_errno(status);
1470         return status;
1471
1472 out_rollback_bg:
1473         bg->bg_next_group = cpu_to_le64(bg_ptr);
1474 out_rollback_prev_bg:
1475         prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1476         goto out;
1477 }
1478
1479 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1480                                                      u32 wanted)
1481 {
1482         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1483 }
1484
1485 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1486  * value on error. */
1487 static int ocfs2_cluster_group_search(struct inode *inode,
1488                                       struct buffer_head *group_bh,
1489                                       u32 bits_wanted, u32 min_bits,
1490                                       u64 max_block,
1491                                       struct ocfs2_suballoc_result *res)
1492 {
1493         int search = -ENOSPC;
1494         int ret;
1495         u64 blkoff;
1496         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1497         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1498         unsigned int max_bits, gd_cluster_off;
1499
1500         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1501
1502         if (gd->bg_free_bits_count) {
1503                 max_bits = le16_to_cpu(gd->bg_bits);
1504
1505                 /* Tail groups in cluster bitmaps which aren't cpg
1506                  * aligned are prone to partial extension by a failed
1507                  * fs resize. If the file system resize never got to
1508                  * update the dinode cluster count, then we don't want
1509                  * to trust any clusters past it, regardless of what
1510                  * the group descriptor says. */
1511                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1512                                                           le64_to_cpu(gd->bg_blkno));
1513                 if ((gd_cluster_off + max_bits) >
1514                     OCFS2_I(inode)->ip_clusters) {
1515                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1516                         trace_ocfs2_cluster_group_search_wrong_max_bits(
1517                                 (unsigned long long)le64_to_cpu(gd->bg_blkno),
1518                                 le16_to_cpu(gd->bg_bits),
1519                                 OCFS2_I(inode)->ip_clusters, max_bits);
1520                 }
1521
1522                 ret = ocfs2_block_group_find_clear_bits(osb,
1523                                                         group_bh, bits_wanted,
1524                                                         max_bits, res);
1525                 if (ret)
1526                         return ret;
1527
1528                 if (max_block) {
1529                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1530                                                           gd_cluster_off +
1531                                                           res->sr_bit_offset +
1532                                                           res->sr_bits);
1533                         trace_ocfs2_cluster_group_search_max_block(
1534                                 (unsigned long long)blkoff,
1535                                 (unsigned long long)max_block);
1536                         if (blkoff > max_block)
1537                                 return -ENOSPC;
1538                 }
1539
1540                 /* ocfs2_block_group_find_clear_bits() might
1541                  * return success, but we still want to return
1542                  * -ENOSPC unless it found the minimum number
1543                  * of bits. */
1544                 if (min_bits <= res->sr_bits)
1545                         search = 0; /* success */
1546                 else if (res->sr_bits) {
1547                         /*
1548                          * Don't show bits which we'll be returning
1549                          * for allocation to the local alloc bitmap.
1550                          */
1551                         ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1552                 }
1553         }
1554
1555         return search;
1556 }
1557
1558 static int ocfs2_block_group_search(struct inode *inode,
1559                                     struct buffer_head *group_bh,
1560                                     u32 bits_wanted, u32 min_bits,
1561                                     u64 max_block,
1562                                     struct ocfs2_suballoc_result *res)
1563 {
1564         int ret = -ENOSPC;
1565         u64 blkoff;
1566         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1567
1568         BUG_ON(min_bits != 1);
1569         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1570
1571         if (bg->bg_free_bits_count) {
1572                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1573                                                         group_bh, bits_wanted,
1574                                                         le16_to_cpu(bg->bg_bits),
1575                                                         res);
1576                 if (!ret && max_block) {
1577                         blkoff = le64_to_cpu(bg->bg_blkno) +
1578                                 res->sr_bit_offset + res->sr_bits;
1579                         trace_ocfs2_block_group_search_max_block(
1580                                 (unsigned long long)blkoff,
1581                                 (unsigned long long)max_block);
1582                         if (blkoff > max_block)
1583                                 ret = -ENOSPC;
1584                 }
1585         }
1586
1587         return ret;
1588 }
1589
1590 int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1591                                        handle_t *handle,
1592                                        struct buffer_head *di_bh,
1593                                        u32 num_bits,
1594                                        u16 chain)
1595 {
1596         int ret;
1597         u32 tmp_used;
1598         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1599         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1600
1601         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1602                                       OCFS2_JOURNAL_ACCESS_WRITE);
1603         if (ret < 0) {
1604                 mlog_errno(ret);
1605                 goto out;
1606         }
1607
1608         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1609         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1610         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1611         ocfs2_journal_dirty(handle, di_bh);
1612
1613 out:
1614         return ret;
1615 }
1616
1617 void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1618                                        struct buffer_head *di_bh,
1619                                        u32 num_bits,
1620                                        u16 chain)
1621 {
1622         u32 tmp_used;
1623         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1624         struct ocfs2_chain_list *cl;
1625
1626         cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1627         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1628         di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1629         le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1630 }
1631
1632 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1633                                          struct ocfs2_extent_rec *rec,
1634                                          struct ocfs2_chain_list *cl)
1635 {
1636         unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1637         unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1638         unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
1639
1640         if (res->sr_bit_offset < bitoff)
1641                 return 0;
1642         if (res->sr_bit_offset >= (bitoff + bitcount))
1643                 return 0;
1644         res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1645                 (res->sr_bit_offset - bitoff);
1646         if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1647                 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1648         return 1;
1649 }
1650
1651 static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1652                                           struct ocfs2_group_desc *bg,
1653                                           struct ocfs2_suballoc_result *res)
1654 {
1655         int i;
1656         u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
1657         struct ocfs2_extent_rec *rec;
1658         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1659         struct ocfs2_chain_list *cl = &di->id2.i_chain;
1660
1661         if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1662                 res->sr_blkno = 0;
1663                 return;
1664         }
1665
1666         res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1667         res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
1668         if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1669             !bg->bg_list.l_next_free_rec)
1670                 return;
1671
1672         for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1673                 rec = &bg->bg_list.l_recs[i];
1674                 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1675                         res->sr_bg_blkno = bg_blkno;  /* Restore */
1676                         break;
1677                 }
1678         }
1679 }
1680
1681 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1682                                   handle_t *handle,
1683                                   u32 bits_wanted,
1684                                   u32 min_bits,
1685                                   struct ocfs2_suballoc_result *res,
1686                                   u16 *bits_left)
1687 {
1688         int ret;
1689         struct buffer_head *group_bh = NULL;
1690         struct ocfs2_group_desc *gd;
1691         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1692         struct inode *alloc_inode = ac->ac_inode;
1693
1694         ret = ocfs2_read_group_descriptor(alloc_inode, di,
1695                                           res->sr_bg_blkno, &group_bh);
1696         if (ret < 0) {
1697                 mlog_errno(ret);
1698                 return ret;
1699         }
1700
1701         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1702         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1703                                   ac->ac_max_block, res);
1704         if (ret < 0) {
1705                 if (ret != -ENOSPC)
1706                         mlog_errno(ret);
1707                 goto out;
1708         }
1709
1710         if (!ret)
1711                 ocfs2_bg_discontig_fix_result(ac, gd, res);
1712
1713         /*
1714          * sr_bg_blkno might have been changed by
1715          * ocfs2_bg_discontig_fix_result
1716          */
1717         res->sr_bg_stable_blkno = group_bh->b_blocknr;
1718
1719         if (ac->ac_find_loc_only)
1720                 goto out_loc_only;
1721
1722         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1723                                                res->sr_bits,
1724                                                le16_to_cpu(gd->bg_chain));
1725         if (ret < 0) {
1726                 mlog_errno(ret);
1727                 goto out;
1728         }
1729
1730         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1731                                          res->sr_bit_offset, res->sr_bits);
1732         if (ret < 0) {
1733                 ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
1734                                                res->sr_bits,
1735                                                le16_to_cpu(gd->bg_chain));
1736                 mlog_errno(ret);
1737         }
1738
1739 out_loc_only:
1740         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1741
1742 out:
1743         brelse(group_bh);
1744
1745         return ret;
1746 }
1747
1748 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1749                               handle_t *handle,
1750                               u32 bits_wanted,
1751                               u32 min_bits,
1752                               struct ocfs2_suballoc_result *res,
1753                               u16 *bits_left)
1754 {
1755         int status;
1756         u16 chain;
1757         u64 next_group;
1758         struct inode *alloc_inode = ac->ac_inode;
1759         struct buffer_head *group_bh = NULL;
1760         struct buffer_head *prev_group_bh = NULL;
1761         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1762         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1763         struct ocfs2_group_desc *bg;
1764
1765         chain = ac->ac_chain;
1766         trace_ocfs2_search_chain_begin(
1767                 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
1768                 bits_wanted, chain);
1769
1770         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1771                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1772                                              &group_bh);
1773         if (status < 0) {
1774                 mlog_errno(status);
1775                 goto bail;
1776         }
1777         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1778
1779         status = -ENOSPC;
1780         /* for now, the chain search is a bit simplistic. We just use
1781          * the 1st group with any empty bits. */
1782         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1783                                              bits_wanted, min_bits,
1784                                              ac->ac_max_block,
1785                                              res)) == -ENOSPC) {
1786                 if (!bg->bg_next_group)
1787                         break;
1788
1789                 brelse(prev_group_bh);
1790                 prev_group_bh = NULL;
1791
1792                 next_group = le64_to_cpu(bg->bg_next_group);
1793                 prev_group_bh = group_bh;
1794                 group_bh = NULL;
1795                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1796                                                      next_group, &group_bh);
1797                 if (status < 0) {
1798                         mlog_errno(status);
1799                         goto bail;
1800                 }
1801                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1802         }
1803         if (status < 0) {
1804                 if (status != -ENOSPC)
1805                         mlog_errno(status);
1806                 goto bail;
1807         }
1808
1809         trace_ocfs2_search_chain_succ(
1810                 (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
1811
1812         res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1813
1814         BUG_ON(res->sr_bits == 0);
1815         if (!status)
1816                 ocfs2_bg_discontig_fix_result(ac, bg, res);
1817
1818         /*
1819          * sr_bg_blkno might have been changed by
1820          * ocfs2_bg_discontig_fix_result
1821          */
1822         res->sr_bg_stable_blkno = group_bh->b_blocknr;
1823
1824         /*
1825          * Keep track of previous block descriptor read. When
1826          * we find a target, if we have read more than X
1827          * number of descriptors, and the target is reasonably
1828          * empty, relink him to top of his chain.
1829          *
1830          * We've read 0 extra blocks and only send one more to
1831          * the transaction, yet the next guy to search has a
1832          * much easier time.
1833          *
1834          * Do this *after* figuring out how many bits we're taking out
1835          * of our target group.
1836          */
1837         if (!ac->ac_disable_chain_relink &&
1838             (prev_group_bh) &&
1839             (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1840                 status = ocfs2_relink_block_group(handle, alloc_inode,
1841                                                   ac->ac_bh, group_bh,
1842                                                   prev_group_bh, chain);
1843                 if (status < 0) {
1844                         mlog_errno(status);
1845                         goto bail;
1846                 }
1847         }
1848
1849         if (ac->ac_find_loc_only)
1850                 goto out_loc_only;
1851
1852         status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1853                                                   ac->ac_bh, res->sr_bits,
1854                                                   chain);
1855         if (status) {
1856                 mlog_errno(status);
1857                 goto bail;
1858         }
1859
1860         status = ocfs2_block_group_set_bits(handle,
1861                                             alloc_inode,
1862                                             bg,
1863                                             group_bh,
1864                                             res->sr_bit_offset,
1865                                             res->sr_bits);
1866         if (status < 0) {
1867                 ocfs2_rollback_alloc_dinode_counts(alloc_inode,
1868                                         ac->ac_bh, res->sr_bits, chain);
1869                 mlog_errno(status);
1870                 goto bail;
1871         }
1872
1873         trace_ocfs2_search_chain_end(
1874                         (unsigned long long)le64_to_cpu(fe->i_blkno),
1875                         res->sr_bits);
1876
1877 out_loc_only:
1878         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1879 bail:
1880         brelse(group_bh);
1881         brelse(prev_group_bh);
1882
1883         if (status)
1884                 mlog_errno(status);
1885         return status;
1886 }
1887
1888 /* will give out up to bits_wanted contiguous bits. */
1889 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1890                                      handle_t *handle,
1891                                      u32 bits_wanted,
1892                                      u32 min_bits,
1893                                      struct ocfs2_suballoc_result *res)
1894 {
1895         int status;
1896         u16 victim, i;
1897         u16 bits_left = 0;
1898         u64 hint = ac->ac_last_group;
1899         struct ocfs2_chain_list *cl;
1900         struct ocfs2_dinode *fe;
1901
1902         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1903         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1904         BUG_ON(!ac->ac_bh);
1905
1906         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1907
1908         /* The bh was validated by the inode read during
1909          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1910         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1911
1912         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1913             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1914                 status = ocfs2_error(ac->ac_inode->i_sb,
1915                                      "Chain allocator dinode %llu has %u used bits but only %u total\n",
1916                                      (unsigned long long)le64_to_cpu(fe->i_blkno),
1917                                      le32_to_cpu(fe->id1.bitmap1.i_used),
1918                                      le32_to_cpu(fe->id1.bitmap1.i_total));
1919                 goto bail;
1920         }
1921
1922         res->sr_bg_blkno = hint;
1923         if (res->sr_bg_blkno) {
1924                 /* Attempt to short-circuit the usual search mechanism
1925                  * by jumping straight to the most recently used
1926                  * allocation group. This helps us maintain some
1927                  * contiguousness across allocations. */
1928                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1929                                                 min_bits, res, &bits_left);
1930                 if (!status)
1931                         goto set_hint;
1932                 if (status < 0 && status != -ENOSPC) {
1933                         mlog_errno(status);
1934                         goto bail;
1935                 }
1936         }
1937
1938         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1939
1940         victim = ocfs2_find_victim_chain(cl);
1941         ac->ac_chain = victim;
1942
1943         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1944                                     res, &bits_left);
1945         if (!status) {
1946                 if (ocfs2_is_cluster_bitmap(ac->ac_inode))
1947                         hint = res->sr_bg_blkno;
1948                 else
1949                         hint = ocfs2_group_from_res(res);
1950                 goto set_hint;
1951         }
1952         if (status < 0 && status != -ENOSPC) {
1953                 mlog_errno(status);
1954                 goto bail;
1955         }
1956
1957         trace_ocfs2_claim_suballoc_bits(victim);
1958
1959         /* If we didn't pick a good victim, then just default to
1960          * searching each chain in order. Don't allow chain relinking
1961          * because we only calculate enough journal credits for one
1962          * relink per alloc. */
1963         ac->ac_disable_chain_relink = 1;
1964         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1965                 if (i == victim)
1966                         continue;
1967                 if (!cl->cl_recs[i].c_free)
1968                         continue;
1969
1970                 ac->ac_chain = i;
1971                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1972                                             res, &bits_left);
1973                 if (!status) {
1974                         hint = ocfs2_group_from_res(res);
1975                         break;
1976                 }
1977                 if (status < 0 && status != -ENOSPC) {
1978                         mlog_errno(status);
1979                         goto bail;
1980                 }
1981         }
1982
1983 set_hint:
1984         if (status != -ENOSPC) {
1985                 /* If the next search of this group is not likely to
1986                  * yield a suitable extent, then we reset the last
1987                  * group hint so as to not waste a disk read */
1988                 if (bits_left < min_bits)
1989                         ac->ac_last_group = 0;
1990                 else
1991                         ac->ac_last_group = hint;
1992         }
1993
1994 bail:
1995         if (status)
1996                 mlog_errno(status);
1997         return status;
1998 }
1999
2000 int ocfs2_claim_metadata(handle_t *handle,
2001                          struct ocfs2_alloc_context *ac,
2002                          u32 bits_wanted,
2003                          u64 *suballoc_loc,
2004                          u16 *suballoc_bit_start,
2005                          unsigned int *num_bits,
2006                          u64 *blkno_start)
2007 {
2008         int status;
2009         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2010
2011         BUG_ON(!ac);
2012         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
2013         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
2014
2015         status = ocfs2_claim_suballoc_bits(ac,
2016                                            handle,
2017                                            bits_wanted,
2018                                            1,
2019                                            &res);
2020         if (status < 0) {
2021                 mlog_errno(status);
2022                 goto bail;
2023         }
2024         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2025
2026         *suballoc_loc = res.sr_bg_blkno;
2027         *suballoc_bit_start = res.sr_bit_offset;
2028         *blkno_start = res.sr_blkno;
2029         ac->ac_bits_given += res.sr_bits;
2030         *num_bits = res.sr_bits;
2031         status = 0;
2032 bail:
2033         if (status)
2034                 mlog_errno(status);
2035         return status;
2036 }
2037
2038 static void ocfs2_init_inode_ac_group(struct inode *dir,
2039                                       struct buffer_head *parent_di_bh,
2040                                       struct ocfs2_alloc_context *ac)
2041 {
2042         struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2043         /*
2044          * Try to allocate inodes from some specific group.
2045          *
2046          * If the parent dir has recorded the last group used in allocation,
2047          * cool, use it. Otherwise if we try to allocate new inode from the
2048          * same slot the parent dir belongs to, use the same chunk.
2049          *
2050          * We are very careful here to avoid the mistake of setting
2051          * ac_last_group to a group descriptor from a different (unlocked) slot.
2052          */
2053         if (OCFS2_I(dir)->ip_last_used_group &&
2054             OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2055                 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2056         else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2057                 if (di->i_suballoc_loc)
2058                         ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2059                 else
2060                         ac->ac_last_group = ocfs2_which_suballoc_group(
2061                                         le64_to_cpu(di->i_blkno),
2062                                         le16_to_cpu(di->i_suballoc_bit));
2063         }
2064 }
2065
2066 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2067                                              struct ocfs2_alloc_context *ac)
2068 {
2069         OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2070         OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2071 }
2072
2073 int ocfs2_find_new_inode_loc(struct inode *dir,
2074                              struct buffer_head *parent_fe_bh,
2075                              struct ocfs2_alloc_context *ac,
2076                              u64 *fe_blkno)
2077 {
2078         int ret;
2079         handle_t *handle = NULL;
2080         struct ocfs2_suballoc_result *res;
2081
2082         BUG_ON(!ac);
2083         BUG_ON(ac->ac_bits_given != 0);
2084         BUG_ON(ac->ac_bits_wanted != 1);
2085         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2086
2087         res = kzalloc(sizeof(*res), GFP_NOFS);
2088         if (res == NULL) {
2089                 ret = -ENOMEM;
2090                 mlog_errno(ret);
2091                 goto out;
2092         }
2093
2094         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2095
2096         /*
2097          * The handle started here is for chain relink. Alternatively,
2098          * we could just disable relink for these calls.
2099          */
2100         handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2101         if (IS_ERR(handle)) {
2102                 ret = PTR_ERR(handle);
2103                 handle = NULL;
2104                 mlog_errno(ret);
2105                 goto out;
2106         }
2107
2108         /*
2109          * This will instruct ocfs2_claim_suballoc_bits and
2110          * ocfs2_search_one_group to search but save actual allocation
2111          * for later.
2112          */
2113         ac->ac_find_loc_only = 1;
2114
2115         ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2116         if (ret < 0) {
2117                 mlog_errno(ret);
2118                 goto out;
2119         }
2120
2121         ac->ac_find_loc_priv = res;
2122         *fe_blkno = res->sr_blkno;
2123         ocfs2_update_inode_fsync_trans(handle, dir, 0);
2124 out:
2125         if (handle)
2126                 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2127
2128         if (ret)
2129                 kfree(res);
2130
2131         return ret;
2132 }
2133
2134 int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2135                                  struct inode *dir,
2136                                  struct ocfs2_alloc_context *ac,
2137                                  u64 *suballoc_loc,
2138                                  u16 *suballoc_bit,
2139                                  u64 di_blkno)
2140 {
2141         int ret;
2142         u16 chain;
2143         struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2144         struct buffer_head *bg_bh = NULL;
2145         struct ocfs2_group_desc *bg;
2146         struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2147
2148         /*
2149          * Since di_blkno is being passed back in, we check for any
2150          * inconsistencies which may have happened between
2151          * calls. These are code bugs as di_blkno is not expected to
2152          * change once returned from ocfs2_find_new_inode_loc()
2153          */
2154         BUG_ON(res->sr_blkno != di_blkno);
2155
2156         ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2157                                           res->sr_bg_stable_blkno, &bg_bh);
2158         if (ret) {
2159                 mlog_errno(ret);
2160                 goto out;
2161         }
2162
2163         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2164         chain = le16_to_cpu(bg->bg_chain);
2165
2166         ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2167                                                ac->ac_bh, res->sr_bits,
2168                                                chain);
2169         if (ret) {
2170                 mlog_errno(ret);
2171                 goto out;
2172         }
2173
2174         ret = ocfs2_block_group_set_bits(handle,
2175                                          ac->ac_inode,
2176                                          bg,
2177                                          bg_bh,
2178                                          res->sr_bit_offset,
2179                                          res->sr_bits);
2180         if (ret < 0) {
2181                 ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
2182                                                ac->ac_bh, res->sr_bits, chain);
2183                 mlog_errno(ret);
2184                 goto out;
2185         }
2186
2187         trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
2188                                            res->sr_bits);
2189
2190         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2191
2192         BUG_ON(res->sr_bits != 1);
2193
2194         *suballoc_loc = res->sr_bg_blkno;
2195         *suballoc_bit = res->sr_bit_offset;
2196         ac->ac_bits_given++;
2197         ocfs2_save_inode_ac_group(dir, ac);
2198
2199 out:
2200         brelse(bg_bh);
2201
2202         return ret;
2203 }
2204
2205 int ocfs2_claim_new_inode(handle_t *handle,
2206                           struct inode *dir,
2207                           struct buffer_head *parent_fe_bh,
2208                           struct ocfs2_alloc_context *ac,
2209                           u64 *suballoc_loc,
2210                           u16 *suballoc_bit,
2211                           u64 *fe_blkno)
2212 {
2213         int status;
2214         struct ocfs2_suballoc_result res;
2215
2216         BUG_ON(!ac);
2217         BUG_ON(ac->ac_bits_given != 0);
2218         BUG_ON(ac->ac_bits_wanted != 1);
2219         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2220
2221         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2222
2223         status = ocfs2_claim_suballoc_bits(ac,
2224                                            handle,
2225                                            1,
2226                                            1,
2227                                            &res);
2228         if (status < 0) {
2229                 mlog_errno(status);
2230                 goto bail;
2231         }
2232         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2233
2234         BUG_ON(res.sr_bits != 1);
2235
2236         *suballoc_loc = res.sr_bg_blkno;
2237         *suballoc_bit = res.sr_bit_offset;
2238         *fe_blkno = res.sr_blkno;
2239         ac->ac_bits_given++;
2240         ocfs2_save_inode_ac_group(dir, ac);
2241         status = 0;
2242 bail:
2243         if (status)
2244                 mlog_errno(status);
2245         return status;
2246 }
2247
2248 /* translate a group desc. blkno and it's bitmap offset into
2249  * disk cluster offset. */
2250 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2251                                                    u64 bg_blkno,
2252                                                    u16 bg_bit_off)
2253 {
2254         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2255         u32 cluster = 0;
2256
2257         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2258
2259         if (bg_blkno != osb->first_cluster_group_blkno)
2260                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2261         cluster += (u32) bg_bit_off;
2262         return cluster;
2263 }
2264
2265 /* given a cluster offset, calculate which block group it belongs to
2266  * and return that block offset. */
2267 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2268 {
2269         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2270         u32 group_no;
2271
2272         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2273
2274         group_no = cluster / osb->bitmap_cpg;
2275         if (!group_no)
2276                 return osb->first_cluster_group_blkno;
2277         return ocfs2_clusters_to_blocks(inode->i_sb,
2278                                         group_no * osb->bitmap_cpg);
2279 }
2280
2281 /* given the block number of a cluster start, calculate which cluster
2282  * group and descriptor bitmap offset that corresponds to. */
2283 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2284                                                 u64 data_blkno,
2285                                                 u64 *bg_blkno,
2286                                                 u16 *bg_bit_off)
2287 {
2288         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2289         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2290
2291         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2292
2293         *bg_blkno = ocfs2_which_cluster_group(inode,
2294                                               data_cluster);
2295
2296         if (*bg_blkno == osb->first_cluster_group_blkno)
2297                 *bg_bit_off = (u16) data_cluster;
2298         else
2299                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2300                                                              data_blkno - *bg_blkno);
2301 }
2302
2303 /*
2304  * min_bits - minimum contiguous chunk from this total allocation we
2305  * can handle. set to what we asked for originally for a full
2306  * contig. allocation, set to '1' to indicate we can deal with extents
2307  * of any size.
2308  */
2309 int __ocfs2_claim_clusters(handle_t *handle,
2310                            struct ocfs2_alloc_context *ac,
2311                            u32 min_clusters,
2312                            u32 max_clusters,
2313                            u32 *cluster_start,
2314                            u32 *num_clusters)
2315 {
2316         int status;
2317         unsigned int bits_wanted = max_clusters;
2318         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2319         struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2320
2321         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2322
2323         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2324                && ac->ac_which != OCFS2_AC_USE_MAIN);
2325
2326         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2327                 WARN_ON(min_clusters > 1);
2328
2329                 status = ocfs2_claim_local_alloc_bits(osb,
2330                                                       handle,
2331                                                       ac,
2332                                                       bits_wanted,
2333                                                       cluster_start,
2334                                                       num_clusters);
2335                 if (!status)
2336                         atomic_inc(&osb->alloc_stats.local_data);
2337         } else {
2338                 if (min_clusters > (osb->bitmap_cpg - 1)) {
2339                         /* The only paths asking for contiguousness
2340                          * should know about this already. */
2341                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2342                              "group bitmap size %u!\n", min_clusters,
2343                              osb->bitmap_cpg);
2344                         status = -ENOSPC;
2345                         goto bail;
2346                 }
2347                 /* clamp the current request down to a realistic size. */
2348                 if (bits_wanted > (osb->bitmap_cpg - 1))
2349                         bits_wanted = osb->bitmap_cpg - 1;
2350
2351                 status = ocfs2_claim_suballoc_bits(ac,
2352                                                    handle,
2353                                                    bits_wanted,
2354                                                    min_clusters,
2355                                                    &res);
2356                 if (!status) {
2357                         BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2358                         *cluster_start =
2359                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2360                                                                  res.sr_bg_blkno,
2361                                                                  res.sr_bit_offset);
2362                         atomic_inc(&osb->alloc_stats.bitmap_data);
2363                         *num_clusters = res.sr_bits;
2364                 }
2365         }
2366         if (status < 0) {
2367                 if (status != -ENOSPC)
2368                         mlog_errno(status);
2369                 goto bail;
2370         }
2371
2372         ac->ac_bits_given += *num_clusters;
2373
2374 bail:
2375         if (status)
2376                 mlog_errno(status);
2377         return status;
2378 }
2379
2380 int ocfs2_claim_clusters(handle_t *handle,
2381                          struct ocfs2_alloc_context *ac,
2382                          u32 min_clusters,
2383                          u32 *cluster_start,
2384                          u32 *num_clusters)
2385 {
2386         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2387
2388         return __ocfs2_claim_clusters(handle, ac, min_clusters,
2389                                       bits_wanted, cluster_start, num_clusters);
2390 }
2391
2392 static int ocfs2_block_group_clear_bits(handle_t *handle,
2393                                         struct inode *alloc_inode,
2394                                         struct ocfs2_group_desc *bg,
2395                                         struct buffer_head *group_bh,
2396                                         unsigned int bit_off,
2397                                         unsigned int num_bits,
2398                                         void (*undo_fn)(unsigned int bit,
2399                                                         unsigned long *bmap))
2400 {
2401         int status;
2402         unsigned int tmp;
2403         struct ocfs2_group_desc *undo_bg = NULL;
2404
2405         /* The caller got this descriptor from
2406          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
2407         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2408
2409         trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
2410
2411         BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2412         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2413                                          group_bh,
2414                                          undo_fn ?
2415                                          OCFS2_JOURNAL_ACCESS_UNDO :
2416                                          OCFS2_JOURNAL_ACCESS_WRITE);
2417         if (status < 0) {
2418                 mlog_errno(status);
2419                 goto bail;
2420         }
2421
2422         if (undo_fn) {
2423                 jbd_lock_bh_state(group_bh);
2424                 undo_bg = (struct ocfs2_group_desc *)
2425                                         bh2jh(group_bh)->b_committed_data;
2426                 BUG_ON(!undo_bg);
2427         }
2428
2429         tmp = num_bits;
2430         while(tmp--) {
2431                 ocfs2_clear_bit((bit_off + tmp),
2432                                 (unsigned long *) bg->bg_bitmap);
2433                 if (undo_fn)
2434                         undo_fn(bit_off + tmp,
2435                                 (unsigned long *) undo_bg->bg_bitmap);
2436         }
2437         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2438         if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2439                 if (undo_fn)
2440                         jbd_unlock_bh_state(group_bh);
2441                 return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
2442                                    (unsigned long long)le64_to_cpu(bg->bg_blkno),
2443                                    le16_to_cpu(bg->bg_bits),
2444                                    le16_to_cpu(bg->bg_free_bits_count),
2445                                    num_bits);
2446         }
2447
2448         if (undo_fn)
2449                 jbd_unlock_bh_state(group_bh);
2450
2451         ocfs2_journal_dirty(handle, group_bh);
2452 bail:
2453         return status;
2454 }
2455
2456 /*
2457  * expects the suballoc inode to already be locked.
2458  */
2459 static int _ocfs2_free_suballoc_bits(handle_t *handle,
2460                                      struct inode *alloc_inode,
2461                                      struct buffer_head *alloc_bh,
2462                                      unsigned int start_bit,
2463                                      u64 bg_blkno,
2464                                      unsigned int count,
2465                                      void (*undo_fn)(unsigned int bit,
2466                                                      unsigned long *bitmap))
2467 {
2468         int status = 0;
2469         u32 tmp_used;
2470         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2471         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2472         struct buffer_head *group_bh = NULL;
2473         struct ocfs2_group_desc *group;
2474
2475         /* The alloc_bh comes from ocfs2_free_dinode() or
2476          * ocfs2_free_clusters().  The callers have all locked the
2477          * allocator and gotten alloc_bh from the lock call.  This
2478          * validates the dinode buffer.  Any corruption that has happened
2479          * is a code bug. */
2480         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2481         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2482
2483         trace_ocfs2_free_suballoc_bits(
2484                 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
2485                 (unsigned long long)bg_blkno,
2486                 start_bit, count);
2487
2488         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2489                                              &group_bh);
2490         if (status < 0) {
2491                 mlog_errno(status);
2492                 goto bail;
2493         }
2494         group = (struct ocfs2_group_desc *) group_bh->b_data;
2495
2496         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2497
2498         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2499                                               group, group_bh,
2500                                               start_bit, count, undo_fn);
2501         if (status < 0) {
2502                 mlog_errno(status);
2503                 goto bail;
2504         }
2505
2506         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2507                                          alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2508         if (status < 0) {
2509                 mlog_errno(status);
2510                 ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
2511                                 start_bit, count);
2512                 goto bail;
2513         }
2514
2515         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2516                      count);
2517         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2518         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2519         ocfs2_journal_dirty(handle, alloc_bh);
2520
2521 bail:
2522         brelse(group_bh);
2523
2524         if (status)
2525                 mlog_errno(status);
2526         return status;
2527 }
2528
2529 int ocfs2_free_suballoc_bits(handle_t *handle,
2530                              struct inode *alloc_inode,
2531                              struct buffer_head *alloc_bh,
2532                              unsigned int start_bit,
2533                              u64 bg_blkno,
2534                              unsigned int count)
2535 {
2536         return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2537                                          start_bit, bg_blkno, count, NULL);
2538 }
2539
2540 int ocfs2_free_dinode(handle_t *handle,
2541                       struct inode *inode_alloc_inode,
2542                       struct buffer_head *inode_alloc_bh,
2543                       struct ocfs2_dinode *di)
2544 {
2545         u64 blk = le64_to_cpu(di->i_blkno);
2546         u16 bit = le16_to_cpu(di->i_suballoc_bit);
2547         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2548
2549         if (di->i_suballoc_loc)
2550                 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2551         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2552                                         inode_alloc_bh, bit, bg_blkno, 1);
2553 }
2554
2555 static int _ocfs2_free_clusters(handle_t *handle,
2556                                 struct inode *bitmap_inode,
2557                                 struct buffer_head *bitmap_bh,
2558                                 u64 start_blk,
2559                                 unsigned int num_clusters,
2560                                 void (*undo_fn)(unsigned int bit,
2561                                                 unsigned long *bitmap))
2562 {
2563         int status;
2564         u16 bg_start_bit;
2565         u64 bg_blkno;
2566
2567         /* You can't ever have a contiguous set of clusters
2568          * bigger than a block group bitmap so we never have to worry
2569          * about looping on them.
2570          * This is expensive. We can safely remove once this stuff has
2571          * gotten tested really well. */
2572         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
2573                                 ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
2574                                                          start_blk)));
2575
2576
2577         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2578                                      &bg_start_bit);
2579
2580         trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
2581                         (unsigned long long)start_blk,
2582                         bg_start_bit, num_clusters);
2583
2584         status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2585                                            bg_start_bit, bg_blkno,
2586                                            num_clusters, undo_fn);
2587         if (status < 0) {
2588                 mlog_errno(status);
2589                 goto out;
2590         }
2591
2592         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2593                                          num_clusters);
2594
2595 out:
2596         if (status)
2597                 mlog_errno(status);
2598         return status;
2599 }
2600
2601 int ocfs2_free_clusters(handle_t *handle,
2602                         struct inode *bitmap_inode,
2603                         struct buffer_head *bitmap_bh,
2604                         u64 start_blk,
2605                         unsigned int num_clusters)
2606 {
2607         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2608                                     start_blk, num_clusters,
2609                                     _ocfs2_set_bit);
2610 }
2611
2612 /*
2613  * Give never-used clusters back to the global bitmap.  We don't need
2614  * to protect these bits in the undo buffer.
2615  */
2616 int ocfs2_release_clusters(handle_t *handle,
2617                            struct inode *bitmap_inode,
2618                            struct buffer_head *bitmap_bh,
2619                            u64 start_blk,
2620                            unsigned int num_clusters)
2621 {
2622         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2623                                     start_blk, num_clusters,
2624                                     _ocfs2_clear_bit);
2625 }
2626
2627 /*
2628  * For a given allocation, determine which allocators will need to be
2629  * accessed, and lock them, reserving the appropriate number of bits.
2630  *
2631  * Sparse file systems call this from ocfs2_write_begin_nolock()
2632  * and ocfs2_allocate_unwritten_extents().
2633  *
2634  * File systems which don't support holes call this from
2635  * ocfs2_extend_allocation().
2636  */
2637 int ocfs2_lock_allocators(struct inode *inode,
2638                           struct ocfs2_extent_tree *et,
2639                           u32 clusters_to_add, u32 extents_to_split,
2640                           struct ocfs2_alloc_context **data_ac,
2641                           struct ocfs2_alloc_context **meta_ac)
2642 {
2643         int ret = 0, num_free_extents;
2644         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2645         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2646
2647         *meta_ac = NULL;
2648         if (data_ac)
2649                 *data_ac = NULL;
2650
2651         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2652
2653         num_free_extents = ocfs2_num_free_extents(et);
2654         if (num_free_extents < 0) {
2655                 ret = num_free_extents;
2656                 mlog_errno(ret);
2657                 goto out;
2658         }
2659
2660         /*
2661          * Sparse allocation file systems need to be more conservative
2662          * with reserving room for expansion - the actual allocation
2663          * happens while we've got a journal handle open so re-taking
2664          * a cluster lock (because we ran out of room for another
2665          * extent) will violate ordering rules.
2666          *
2667          * Most of the time we'll only be seeing this 1 cluster at a time
2668          * anyway.
2669          *
2670          * Always lock for any unwritten extents - we might want to
2671          * add blocks during a split.
2672          */
2673         if (!num_free_extents ||
2674             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2675                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2676                 if (ret < 0) {
2677                         if (ret != -ENOSPC)
2678                                 mlog_errno(ret);
2679                         goto out;
2680                 }
2681         }
2682
2683         if (clusters_to_add == 0)
2684                 goto out;
2685
2686         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2687         if (ret < 0) {
2688                 if (ret != -ENOSPC)
2689                         mlog_errno(ret);
2690                 goto out;
2691         }
2692
2693 out:
2694         if (ret) {
2695                 if (*meta_ac) {
2696                         ocfs2_free_alloc_context(*meta_ac);
2697                         *meta_ac = NULL;
2698                 }
2699
2700                 /*
2701                  * We cannot have an error and a non null *data_ac.
2702                  */
2703         }
2704
2705         return ret;
2706 }
2707
2708 /*
2709  * Read the inode specified by blkno to get suballoc_slot and
2710  * suballoc_bit.
2711  */
2712 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2713                                        u16 *suballoc_slot, u64 *group_blkno,
2714                                        u16 *suballoc_bit)
2715 {
2716         int status;
2717         struct buffer_head *inode_bh = NULL;
2718         struct ocfs2_dinode *inode_fe;
2719
2720         trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
2721
2722         /* dirty read disk */
2723         status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2724         if (status < 0) {
2725                 mlog(ML_ERROR, "read block %llu failed %d\n",
2726                      (unsigned long long)blkno, status);
2727                 goto bail;
2728         }
2729
2730         inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2731         if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2732                 mlog(ML_ERROR, "invalid inode %llu requested\n",
2733                      (unsigned long long)blkno);
2734                 status = -EINVAL;
2735                 goto bail;
2736         }
2737
2738         if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2739             (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2740                 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2741                      (unsigned long long)blkno,
2742                      (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2743                 status = -EINVAL;
2744                 goto bail;
2745         }
2746
2747         if (suballoc_slot)
2748                 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2749         if (suballoc_bit)
2750                 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2751         if (group_blkno)
2752                 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2753
2754 bail:
2755         brelse(inode_bh);
2756
2757         if (status)
2758                 mlog_errno(status);
2759         return status;
2760 }
2761
2762 /*
2763  * test whether bit is SET in allocator bitmap or not.  on success, 0
2764  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2765  * is returned and *res is meaningless.  Call this after you have
2766  * cluster locked against suballoc, or you may get a result based on
2767  * non-up2date contents
2768  */
2769 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2770                                    struct inode *suballoc,
2771                                    struct buffer_head *alloc_bh,
2772                                    u64 group_blkno, u64 blkno,
2773                                    u16 bit, int *res)
2774 {
2775         struct ocfs2_dinode *alloc_di;
2776         struct ocfs2_group_desc *group;
2777         struct buffer_head *group_bh = NULL;
2778         u64 bg_blkno;
2779         int status;
2780
2781         trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
2782                                       (unsigned int)bit);
2783
2784         alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2785         if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2786                 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2787                      (unsigned int)bit,
2788                      ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2789                 status = -EINVAL;
2790                 goto bail;
2791         }
2792
2793         bg_blkno = group_blkno ? group_blkno :
2794                    ocfs2_which_suballoc_group(blkno, bit);
2795         status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2796                                              &group_bh);
2797         if (status < 0) {
2798                 mlog(ML_ERROR, "read group %llu failed %d\n",
2799                      (unsigned long long)bg_blkno, status);
2800                 goto bail;
2801         }
2802
2803         group = (struct ocfs2_group_desc *) group_bh->b_data;
2804         *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2805
2806 bail:
2807         brelse(group_bh);
2808
2809         if (status)
2810                 mlog_errno(status);
2811         return status;
2812 }
2813
2814 /*
2815  * Test if the bit representing this inode (blkno) is set in the
2816  * suballocator.
2817  *
2818  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2819  *
2820  * In the event of failure, a negative value is returned and *res is
2821  * meaningless.
2822  *
2823  * Callers must make sure to hold nfs_sync_lock to prevent
2824  * ocfs2_delete_inode() on another node from accessing the same
2825  * suballocator concurrently.
2826  */
2827 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2828 {
2829         int status;
2830         u64 group_blkno = 0;
2831         u16 suballoc_bit = 0, suballoc_slot = 0;
2832         struct inode *inode_alloc_inode;
2833         struct buffer_head *alloc_bh = NULL;
2834
2835         trace_ocfs2_test_inode_bit((unsigned long long)blkno);
2836
2837         status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2838                                              &group_blkno, &suballoc_bit);
2839         if (status < 0) {
2840                 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2841                 goto bail;
2842         }
2843
2844         inode_alloc_inode =
2845                 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2846                                             suballoc_slot);
2847         if (!inode_alloc_inode) {
2848                 /* the error code could be inaccurate, but we are not able to
2849                  * get the correct one. */
2850                 status = -EINVAL;
2851                 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2852                      (u32)suballoc_slot);
2853                 goto bail;
2854         }
2855
2856         inode_lock(inode_alloc_inode);
2857         status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2858         if (status < 0) {
2859                 inode_unlock(inode_alloc_inode);
2860                 iput(inode_alloc_inode);
2861                 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2862                      (u32)suballoc_slot, status);
2863                 goto bail;
2864         }
2865
2866         status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2867                                          group_blkno, blkno, suballoc_bit, res);
2868         if (status < 0)
2869                 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2870
2871         ocfs2_inode_unlock(inode_alloc_inode, 0);
2872         inode_unlock(inode_alloc_inode);
2873
2874         iput(inode_alloc_inode);
2875         brelse(alloc_bh);
2876 bail:
2877         if (status)
2878                 mlog_errno(status);
2879         return status;
2880 }