Merge branch 'drm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/airlied...
[sfrench/cifs-2.6.git] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "blockcheck.h"
39 #include "dlmglue.h"
40 #include "inode.h"
41 #include "journal.h"
42 #include "localalloc.h"
43 #include "suballoc.h"
44 #include "super.h"
45 #include "sysfile.h"
46 #include "uptodate.h"
47
48 #include "buffer_head_io.h"
49
/*
 * Flag values for block group allocation.  ALLOC_NEW_GROUP and
 * ALLOC_GROUPS_FROM_GLOBAL occupy distinct bits; NOT_ALLOC_NEW_GROUP is
 * the "no flags" value.
 * NOTE(review): presumably these feed the "int flags" parameters of the
 * reservation helpers declared below - confirm against the callers.
 */
#define NOT_ALLOC_NEW_GROUP		0x0
#define ALLOC_NEW_GROUP			0x1
#define ALLOC_GROUPS_FROM_GLOBAL	0x2

/* NOTE(review): looks like an upper bound used when stealing - confirm. */
#define OCFS2_MAX_TO_STEAL		1024
55
56 struct ocfs2_suballoc_result {
57         u64             sr_bg_blkno;    /* The bg we allocated from.  Set
58                                            to 0 when a block group is
59                                            contiguous. */
60         u64             sr_blkno;       /* The first allocated block */
61         unsigned int    sr_bit_offset;  /* The bit in the bg */
62         unsigned int    sr_bits;        /* How many bits we claimed */
63 };
64
65 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
66 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
67 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
68 static int ocfs2_block_group_fill(handle_t *handle,
69                                   struct inode *alloc_inode,
70                                   struct buffer_head *bg_bh,
71                                   u64 group_blkno,
72                                   unsigned int group_clusters,
73                                   u16 my_chain,
74                                   struct ocfs2_chain_list *cl);
75 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
76                                    struct inode *alloc_inode,
77                                    struct buffer_head *bh,
78                                    u64 max_block,
79                                    u64 *last_alloc_group,
80                                    int flags);
81
82 static int ocfs2_cluster_group_search(struct inode *inode,
83                                       struct buffer_head *group_bh,
84                                       u32 bits_wanted, u32 min_bits,
85                                       u64 max_block,
86                                       struct ocfs2_suballoc_result *res);
87 static int ocfs2_block_group_search(struct inode *inode,
88                                     struct buffer_head *group_bh,
89                                     u32 bits_wanted, u32 min_bits,
90                                     u64 max_block,
91                                     struct ocfs2_suballoc_result *res);
92 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
93                                      handle_t *handle,
94                                      u32 bits_wanted,
95                                      u32 min_bits,
96                                      struct ocfs2_suballoc_result *res);
97 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
98                                          int nr);
99 static inline int ocfs2_block_group_set_bits(handle_t *handle,
100                                              struct inode *alloc_inode,
101                                              struct ocfs2_group_desc *bg,
102                                              struct buffer_head *group_bh,
103                                              unsigned int bit_off,
104                                              unsigned int num_bits);
105 static int ocfs2_relink_block_group(handle_t *handle,
106                                     struct inode *alloc_inode,
107                                     struct buffer_head *fe_bh,
108                                     struct buffer_head *bg_bh,
109                                     struct buffer_head *prev_bg_bh,
110                                     u16 chain);
111 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
112                                                      u32 wanted);
113 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
114                                                    u64 bg_blkno,
115                                                    u16 bg_bit_off);
116 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
117                                                 u64 data_blkno,
118                                                 u64 *bg_blkno,
119                                                 u16 *bg_bit_off);
120 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
121                                              u32 bits_wanted, u64 max_block,
122                                              int flags,
123                                              struct ocfs2_alloc_context **ac);
124
125 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
126 {
127         struct inode *inode = ac->ac_inode;
128
129         if (inode) {
130                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
131                         ocfs2_inode_unlock(inode, 1);
132
133                 mutex_unlock(&inode->i_mutex);
134
135                 iput(inode);
136                 ac->ac_inode = NULL;
137         }
138         brelse(ac->ac_bh);
139         ac->ac_bh = NULL;
140         ac->ac_resv = NULL;
141 }
142
/*
 * Tear down an allocation context completely: first release the
 * resources it holds via ocfs2_free_ac_resource(), then kfree() the
 * context itself.  @ac must not be used afterwards.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	/* Resources must go first; they are reached through @ac. */
	ocfs2_free_ac_resource(ac);

	kfree(ac);
}
148
149 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
150 {
151         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
152 }
153
/*
 * Report a validation failure.  Relies on two names being in scope at the
 * expansion site: "resize" (int) and "sb".  When resize is non-zero the
 * message is only logged through mlog(ML_ERROR, ...) with a trailing
 * newline; otherwise it is handed to ocfs2_error().
 */
#define do_error(fmt, ...)						\
	do {								\
		if (!resize)						\
			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
		else							\
			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
	} while (0)
161
162 static int ocfs2_validate_gd_self(struct super_block *sb,
163                                   struct buffer_head *bh,
164                                   int resize)
165 {
166         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
167
168         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
169                 do_error("Group descriptor #%llu has bad signature %.*s",
170                          (unsigned long long)bh->b_blocknr, 7,
171                          gd->bg_signature);
172                 return -EINVAL;
173         }
174
175         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
176                 do_error("Group descriptor #%llu has an invalid bg_blkno "
177                          "of %llu",
178                          (unsigned long long)bh->b_blocknr,
179                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
180                 return -EINVAL;
181         }
182
183         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
184                 do_error("Group descriptor #%llu has an invalid "
185                          "fs_generation of #%u",
186                          (unsigned long long)bh->b_blocknr,
187                          le32_to_cpu(gd->bg_generation));
188                 return -EINVAL;
189         }
190
191         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
192                 do_error("Group descriptor #%llu has bit count %u but "
193                          "claims that %u are free",
194                          (unsigned long long)bh->b_blocknr,
195                          le16_to_cpu(gd->bg_bits),
196                          le16_to_cpu(gd->bg_free_bits_count));
197                 return -EINVAL;
198         }
199
200         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
201                 do_error("Group descriptor #%llu has bit count %u but "
202                          "max bitmap bits of %u",
203                          (unsigned long long)bh->b_blocknr,
204                          le16_to_cpu(gd->bg_bits),
205                          8 * le16_to_cpu(gd->bg_size));
206                 return -EINVAL;
207         }
208
209         return 0;
210 }
211
212 static int ocfs2_validate_gd_parent(struct super_block *sb,
213                                     struct ocfs2_dinode *di,
214                                     struct buffer_head *bh,
215                                     int resize)
216 {
217         unsigned int max_bits;
218         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
219
220         if (di->i_blkno != gd->bg_parent_dinode) {
221                 do_error("Group descriptor #%llu has bad parent "
222                          "pointer (%llu, expected %llu)",
223                          (unsigned long long)bh->b_blocknr,
224                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
225                          (unsigned long long)le64_to_cpu(di->i_blkno));
226                 return -EINVAL;
227         }
228
229         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
230         if (le16_to_cpu(gd->bg_bits) > max_bits) {
231                 do_error("Group descriptor #%llu has bit count of %u",
232                          (unsigned long long)bh->b_blocknr,
233                          le16_to_cpu(gd->bg_bits));
234                 return -EINVAL;
235         }
236
237         /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
238         if ((le16_to_cpu(gd->bg_chain) >
239              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
240             ((le16_to_cpu(gd->bg_chain) ==
241              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
242                 do_error("Group descriptor #%llu has bad chain %u",
243                          (unsigned long long)bh->b_blocknr,
244                          le16_to_cpu(gd->bg_chain));
245                 return -EINVAL;
246         }
247
248         return 0;
249 }
250
251 #undef do_error
252
253 /*
254  * This version only prints errors.  It does not fail the filesystem, and
255  * exists only for resize.
256  */
257 int ocfs2_check_group_descriptor(struct super_block *sb,
258                                  struct ocfs2_dinode *di,
259                                  struct buffer_head *bh)
260 {
261         int rc;
262         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
263
264         BUG_ON(!buffer_uptodate(bh));
265
266         /*
267          * If the ecc fails, we return the error but otherwise
268          * leave the filesystem running.  We know any error is
269          * local to this block.
270          */
271         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
272         if (rc) {
273                 mlog(ML_ERROR,
274                      "Checksum failed for group descriptor %llu\n",
275                      (unsigned long long)bh->b_blocknr);
276         } else
277                 rc = ocfs2_validate_gd_self(sb, bh, 1);
278         if (!rc)
279                 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
280
281         return rc;
282 }
283
284 static int ocfs2_validate_group_descriptor(struct super_block *sb,
285                                            struct buffer_head *bh)
286 {
287         int rc;
288         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
289
290         mlog(0, "Validating group descriptor %llu\n",
291              (unsigned long long)bh->b_blocknr);
292
293         BUG_ON(!buffer_uptodate(bh));
294
295         /*
296          * If the ecc fails, we return the error but otherwise
297          * leave the filesystem running.  We know any error is
298          * local to this block.
299          */
300         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
301         if (rc)
302                 return rc;
303
304         /*
305          * Errors after here are fatal.
306          */
307
308         return ocfs2_validate_gd_self(sb, bh, 0);
309 }
310
311 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
312                                 u64 gd_blkno, struct buffer_head **bh)
313 {
314         int rc;
315         struct buffer_head *tmp = *bh;
316
317         rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
318                               ocfs2_validate_group_descriptor);
319         if (rc)
320                 goto out;
321
322         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
323         if (rc) {
324                 brelse(tmp);
325                 goto out;
326         }
327
328         /* If ocfs2_read_block() got us a new bh, pass it up. */
329         if (!*bh)
330                 *bh = tmp;
331
332 out:
333         return rc;
334 }
335
336 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
337                                           struct ocfs2_group_desc *bg,
338                                           struct ocfs2_chain_list *cl,
339                                           u64 p_blkno, u32 clusters)
340 {
341         struct ocfs2_extent_list *el = &bg->bg_list;
342         struct ocfs2_extent_rec *rec;
343
344         BUG_ON(!ocfs2_supports_discontig_bg(osb));
345         if (!el->l_next_free_rec)
346                 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
347         rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
348         rec->e_blkno = cpu_to_le64(p_blkno);
349         rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
350                                   le16_to_cpu(cl->cl_bpc));
351         rec->e_leaf_clusters = cpu_to_le32(clusters);
352         le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
353         le16_add_cpu(&bg->bg_free_bits_count,
354                      clusters * le16_to_cpu(cl->cl_bpc));
355         le16_add_cpu(&el->l_next_free_rec, 1);
356 }
357
358 static int ocfs2_block_group_fill(handle_t *handle,
359                                   struct inode *alloc_inode,
360                                   struct buffer_head *bg_bh,
361                                   u64 group_blkno,
362                                   unsigned int group_clusters,
363                                   u16 my_chain,
364                                   struct ocfs2_chain_list *cl)
365 {
366         int status = 0;
367         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
368         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
369         struct super_block * sb = alloc_inode->i_sb;
370
371         mlog_entry_void();
372
373         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
374                 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
375                             "b_blocknr (%llu)",
376                             (unsigned long long)group_blkno,
377                             (unsigned long long) bg_bh->b_blocknr);
378                 status = -EIO;
379                 goto bail;
380         }
381
382         status = ocfs2_journal_access_gd(handle,
383                                          INODE_CACHE(alloc_inode),
384                                          bg_bh,
385                                          OCFS2_JOURNAL_ACCESS_CREATE);
386         if (status < 0) {
387                 mlog_errno(status);
388                 goto bail;
389         }
390
391         memset(bg, 0, sb->s_blocksize);
392         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
393         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
394         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
395                                                 osb->s_feature_incompat));
396         bg->bg_chain = cpu_to_le16(my_chain);
397         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
398         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
399         bg->bg_blkno = cpu_to_le64(group_blkno);
400         if (group_clusters == le16_to_cpu(cl->cl_cpg))
401                 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
402         else
403                 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
404                                               group_clusters);
405
406         /* set the 1st bit in the bitmap to account for the descriptor block */
407         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
408         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
409
410         ocfs2_journal_dirty(handle, bg_bh);
411
412         /* There is no need to zero out or otherwise initialize the
413          * other blocks in a group - All valid FS metadata in a block
414          * group stores the superblock fs_generation value at
415          * allocation time. */
416
417 bail:
418         mlog_exit(status);
419         return status;
420 }
421
422 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
423 {
424         u16 curr, best;
425
426         best = curr = 0;
427         while (curr < le16_to_cpu(cl->cl_count)) {
428                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
429                     le32_to_cpu(cl->cl_recs[curr].c_total))
430                         best = curr;
431                 curr++;
432         }
433         return best;
434 }
435
436 static struct buffer_head *
437 ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
438                                struct inode *alloc_inode,
439                                struct ocfs2_alloc_context *ac,
440                                struct ocfs2_chain_list *cl)
441 {
442         int status;
443         u32 bit_off, num_bits;
444         u64 bg_blkno;
445         struct buffer_head *bg_bh;
446         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
447
448         status = ocfs2_claim_clusters(handle, ac,
449                                       le16_to_cpu(cl->cl_cpg), &bit_off,
450                                       &num_bits);
451         if (status < 0) {
452                 if (status != -ENOSPC)
453                         mlog_errno(status);
454                 goto bail;
455         }
456
457         /* setup the group */
458         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
459         mlog(0, "new descriptor, record %u, at block %llu\n",
460              alloc_rec, (unsigned long long)bg_blkno);
461
462         bg_bh = sb_getblk(osb->sb, bg_blkno);
463         if (!bg_bh) {
464                 status = -EIO;
465                 mlog_errno(status);
466                 goto bail;
467         }
468         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
469
470         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
471                                         bg_blkno, num_bits, alloc_rec, cl);
472         if (status < 0) {
473                 brelse(bg_bh);
474                 mlog_errno(status);
475         }
476
477 bail:
478         return status ? ERR_PTR(status) : bg_bh;
479 }
480
481 static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
482                                         handle_t *handle,
483                                         struct ocfs2_alloc_context *ac,
484                                         unsigned int min_bits,
485                                         u32 *bit_off, u32 *num_bits)
486 {
487         int status = 0;
488
489         while (min_bits) {
490                 status = ocfs2_claim_clusters(handle, ac, min_bits,
491                                               bit_off, num_bits);
492                 if (status != -ENOSPC)
493                         break;
494
495                 min_bits >>= 1;
496         }
497
498         return status;
499 }
500
501 static int ocfs2_block_group_grow_discontig(handle_t *handle,
502                                             struct inode *alloc_inode,
503                                             struct buffer_head *bg_bh,
504                                             struct ocfs2_alloc_context *ac,
505                                             struct ocfs2_chain_list *cl,
506                                             unsigned int min_bits)
507 {
508         int status;
509         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
510         struct ocfs2_group_desc *bg =
511                 (struct ocfs2_group_desc *)bg_bh->b_data;
512         unsigned int needed = le16_to_cpu(cl->cl_cpg) -
513                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
514         u32 p_cpos, clusters;
515         u64 p_blkno;
516         struct ocfs2_extent_list *el = &bg->bg_list;
517
518         status = ocfs2_journal_access_gd(handle,
519                                          INODE_CACHE(alloc_inode),
520                                          bg_bh,
521                                          OCFS2_JOURNAL_ACCESS_CREATE);
522         if (status < 0) {
523                 mlog_errno(status);
524                 goto bail;
525         }
526
527         while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
528                                 le16_to_cpu(el->l_count))) {
529                 if (min_bits > needed)
530                         min_bits = needed;
531                 status = ocfs2_block_group_claim_bits(osb, handle, ac,
532                                                       min_bits, &p_cpos,
533                                                       &clusters);
534                 if (status < 0) {
535                         if (status != -ENOSPC)
536                                 mlog_errno(status);
537                         goto bail;
538                 }
539                 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
540                 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
541                                               clusters);
542
543                 min_bits = clusters;
544                 needed = le16_to_cpu(cl->cl_cpg) -
545                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
546         }
547
548         if (needed > 0) {
549                 /*
550                  * We have used up all the extent rec but can't fill up
551                  * the cpg. So bail out.
552                  */
553                 status = -ENOSPC;
554                 goto bail;
555         }
556
557         ocfs2_journal_dirty(handle, bg_bh);
558
559 bail:
560         return status;
561 }
562
563 static void ocfs2_bg_alloc_cleanup(handle_t *handle,
564                                    struct ocfs2_alloc_context *cluster_ac,
565                                    struct inode *alloc_inode,
566                                    struct buffer_head *bg_bh)
567 {
568         int i, ret;
569         struct ocfs2_group_desc *bg;
570         struct ocfs2_extent_list *el;
571         struct ocfs2_extent_rec *rec;
572
573         if (!bg_bh)
574                 return;
575
576         bg = (struct ocfs2_group_desc *)bg_bh->b_data;
577         el = &bg->bg_list;
578         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
579                 rec = &el->l_recs[i];
580                 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
581                                           cluster_ac->ac_bh,
582                                           le64_to_cpu(rec->e_blkno),
583                                           le32_to_cpu(rec->e_leaf_clusters));
584                 if (ret)
585                         mlog_errno(ret);
586                 /* Try all the clusters to free */
587         }
588
589         ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
590         brelse(bg_bh);
591 }
592
593 static struct buffer_head *
594 ocfs2_block_group_alloc_discontig(handle_t *handle,
595                                   struct inode *alloc_inode,
596                                   struct ocfs2_alloc_context *ac,
597                                   struct ocfs2_chain_list *cl)
598 {
599         int status;
600         u32 bit_off, num_bits;
601         u64 bg_blkno;
602         unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
603         struct buffer_head *bg_bh = NULL;
604         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
605         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
606
607         if (!ocfs2_supports_discontig_bg(osb)) {
608                 status = -ENOSPC;
609                 goto bail;
610         }
611
612         status = ocfs2_extend_trans(handle,
613                                     ocfs2_calc_bg_discontig_credits(osb->sb));
614         if (status) {
615                 mlog_errno(status);
616                 goto bail;
617         }
618
619         /*
620          * We're going to be grabbing from multiple cluster groups.
621          * We don't have enough credits to relink them all, and the
622          * cluster groups will be staying in cache for the duration of
623          * this operation.
624          */
625         ac->ac_allow_chain_relink = 0;
626
627         /* Claim the first region */
628         status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
629                                               &bit_off, &num_bits);
630         if (status < 0) {
631                 if (status != -ENOSPC)
632                         mlog_errno(status);
633                 goto bail;
634         }
635         min_bits = num_bits;
636
637         /* setup the group */
638         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
639         mlog(0, "new descriptor, record %u, at block %llu\n",
640              alloc_rec, (unsigned long long)bg_blkno);
641
642         bg_bh = sb_getblk(osb->sb, bg_blkno);
643         if (!bg_bh) {
644                 status = -EIO;
645                 mlog_errno(status);
646                 goto bail;
647         }
648         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
649
650         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
651                                         bg_blkno, num_bits, alloc_rec, cl);
652         if (status < 0) {
653                 mlog_errno(status);
654                 goto bail;
655         }
656
657         status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
658                                                   bg_bh, ac, cl, min_bits);
659         if (status)
660                 mlog_errno(status);
661
662 bail:
663         if (status)
664                 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
665         return status ? ERR_PTR(status) : bg_bh;
666 }
667
668 /*
669  * We expect the block group allocator to already be locked.
670  */
671 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
672                                    struct inode *alloc_inode,
673                                    struct buffer_head *bh,
674                                    u64 max_block,
675                                    u64 *last_alloc_group,
676                                    int flags)
677 {
678         int status, credits;
679         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
680         struct ocfs2_chain_list *cl;
681         struct ocfs2_alloc_context *ac = NULL;
682         handle_t *handle = NULL;
683         u16 alloc_rec;
684         struct buffer_head *bg_bh = NULL;
685         struct ocfs2_group_desc *bg;
686
687         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
688
689         mlog_entry_void();
690
691         cl = &fe->id2.i_chain;
692         status = ocfs2_reserve_clusters_with_limit(osb,
693                                                    le16_to_cpu(cl->cl_cpg),
694                                                    max_block, flags, &ac);
695         if (status < 0) {
696                 if (status != -ENOSPC)
697                         mlog_errno(status);
698                 goto bail;
699         }
700
701         credits = ocfs2_calc_group_alloc_credits(osb->sb,
702                                                  le16_to_cpu(cl->cl_cpg));
703         handle = ocfs2_start_trans(osb, credits);
704         if (IS_ERR(handle)) {
705                 status = PTR_ERR(handle);
706                 handle = NULL;
707                 mlog_errno(status);
708                 goto bail;
709         }
710
711         if (last_alloc_group && *last_alloc_group != 0) {
712                 mlog(0, "use old allocation group %llu for block group alloc\n",
713                      (unsigned long long)*last_alloc_group);
714                 ac->ac_last_group = *last_alloc_group;
715         }
716
717         bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
718                                                ac, cl);
719         if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
720                 bg_bh = ocfs2_block_group_alloc_discontig(handle,
721                                                           alloc_inode,
722                                                           ac, cl);
723         if (IS_ERR(bg_bh)) {
724                 status = PTR_ERR(bg_bh);
725                 bg_bh = NULL;
726                 if (status != -ENOSPC)
727                         mlog_errno(status);
728                 goto bail;
729         }
730         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
731
732         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
733                                          bh, OCFS2_JOURNAL_ACCESS_WRITE);
734         if (status < 0) {
735                 mlog_errno(status);
736                 goto bail;
737         }
738
739         alloc_rec = le16_to_cpu(bg->bg_chain);
740         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
741                      le16_to_cpu(bg->bg_free_bits_count));
742         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
743                      le16_to_cpu(bg->bg_bits));
744         cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
745         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
746                 le16_add_cpu(&cl->cl_next_free_rec, 1);
747
748         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
749                                         le16_to_cpu(bg->bg_free_bits_count));
750         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
751         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
752
753         ocfs2_journal_dirty(handle, bh);
754
755         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
756         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
757         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
758                                              le32_to_cpu(fe->i_clusters)));
759         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
760         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
761         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
762
763         status = 0;
764
765         /* save the new last alloc group so that the caller can cache it. */
766         if (last_alloc_group)
767                 *last_alloc_group = ac->ac_last_group;
768
769 bail:
770         if (handle)
771                 ocfs2_commit_trans(osb, handle);
772
773         if (ac)
774                 ocfs2_free_alloc_context(ac);
775
776         brelse(bg_bh);
777
778         mlog_exit(status);
779         return status;
780 }
781
782 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
783                                        struct ocfs2_alloc_context *ac,
784                                        int type,
785                                        u32 slot,
786                                        u64 *last_alloc_group,
787                                        int flags)
788 {
789         int status;
790         u32 bits_wanted = ac->ac_bits_wanted;
791         struct inode *alloc_inode;
792         struct buffer_head *bh = NULL;
793         struct ocfs2_dinode *fe;
794         u32 free_bits;
795
796         mlog_entry_void();
797
798         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
799         if (!alloc_inode) {
800                 mlog_errno(-EINVAL);
801                 return -EINVAL;
802         }
803
804         mutex_lock(&alloc_inode->i_mutex);
805
806         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
807         if (status < 0) {
808                 mutex_unlock(&alloc_inode->i_mutex);
809                 iput(alloc_inode);
810
811                 mlog_errno(status);
812                 return status;
813         }
814
815         ac->ac_inode = alloc_inode;
816         ac->ac_alloc_slot = slot;
817
818         fe = (struct ocfs2_dinode *) bh->b_data;
819
820         /* The bh was validated by the inode read inside
821          * ocfs2_inode_lock().  Any corruption is a code bug. */
822         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
823
824         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
825                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
826                             (unsigned long long)le64_to_cpu(fe->i_blkno));
827                 status = -EIO;
828                 goto bail;
829         }
830
831         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
832                 le32_to_cpu(fe->id1.bitmap1.i_used);
833
834         if (bits_wanted > free_bits) {
835                 /* cluster bitmap never grows */
836                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
837                         mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
838                              bits_wanted, free_bits);
839                         status = -ENOSPC;
840                         goto bail;
841                 }
842
843                 if (!(flags & ALLOC_NEW_GROUP)) {
844                         mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
845                              "and we don't alloc a new group for it.\n",
846                              slot, bits_wanted, free_bits);
847                         status = -ENOSPC;
848                         goto bail;
849                 }
850
851                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
852                                                  ac->ac_max_block,
853                                                  last_alloc_group, flags);
854                 if (status < 0) {
855                         if (status != -ENOSPC)
856                                 mlog_errno(status);
857                         goto bail;
858                 }
859                 atomic_inc(&osb->alloc_stats.bg_extends);
860
861                 /* You should never ask for this much metadata */
862                 BUG_ON(bits_wanted >
863                        (le32_to_cpu(fe->id1.bitmap1.i_total)
864                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
865         }
866
867         get_bh(bh);
868         ac->ac_bh = bh;
869 bail:
870         brelse(bh);
871
872         mlog_exit(status);
873         return status;
874 }
875
876 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
877 {
878         spin_lock(&osb->osb_lock);
879         osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
880         spin_unlock(&osb->osb_lock);
881         atomic_set(&osb->s_num_inodes_stolen, 0);
882 }
883
884 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
885 {
886         spin_lock(&osb->osb_lock);
887         osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
888         spin_unlock(&osb->osb_lock);
889         atomic_set(&osb->s_num_meta_stolen, 0);
890 }
891
/* Reset both the inode and the metadata stealing state of @osb. */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
        ocfs2_init_inode_steal_slot(osb);
        ocfs2_init_meta_steal_slot(osb);
}
897
898 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
899 {
900         spin_lock(&osb->osb_lock);
901         if (type == INODE_ALLOC_SYSTEM_INODE)
902                 osb->s_inode_steal_slot = slot;
903         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
904                 osb->s_meta_steal_slot = slot;
905         spin_unlock(&osb->osb_lock);
906 }
907
908 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
909 {
910         int slot = OCFS2_INVALID_SLOT;
911
912         spin_lock(&osb->osb_lock);
913         if (type == INODE_ALLOC_SYSTEM_INODE)
914                 slot = osb->s_inode_steal_slot;
915         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
916                 slot = osb->s_meta_steal_slot;
917         spin_unlock(&osb->osb_lock);
918
919         return slot;
920 }
921
922 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
923 {
924         return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
925 }
926
927 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
928 {
929         return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
930 }
931
932 static int ocfs2_steal_resource(struct ocfs2_super *osb,
933                                 struct ocfs2_alloc_context *ac,
934                                 int type)
935 {
936         int i, status = -ENOSPC;
937         int slot = __ocfs2_get_steal_slot(osb, type);
938
939         /* Start to steal resource from the first slot after ours. */
940         if (slot == OCFS2_INVALID_SLOT)
941                 slot = osb->slot_num + 1;
942
943         for (i = 0; i < osb->max_slots; i++, slot++) {
944                 if (slot == osb->max_slots)
945                         slot = 0;
946
947                 if (slot == osb->slot_num)
948                         continue;
949
950                 status = ocfs2_reserve_suballoc_bits(osb, ac,
951                                                      type,
952                                                      (u32)slot, NULL,
953                                                      NOT_ALLOC_NEW_GROUP);
954                 if (status >= 0) {
955                         __ocfs2_set_steal_slot(osb, slot, type);
956                         break;
957                 }
958
959                 ocfs2_free_ac_resource(ac);
960         }
961
962         return status;
963 }
964
965 static int ocfs2_steal_inode(struct ocfs2_super *osb,
966                              struct ocfs2_alloc_context *ac)
967 {
968         return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
969 }
970
971 static int ocfs2_steal_meta(struct ocfs2_super *osb,
972                             struct ocfs2_alloc_context *ac)
973 {
974         return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
975 }
976
977 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
978                                       int blocks,
979                                       struct ocfs2_alloc_context **ac)
980 {
981         int status;
982         int slot = ocfs2_get_meta_steal_slot(osb);
983
984         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
985         if (!(*ac)) {
986                 status = -ENOMEM;
987                 mlog_errno(status);
988                 goto bail;
989         }
990
991         (*ac)->ac_bits_wanted = blocks;
992         (*ac)->ac_which = OCFS2_AC_USE_META;
993         (*ac)->ac_group_search = ocfs2_block_group_search;
994
995         if (slot != OCFS2_INVALID_SLOT &&
996                 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
997                 goto extent_steal;
998
999         atomic_set(&osb->s_num_meta_stolen, 0);
1000         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
1001                                              EXTENT_ALLOC_SYSTEM_INODE,
1002                                              (u32)osb->slot_num, NULL,
1003                                              ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
1004
1005
1006         if (status >= 0) {
1007                 status = 0;
1008                 if (slot != OCFS2_INVALID_SLOT)
1009                         ocfs2_init_meta_steal_slot(osb);
1010                 goto bail;
1011         } else if (status < 0 && status != -ENOSPC) {
1012                 mlog_errno(status);
1013                 goto bail;
1014         }
1015
1016         ocfs2_free_ac_resource(*ac);
1017
1018 extent_steal:
1019         status = ocfs2_steal_meta(osb, *ac);
1020         atomic_inc(&osb->s_num_meta_stolen);
1021         if (status < 0) {
1022                 if (status != -ENOSPC)
1023                         mlog_errno(status);
1024                 goto bail;
1025         }
1026
1027         status = 0;
1028 bail:
1029         if ((status < 0) && *ac) {
1030                 ocfs2_free_alloc_context(*ac);
1031                 *ac = NULL;
1032         }
1033
1034         mlog_exit(status);
1035         return status;
1036 }
1037
/*
 * Reserve as many metadata blocks as ocfs2_extend_meta_needed() reports
 * for @root_el.  See ocfs2_reserve_new_metadata_blocks() for the return
 * value and the handling of *@ac.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
                               struct ocfs2_extent_list *root_el,
                               struct ocfs2_alloc_context **ac)
{
        int blocks = ocfs2_extend_meta_needed(root_el);

        return ocfs2_reserve_new_metadata_blocks(osb, blocks, ac);
}
1046
1047 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1048                             struct ocfs2_alloc_context **ac)
1049 {
1050         int status;
1051         int slot = ocfs2_get_inode_steal_slot(osb);
1052         u64 alloc_group;
1053
1054         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1055         if (!(*ac)) {
1056                 status = -ENOMEM;
1057                 mlog_errno(status);
1058                 goto bail;
1059         }
1060
1061         (*ac)->ac_bits_wanted = 1;
1062         (*ac)->ac_which = OCFS2_AC_USE_INODE;
1063
1064         (*ac)->ac_group_search = ocfs2_block_group_search;
1065
1066         /*
1067          * stat(2) can't handle i_ino > 32bits, so we tell the
1068          * lower levels not to allocate us a block group past that
1069          * limit.  The 'inode64' mount option avoids this behavior.
1070          */
1071         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
1072                 (*ac)->ac_max_block = (u32)~0U;
1073
1074         /*
1075          * slot is set when we successfully steal inode from other nodes.
1076          * It is reset in 3 places:
1077          * 1. when we flush the truncate log
1078          * 2. when we complete local alloc recovery.
1079          * 3. when we successfully allocate from our own slot.
1080          * After it is set, we will go on stealing inodes until we find the
1081          * need to check our slots to see whether there is some space for us.
1082          */
1083         if (slot != OCFS2_INVALID_SLOT &&
1084             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
1085                 goto inode_steal;
1086
1087         atomic_set(&osb->s_num_inodes_stolen, 0);
1088         alloc_group = osb->osb_inode_alloc_group;
1089         status = ocfs2_reserve_suballoc_bits(osb, *ac,
1090                                              INODE_ALLOC_SYSTEM_INODE,
1091                                              (u32)osb->slot_num,
1092                                              &alloc_group,
1093                                              ALLOC_NEW_GROUP |
1094                                              ALLOC_GROUPS_FROM_GLOBAL);
1095         if (status >= 0) {
1096                 status = 0;
1097
1098                 spin_lock(&osb->osb_lock);
1099                 osb->osb_inode_alloc_group = alloc_group;
1100                 spin_unlock(&osb->osb_lock);
1101                 mlog(0, "after reservation, new allocation group is "
1102                      "%llu\n", (unsigned long long)alloc_group);
1103
1104                 /*
1105                  * Some inodes must be freed by us, so try to allocate
1106                  * from our own next time.
1107                  */
1108                 if (slot != OCFS2_INVALID_SLOT)
1109                         ocfs2_init_inode_steal_slot(osb);
1110                 goto bail;
1111         } else if (status < 0 && status != -ENOSPC) {
1112                 mlog_errno(status);
1113                 goto bail;
1114         }
1115
1116         ocfs2_free_ac_resource(*ac);
1117
1118 inode_steal:
1119         status = ocfs2_steal_inode(osb, *ac);
1120         atomic_inc(&osb->s_num_inodes_stolen);
1121         if (status < 0) {
1122                 if (status != -ENOSPC)
1123                         mlog_errno(status);
1124                 goto bail;
1125         }
1126
1127         status = 0;
1128 bail:
1129         if ((status < 0) && *ac) {
1130                 ocfs2_free_alloc_context(*ac);
1131                 *ac = NULL;
1132         }
1133
1134         mlog_exit(status);
1135         return status;
1136 }
1137
1138 /* local alloc code has to do the same thing, so rather than do this
1139  * twice.. */
1140 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1141                                       struct ocfs2_alloc_context *ac)
1142 {
1143         int status;
1144
1145         ac->ac_which = OCFS2_AC_USE_MAIN;
1146         ac->ac_group_search = ocfs2_cluster_group_search;
1147
1148         status = ocfs2_reserve_suballoc_bits(osb, ac,
1149                                              GLOBAL_BITMAP_SYSTEM_INODE,
1150                                              OCFS2_INVALID_SLOT, NULL,
1151                                              ALLOC_NEW_GROUP);
1152         if (status < 0 && status != -ENOSPC) {
1153                 mlog_errno(status);
1154                 goto bail;
1155         }
1156
1157 bail:
1158         return status;
1159 }
1160
1161 /* Callers don't need to care which bitmap (local alloc or main) to
1162  * use so we figure it out for them, but unfortunately this clutters
1163  * things a bit. */
1164 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1165                                              u32 bits_wanted, u64 max_block,
1166                                              int flags,
1167                                              struct ocfs2_alloc_context **ac)
1168 {
1169         int status;
1170
1171         mlog_entry_void();
1172
1173         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1174         if (!(*ac)) {
1175                 status = -ENOMEM;
1176                 mlog_errno(status);
1177                 goto bail;
1178         }
1179
1180         (*ac)->ac_bits_wanted = bits_wanted;
1181         (*ac)->ac_max_block = max_block;
1182
1183         status = -ENOSPC;
1184         if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
1185             ocfs2_alloc_should_use_local(osb, bits_wanted)) {
1186                 status = ocfs2_reserve_local_alloc_bits(osb,
1187                                                         bits_wanted,
1188                                                         *ac);
1189                 if ((status < 0) && (status != -ENOSPC)) {
1190                         mlog_errno(status);
1191                         goto bail;
1192                 }
1193         }
1194
1195         if (status == -ENOSPC) {
1196                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1197                 if (status < 0) {
1198                         if (status != -ENOSPC)
1199                                 mlog_errno(status);
1200                         goto bail;
1201                 }
1202         }
1203
1204         status = 0;
1205 bail:
1206         if ((status < 0) && *ac) {
1207                 ocfs2_free_alloc_context(*ac);
1208                 *ac = NULL;
1209         }
1210
1211         mlog_exit(status);
1212         return status;
1213 }
1214
1215 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
1216                            u32 bits_wanted,
1217                            struct ocfs2_alloc_context **ac)
1218 {
1219         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
1220                                                  ALLOC_NEW_GROUP, ac);
1221 }
1222
1223 /*
1224  * More or less lifted from ext3. I'll leave their description below:
1225  *
1226  * "For ext3 allocations, we must not reuse any blocks which are
1227  * allocated in the bitmap buffer's "last committed data" copy.  This
1228  * prevents deletes from freeing up the page for reuse until we have
1229  * committed the delete transaction.
1230  *
1231  * If we didn't do this, then deleting something and reallocating it as
1232  * data would allow the old block to be overwritten before the
1233  * transaction committed (because we force data to disk before commit).
1234  * This would lead to corruption if we crashed between overwriting the
1235  * data and committing the delete.
1236  *
1237  * @@@ We may want to make this allocation behaviour conditional on
1238  * data-writes at some point, and disable it for metadata allocations or
1239  * sync-data inodes."
1240  *
1241  * Note: OCFS2 already does this differently for metadata vs data
1242  * allocations, as those bitmaps are separate and undo access is never
1243  * called on a metadata group descriptor.
1244  */
1245 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1246                                          int nr)
1247 {
1248         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1249         int ret;
1250
1251         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1252                 return 0;
1253
1254         if (!buffer_jbd(bg_bh))
1255                 return 1;
1256
1257         jbd_lock_bh_state(bg_bh);
1258         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1259         if (bg)
1260                 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1261         else
1262                 ret = 1;
1263         jbd_unlock_bh_state(bg_bh);
1264
1265         return ret;
1266 }
1267
1268 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1269                                              struct buffer_head *bg_bh,
1270                                              unsigned int bits_wanted,
1271                                              unsigned int total_bits,
1272                                              struct ocfs2_suballoc_result *res)
1273 {
1274         void *bitmap;
1275         u16 best_offset, best_size;
1276         int offset, start, found, status = 0;
1277         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1278
1279         /* Callers got this descriptor from
1280          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1281         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1282
1283         found = start = best_offset = best_size = 0;
1284         bitmap = bg->bg_bitmap;
1285
1286         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1287                 if (offset == total_bits)
1288                         break;
1289
1290                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1291                         /* We found a zero, but we can't use it as it
1292                          * hasn't been put to disk yet! */
1293                         found = 0;
1294                         start = offset + 1;
1295                 } else if (offset == start) {
1296                         /* we found a zero */
1297                         found++;
1298                         /* move start to the next bit to test */
1299                         start++;
1300                 } else {
1301                         /* got a zero after some ones */
1302                         found = 1;
1303                         start = offset + 1;
1304                 }
1305                 if (found > best_size) {
1306                         best_size = found;
1307                         best_offset = start - found;
1308                 }
1309                 /* we got everything we needed */
1310                 if (found == bits_wanted) {
1311                         /* mlog(0, "Found it all!\n"); */
1312                         break;
1313                 }
1314         }
1315
1316         if (best_size) {
1317                 res->sr_bit_offset = best_offset;
1318                 res->sr_bits = best_size;
1319         } else {
1320                 status = -ENOSPC;
1321                 /* No error log here -- see the comment above
1322                  * ocfs2_test_bg_bit_allocatable */
1323         }
1324
1325         return status;
1326 }
1327
1328 static inline int ocfs2_block_group_set_bits(handle_t *handle,
1329                                              struct inode *alloc_inode,
1330                                              struct ocfs2_group_desc *bg,
1331                                              struct buffer_head *group_bh,
1332                                              unsigned int bit_off,
1333                                              unsigned int num_bits)
1334 {
1335         int status;
1336         void *bitmap = bg->bg_bitmap;
1337         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1338
1339         mlog_entry_void();
1340
1341         /* All callers get the descriptor via
1342          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1343         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1344         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1345
1346         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
1347              num_bits);
1348
1349         if (ocfs2_is_cluster_bitmap(alloc_inode))
1350                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1351
1352         status = ocfs2_journal_access_gd(handle,
1353                                          INODE_CACHE(alloc_inode),
1354                                          group_bh,
1355                                          journal_type);
1356         if (status < 0) {
1357                 mlog_errno(status);
1358                 goto bail;
1359         }
1360
1361         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1362         while(num_bits--)
1363                 ocfs2_set_bit(bit_off++, bitmap);
1364
1365         ocfs2_journal_dirty(handle, group_bh);
1366
1367 bail:
1368         mlog_exit(status);
1369         return status;
1370 }
1371
1372 /* find the one with the most empty bits */
1373 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1374 {
1375         u16 curr, best;
1376
1377         BUG_ON(!cl->cl_next_free_rec);
1378
1379         best = curr = 0;
1380         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1381                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1382                     le32_to_cpu(cl->cl_recs[best].c_free))
1383                         best = curr;
1384                 curr++;
1385         }
1386
1387         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1388         return best;
1389 }
1390
1391 static int ocfs2_relink_block_group(handle_t *handle,
1392                                     struct inode *alloc_inode,
1393                                     struct buffer_head *fe_bh,
1394                                     struct buffer_head *bg_bh,
1395                                     struct buffer_head *prev_bg_bh,
1396                                     u16 chain)
1397 {
1398         int status;
1399         /* there is a really tiny chance the journal calls could fail,
1400          * but we wouldn't want inconsistent blocks in *any* case. */
1401         u64 fe_ptr, bg_ptr, prev_bg_ptr;
1402         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1403         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1404         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1405
1406         /* The caller got these descriptors from
1407          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1408         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1409         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1410
1411         mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1412              (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1413              (unsigned long long)le64_to_cpu(bg->bg_blkno),
1414              (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1415
1416         fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1417         bg_ptr = le64_to_cpu(bg->bg_next_group);
1418         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1419
1420         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1421                                          prev_bg_bh,
1422                                          OCFS2_JOURNAL_ACCESS_WRITE);
1423         if (status < 0) {
1424                 mlog_errno(status);
1425                 goto out_rollback;
1426         }
1427
1428         prev_bg->bg_next_group = bg->bg_next_group;
1429         ocfs2_journal_dirty(handle, prev_bg_bh);
1430
1431         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1432                                          bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1433         if (status < 0) {
1434                 mlog_errno(status);
1435                 goto out_rollback;
1436         }
1437
1438         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1439         ocfs2_journal_dirty(handle, bg_bh);
1440
1441         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1442                                          fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1443         if (status < 0) {
1444                 mlog_errno(status);
1445                 goto out_rollback;
1446         }
1447
1448         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1449         ocfs2_journal_dirty(handle, fe_bh);
1450
1451 out_rollback:
1452         if (status < 0) {
1453                 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1454                 bg->bg_next_group = cpu_to_le64(bg_ptr);
1455                 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1456         }
1457
1458         mlog_exit(status);
1459         return status;
1460 }
1461
1462 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1463                                                      u32 wanted)
1464 {
1465         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1466 }
1467
1468 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1469  * value on error. */
1470 static int ocfs2_cluster_group_search(struct inode *inode,
1471                                       struct buffer_head *group_bh,
1472                                       u32 bits_wanted, u32 min_bits,
1473                                       u64 max_block,
1474                                       struct ocfs2_suballoc_result *res)
1475 {
1476         int search = -ENOSPC;
1477         int ret;
1478         u64 blkoff;
1479         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1480         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1481         unsigned int max_bits, gd_cluster_off;
1482
1483         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1484
1485         if (gd->bg_free_bits_count) {
1486                 max_bits = le16_to_cpu(gd->bg_bits);
1487
1488                 /* Tail groups in cluster bitmaps which aren't cpg
1489                  * aligned are prone to partial extention by a failed
1490                  * fs resize. If the file system resize never got to
1491                  * update the dinode cluster count, then we don't want
1492                  * to trust any clusters past it, regardless of what
1493                  * the group descriptor says. */
1494                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1495                                                           le64_to_cpu(gd->bg_blkno));
1496                 if ((gd_cluster_off + max_bits) >
1497                     OCFS2_I(inode)->ip_clusters) {
1498                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1499                         mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1500                              (unsigned long long)le64_to_cpu(gd->bg_blkno),
1501                              le16_to_cpu(gd->bg_bits),
1502                              OCFS2_I(inode)->ip_clusters, max_bits);
1503                 }
1504
1505                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1506                                                         group_bh, bits_wanted,
1507                                                         max_bits, res);
1508                 if (ret)
1509                         return ret;
1510
1511                 if (max_block) {
1512                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1513                                                           gd_cluster_off +
1514                                                           res->sr_bit_offset +
1515                                                           res->sr_bits);
1516                         mlog(0, "Checking %llu against %llu\n",
1517                              (unsigned long long)blkoff,
1518                              (unsigned long long)max_block);
1519                         if (blkoff > max_block)
1520                                 return -ENOSPC;
1521                 }
1522
1523                 /* ocfs2_block_group_find_clear_bits() might
1524                  * return success, but we still want to return
1525                  * -ENOSPC unless it found the minimum number
1526                  * of bits. */
1527                 if (min_bits <= res->sr_bits)
1528                         search = 0; /* success */
1529                 else if (res->sr_bits) {
1530                         /*
1531                          * Don't show bits which we'll be returning
1532                          * for allocation to the local alloc bitmap.
1533                          */
1534                         ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1535                 }
1536         }
1537
1538         return search;
1539 }
1540
1541 static int ocfs2_block_group_search(struct inode *inode,
1542                                     struct buffer_head *group_bh,
1543                                     u32 bits_wanted, u32 min_bits,
1544                                     u64 max_block,
1545                                     struct ocfs2_suballoc_result *res)
1546 {
1547         int ret = -ENOSPC;
1548         u64 blkoff;
1549         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1550
1551         BUG_ON(min_bits != 1);
1552         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1553
1554         if (bg->bg_free_bits_count) {
1555                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1556                                                         group_bh, bits_wanted,
1557                                                         le16_to_cpu(bg->bg_bits),
1558                                                         res);
1559                 if (!ret && max_block) {
1560                         blkoff = le64_to_cpu(bg->bg_blkno) +
1561                                 res->sr_bit_offset + res->sr_bits;
1562                         mlog(0, "Checking %llu against %llu\n",
1563                              (unsigned long long)blkoff,
1564                              (unsigned long long)max_block);
1565                         if (blkoff > max_block)
1566                                 ret = -ENOSPC;
1567                 }
1568         }
1569
1570         return ret;
1571 }
1572
1573 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1574                                        handle_t *handle,
1575                                        struct buffer_head *di_bh,
1576                                        u32 num_bits,
1577                                        u16 chain)
1578 {
1579         int ret;
1580         u32 tmp_used;
1581         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1582         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1583
1584         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1585                                       OCFS2_JOURNAL_ACCESS_WRITE);
1586         if (ret < 0) {
1587                 mlog_errno(ret);
1588                 goto out;
1589         }
1590
1591         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1592         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1593         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1594         ocfs2_journal_dirty(handle, di_bh);
1595
1596 out:
1597         return ret;
1598 }
1599
1600 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1601                                          struct ocfs2_extent_rec *rec,
1602                                          struct ocfs2_chain_list *cl)
1603 {
1604         unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1605         unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1606         unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1607
1608         if (res->sr_bit_offset < bitoff)
1609                 return 0;
1610         if (res->sr_bit_offset >= (bitoff + bitcount))
1611                 return 0;
1612         res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1613                 (res->sr_bit_offset - bitoff);
1614         if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1615                 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1616         return 1;
1617 }
1618
1619 static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1620                                           struct ocfs2_group_desc *bg,
1621                                           struct ocfs2_suballoc_result *res)
1622 {
1623         int i;
1624         u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
1625         struct ocfs2_extent_rec *rec;
1626         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1627         struct ocfs2_chain_list *cl = &di->id2.i_chain;
1628
1629         if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1630                 res->sr_blkno = 0;
1631                 return;
1632         }
1633
1634         res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1635         res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
1636         if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1637             !bg->bg_list.l_next_free_rec)
1638                 return;
1639
1640         for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1641                 rec = &bg->bg_list.l_recs[i];
1642                 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1643                         res->sr_bg_blkno = bg_blkno;  /* Restore */
1644                         break;
1645                 }
1646         }
1647 }
1648
1649 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1650                                   handle_t *handle,
1651                                   u32 bits_wanted,
1652                                   u32 min_bits,
1653                                   struct ocfs2_suballoc_result *res,
1654                                   u16 *bits_left)
1655 {
1656         int ret;
1657         struct buffer_head *group_bh = NULL;
1658         struct ocfs2_group_desc *gd;
1659         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1660         struct inode *alloc_inode = ac->ac_inode;
1661
1662         ret = ocfs2_read_group_descriptor(alloc_inode, di,
1663                                           res->sr_bg_blkno, &group_bh);
1664         if (ret < 0) {
1665                 mlog_errno(ret);
1666                 return ret;
1667         }
1668
1669         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1670         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1671                                   ac->ac_max_block, res);
1672         if (ret < 0) {
1673                 if (ret != -ENOSPC)
1674                         mlog_errno(ret);
1675                 goto out;
1676         }
1677
1678         if (!ret)
1679                 ocfs2_bg_discontig_fix_result(ac, gd, res);
1680
1681         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1682                                                res->sr_bits,
1683                                                le16_to_cpu(gd->bg_chain));
1684         if (ret < 0) {
1685                 mlog_errno(ret);
1686                 goto out;
1687         }
1688
1689         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1690                                          res->sr_bit_offset, res->sr_bits);
1691         if (ret < 0)
1692                 mlog_errno(ret);
1693
1694         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1695
1696 out:
1697         brelse(group_bh);
1698
1699         return ret;
1700 }
1701
1702 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1703                               handle_t *handle,
1704                               u32 bits_wanted,
1705                               u32 min_bits,
1706                               struct ocfs2_suballoc_result *res,
1707                               u16 *bits_left)
1708 {
1709         int status;
1710         u16 chain;
1711         u32 tmp_used;
1712         u64 next_group;
1713         struct inode *alloc_inode = ac->ac_inode;
1714         struct buffer_head *group_bh = NULL;
1715         struct buffer_head *prev_group_bh = NULL;
1716         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1717         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1718         struct ocfs2_group_desc *bg;
1719
1720         chain = ac->ac_chain;
1721         mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1722              bits_wanted, chain,
1723              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1724
1725         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1726                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1727                                              &group_bh);
1728         if (status < 0) {
1729                 mlog_errno(status);
1730                 goto bail;
1731         }
1732         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1733
1734         status = -ENOSPC;
1735         /* for now, the chain search is a bit simplistic. We just use
1736          * the 1st group with any empty bits. */
1737         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1738                                              bits_wanted, min_bits,
1739                                              ac->ac_max_block,
1740                                              res)) == -ENOSPC) {
1741                 if (!bg->bg_next_group)
1742                         break;
1743
1744                 brelse(prev_group_bh);
1745                 prev_group_bh = NULL;
1746
1747                 next_group = le64_to_cpu(bg->bg_next_group);
1748                 prev_group_bh = group_bh;
1749                 group_bh = NULL;
1750                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1751                                                      next_group, &group_bh);
1752                 if (status < 0) {
1753                         mlog_errno(status);
1754                         goto bail;
1755                 }
1756                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1757         }
1758         if (status < 0) {
1759                 if (status != -ENOSPC)
1760                         mlog_errno(status);
1761                 goto bail;
1762         }
1763
1764         mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1765              res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1766
1767         res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1768
1769         BUG_ON(res->sr_bits == 0);
1770         if (!status)
1771                 ocfs2_bg_discontig_fix_result(ac, bg, res);
1772
1773
1774         /*
1775          * Keep track of previous block descriptor read. When
1776          * we find a target, if we have read more than X
1777          * number of descriptors, and the target is reasonably
1778          * empty, relink him to top of his chain.
1779          *
1780          * We've read 0 extra blocks and only send one more to
1781          * the transaction, yet the next guy to search has a
1782          * much easier time.
1783          *
1784          * Do this *after* figuring out how many bits we're taking out
1785          * of our target group.
1786          */
1787         if (ac->ac_allow_chain_relink &&
1788             (prev_group_bh) &&
1789             (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1790                 status = ocfs2_relink_block_group(handle, alloc_inode,
1791                                                   ac->ac_bh, group_bh,
1792                                                   prev_group_bh, chain);
1793                 if (status < 0) {
1794                         mlog_errno(status);
1795                         goto bail;
1796                 }
1797         }
1798
1799         /* Ok, claim our bits now: set the info on dinode, chainlist
1800          * and then the group */
1801         status = ocfs2_journal_access_di(handle,
1802                                          INODE_CACHE(alloc_inode),
1803                                          ac->ac_bh,
1804                                          OCFS2_JOURNAL_ACCESS_WRITE);
1805         if (status < 0) {
1806                 mlog_errno(status);
1807                 goto bail;
1808         }
1809
1810         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1811         fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
1812         le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
1813         ocfs2_journal_dirty(handle, ac->ac_bh);
1814
1815         status = ocfs2_block_group_set_bits(handle,
1816                                             alloc_inode,
1817                                             bg,
1818                                             group_bh,
1819                                             res->sr_bit_offset,
1820                                             res->sr_bits);
1821         if (status < 0) {
1822                 mlog_errno(status);
1823                 goto bail;
1824         }
1825
1826         mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1827              (unsigned long long)le64_to_cpu(fe->i_blkno));
1828
1829         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1830 bail:
1831         brelse(group_bh);
1832         brelse(prev_group_bh);
1833
1834         mlog_exit(status);
1835         return status;
1836 }
1837
1838 /* will give out up to bits_wanted contiguous bits. */
1839 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1840                                      handle_t *handle,
1841                                      u32 bits_wanted,
1842                                      u32 min_bits,
1843                                      struct ocfs2_suballoc_result *res)
1844 {
1845         int status;
1846         u16 victim, i;
1847         u16 bits_left = 0;
1848         struct ocfs2_chain_list *cl;
1849         struct ocfs2_dinode *fe;
1850
1851         mlog_entry_void();
1852
1853         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1854         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1855         BUG_ON(!ac->ac_bh);
1856
1857         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1858
1859         /* The bh was validated by the inode read during
1860          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1861         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1862
1863         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1864             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1865                 ocfs2_error(ac->ac_inode->i_sb,
1866                             "Chain allocator dinode %llu has %u used "
1867                             "bits but only %u total.",
1868                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1869                             le32_to_cpu(fe->id1.bitmap1.i_used),
1870                             le32_to_cpu(fe->id1.bitmap1.i_total));
1871                 status = -EIO;
1872                 goto bail;
1873         }
1874
1875         res->sr_bg_blkno = ac->ac_last_group;
1876         if (res->sr_bg_blkno) {
1877                 /* Attempt to short-circuit the usual search mechanism
1878                  * by jumping straight to the most recently used
1879                  * allocation group. This helps us mantain some
1880                  * contiguousness across allocations. */
1881                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1882                                                 min_bits, res, &bits_left);
1883                 if (!status)
1884                         goto set_hint;
1885                 if (status < 0 && status != -ENOSPC) {
1886                         mlog_errno(status);
1887                         goto bail;
1888                 }
1889         }
1890
1891         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1892
1893         victim = ocfs2_find_victim_chain(cl);
1894         ac->ac_chain = victim;
1895         ac->ac_allow_chain_relink = 1;
1896
1897         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1898                                     res, &bits_left);
1899         if (!status)
1900                 goto set_hint;
1901         if (status < 0 && status != -ENOSPC) {
1902                 mlog_errno(status);
1903                 goto bail;
1904         }
1905
1906         mlog(0, "Search of victim chain %u came up with nothing, "
1907              "trying all chains now.\n", victim);
1908
1909         /* If we didn't pick a good victim, then just default to
1910          * searching each chain in order. Don't allow chain relinking
1911          * because we only calculate enough journal credits for one
1912          * relink per alloc. */
1913         ac->ac_allow_chain_relink = 0;
1914         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1915                 if (i == victim)
1916                         continue;
1917                 if (!cl->cl_recs[i].c_free)
1918                         continue;
1919
1920                 ac->ac_chain = i;
1921                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1922                                             res, &bits_left);
1923                 if (!status)
1924                         break;
1925                 if (status < 0 && status != -ENOSPC) {
1926                         mlog_errno(status);
1927                         goto bail;
1928                 }
1929         }
1930
1931 set_hint:
1932         if (status != -ENOSPC) {
1933                 /* If the next search of this group is not likely to
1934                  * yield a suitable extent, then we reset the last
1935                  * group hint so as to not waste a disk read */
1936                 if (bits_left < min_bits)
1937                         ac->ac_last_group = 0;
1938                 else
1939                         ac->ac_last_group = res->sr_bg_blkno;
1940         }
1941
1942 bail:
1943         mlog_exit(status);
1944         return status;
1945 }
1946
1947 int ocfs2_claim_metadata(handle_t *handle,
1948                          struct ocfs2_alloc_context *ac,
1949                          u32 bits_wanted,
1950                          u64 *suballoc_loc,
1951                          u16 *suballoc_bit_start,
1952                          unsigned int *num_bits,
1953                          u64 *blkno_start)
1954 {
1955         int status;
1956         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1957
1958         BUG_ON(!ac);
1959         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1960         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1961
1962         status = ocfs2_claim_suballoc_bits(ac,
1963                                            handle,
1964                                            bits_wanted,
1965                                            1,
1966                                            &res);
1967         if (status < 0) {
1968                 mlog_errno(status);
1969                 goto bail;
1970         }
1971         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1972
1973         *suballoc_loc = res.sr_bg_blkno;
1974         *suballoc_bit_start = res.sr_bit_offset;
1975         *blkno_start = res.sr_blkno;
1976         ac->ac_bits_given += res.sr_bits;
1977         *num_bits = res.sr_bits;
1978         status = 0;
1979 bail:
1980         mlog_exit(status);
1981         return status;
1982 }
1983
1984 static void ocfs2_init_inode_ac_group(struct inode *dir,
1985                                       struct buffer_head *parent_di_bh,
1986                                       struct ocfs2_alloc_context *ac)
1987 {
1988         struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
1989         /*
1990          * Try to allocate inodes from some specific group.
1991          *
1992          * If the parent dir has recorded the last group used in allocation,
1993          * cool, use it. Otherwise if we try to allocate new inode from the
1994          * same slot the parent dir belongs to, use the same chunk.
1995          *
1996          * We are very careful here to avoid the mistake of setting
1997          * ac_last_group to a group descriptor from a different (unlocked) slot.
1998          */
1999         if (OCFS2_I(dir)->ip_last_used_group &&
2000             OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2001                 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2002         else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2003                 if (di->i_suballoc_loc)
2004                         ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2005                 else
2006                         ac->ac_last_group = ocfs2_which_suballoc_group(
2007                                         le64_to_cpu(di->i_blkno),
2008                                         le16_to_cpu(di->i_suballoc_bit));
2009         }
2010 }
2011
2012 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2013                                              struct ocfs2_alloc_context *ac)
2014 {
2015         OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2016         OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2017 }
2018
2019 int ocfs2_claim_new_inode(handle_t *handle,
2020                           struct inode *dir,
2021                           struct buffer_head *parent_fe_bh,
2022                           struct ocfs2_alloc_context *ac,
2023                           u64 *suballoc_loc,
2024                           u16 *suballoc_bit,
2025                           u64 *fe_blkno)
2026 {
2027         int status;
2028         struct ocfs2_suballoc_result res;
2029
2030         mlog_entry_void();
2031
2032         BUG_ON(!ac);
2033         BUG_ON(ac->ac_bits_given != 0);
2034         BUG_ON(ac->ac_bits_wanted != 1);
2035         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2036
2037         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2038
2039         status = ocfs2_claim_suballoc_bits(ac,
2040                                            handle,
2041                                            1,
2042                                            1,
2043                                            &res);
2044         if (status < 0) {
2045                 mlog_errno(status);
2046                 goto bail;
2047         }
2048         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2049
2050         BUG_ON(res.sr_bits != 1);
2051
2052         *suballoc_loc = res.sr_bg_blkno;
2053         *suballoc_bit = res.sr_bit_offset;
2054         *fe_blkno = res.sr_blkno;
2055         ac->ac_bits_given++;
2056         ocfs2_save_inode_ac_group(dir, ac);
2057         status = 0;
2058 bail:
2059         mlog_exit(status);
2060         return status;
2061 }
2062
2063 /* translate a group desc. blkno and it's bitmap offset into
2064  * disk cluster offset. */
2065 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2066                                                    u64 bg_blkno,
2067                                                    u16 bg_bit_off)
2068 {
2069         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2070         u32 cluster = 0;
2071
2072         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2073
2074         if (bg_blkno != osb->first_cluster_group_blkno)
2075                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2076         cluster += (u32) bg_bit_off;
2077         return cluster;
2078 }
2079
2080 /* given a cluster offset, calculate which block group it belongs to
2081  * and return that block offset. */
2082 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2083 {
2084         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2085         u32 group_no;
2086
2087         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2088
2089         group_no = cluster / osb->bitmap_cpg;
2090         if (!group_no)
2091                 return osb->first_cluster_group_blkno;
2092         return ocfs2_clusters_to_blocks(inode->i_sb,
2093                                         group_no * osb->bitmap_cpg);
2094 }
2095
2096 /* given the block number of a cluster start, calculate which cluster
2097  * group and descriptor bitmap offset that corresponds to. */
2098 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2099                                                 u64 data_blkno,
2100                                                 u64 *bg_blkno,
2101                                                 u16 *bg_bit_off)
2102 {
2103         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2104         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2105
2106         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2107
2108         *bg_blkno = ocfs2_which_cluster_group(inode,
2109                                               data_cluster);
2110
2111         if (*bg_blkno == osb->first_cluster_group_blkno)
2112                 *bg_bit_off = (u16) data_cluster;
2113         else
2114                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2115                                                              data_blkno - *bg_blkno);
2116 }
2117
2118 /*
2119  * min_bits - minimum contiguous chunk from this total allocation we
2120  * can handle. set to what we asked for originally for a full
2121  * contig. allocation, set to '1' to indicate we can deal with extents
2122  * of any size.
2123  */
2124 int __ocfs2_claim_clusters(handle_t *handle,
2125                            struct ocfs2_alloc_context *ac,
2126                            u32 min_clusters,
2127                            u32 max_clusters,
2128                            u32 *cluster_start,
2129                            u32 *num_clusters)
2130 {
2131         int status;
2132         unsigned int bits_wanted = max_clusters;
2133         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2134         struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2135
2136         mlog_entry_void();
2137
2138         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2139
2140         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2141                && ac->ac_which != OCFS2_AC_USE_MAIN);
2142
2143         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2144                 WARN_ON(min_clusters > 1);
2145
2146                 status = ocfs2_claim_local_alloc_bits(osb,
2147                                                       handle,
2148                                                       ac,
2149                                                       bits_wanted,
2150                                                       cluster_start,
2151                                                       num_clusters);
2152                 if (!status)
2153                         atomic_inc(&osb->alloc_stats.local_data);
2154         } else {
2155                 if (min_clusters > (osb->bitmap_cpg - 1)) {
2156                         /* The only paths asking for contiguousness
2157                          * should know about this already. */
2158                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2159                              "group bitmap size %u!\n", min_clusters,
2160                              osb->bitmap_cpg);
2161                         status = -ENOSPC;
2162                         goto bail;
2163                 }
2164                 /* clamp the current request down to a realistic size. */
2165                 if (bits_wanted > (osb->bitmap_cpg - 1))
2166                         bits_wanted = osb->bitmap_cpg - 1;
2167
2168                 status = ocfs2_claim_suballoc_bits(ac,
2169                                                    handle,
2170                                                    bits_wanted,
2171                                                    min_clusters,
2172                                                    &res);
2173                 if (!status) {
2174                         BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2175                         *cluster_start =
2176                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2177                                                                  res.sr_bg_blkno,
2178                                                                  res.sr_bit_offset);
2179                         atomic_inc(&osb->alloc_stats.bitmap_data);
2180                         *num_clusters = res.sr_bits;
2181                 }
2182         }
2183         if (status < 0) {
2184                 if (status != -ENOSPC)
2185                         mlog_errno(status);
2186                 goto bail;
2187         }
2188
2189         ac->ac_bits_given += *num_clusters;
2190
2191 bail:
2192         mlog_exit(status);
2193         return status;
2194 }
2195
2196 int ocfs2_claim_clusters(handle_t *handle,
2197                          struct ocfs2_alloc_context *ac,
2198                          u32 min_clusters,
2199                          u32 *cluster_start,
2200                          u32 *num_clusters)
2201 {
2202         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2203
2204         return __ocfs2_claim_clusters(handle, ac, min_clusters,
2205                                       bits_wanted, cluster_start, num_clusters);
2206 }
2207
2208 static int ocfs2_block_group_clear_bits(handle_t *handle,
2209                                         struct inode *alloc_inode,
2210                                         struct ocfs2_group_desc *bg,
2211                                         struct buffer_head *group_bh,
2212                                         unsigned int bit_off,
2213                                         unsigned int num_bits,
2214                                         void (*undo_fn)(unsigned int bit,
2215                                                         unsigned long *bmap))
2216 {
2217         int status;
2218         unsigned int tmp;
2219         struct ocfs2_group_desc *undo_bg = NULL;
2220
2221         mlog_entry_void();
2222
2223         /* The caller got this descriptor from
2224          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
2225         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2226
2227         mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
2228
2229         BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2230         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2231                                          group_bh,
2232                                          undo_fn ?
2233                                          OCFS2_JOURNAL_ACCESS_UNDO :
2234                                          OCFS2_JOURNAL_ACCESS_WRITE);
2235         if (status < 0) {
2236                 mlog_errno(status);
2237                 goto bail;
2238         }
2239
2240         if (undo_fn) {
2241                 jbd_lock_bh_state(group_bh);
2242                 undo_bg = (struct ocfs2_group_desc *)
2243                                         bh2jh(group_bh)->b_committed_data;
2244                 BUG_ON(!undo_bg);
2245         }
2246
2247         tmp = num_bits;
2248         while(tmp--) {
2249                 ocfs2_clear_bit((bit_off + tmp),
2250                                 (unsigned long *) bg->bg_bitmap);
2251                 if (undo_fn)
2252                         undo_fn(bit_off + tmp,
2253                                 (unsigned long *) undo_bg->bg_bitmap);
2254         }
2255         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2256
2257         if (undo_fn)
2258                 jbd_unlock_bh_state(group_bh);
2259
2260         ocfs2_journal_dirty(handle, group_bh);
2261 bail:
2262         return status;
2263 }
2264
2265 /*
2266  * expects the suballoc inode to already be locked.
2267  */
2268 static int _ocfs2_free_suballoc_bits(handle_t *handle,
2269                                      struct inode *alloc_inode,
2270                                      struct buffer_head *alloc_bh,
2271                                      unsigned int start_bit,
2272                                      u64 bg_blkno,
2273                                      unsigned int count,
2274                                      void (*undo_fn)(unsigned int bit,
2275                                                      unsigned long *bitmap))
2276 {
2277         int status = 0;
2278         u32 tmp_used;
2279         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2280         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2281         struct buffer_head *group_bh = NULL;
2282         struct ocfs2_group_desc *group;
2283
2284         mlog_entry_void();
2285
2286         /* The alloc_bh comes from ocfs2_free_dinode() or
2287          * ocfs2_free_clusters().  The callers have all locked the
2288          * allocator and gotten alloc_bh from the lock call.  This
2289          * validates the dinode buffer.  Any corruption that has happended
2290          * is a code bug. */
2291         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2292         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2293
2294         mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
2295              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
2296              (unsigned long long)bg_blkno, start_bit);
2297
2298         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2299                                              &group_bh);
2300         if (status < 0) {
2301                 mlog_errno(status);
2302                 goto bail;
2303         }
2304         group = (struct ocfs2_group_desc *) group_bh->b_data;
2305
2306         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2307
2308         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2309                                               group, group_bh,
2310                                               start_bit, count, undo_fn);
2311         if (status < 0) {
2312                 mlog_errno(status);
2313                 goto bail;
2314         }
2315
2316         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2317                                          alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2318         if (status < 0) {
2319                 mlog_errno(status);
2320                 goto bail;
2321         }
2322
2323         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2324                      count);
2325         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2326         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2327         ocfs2_journal_dirty(handle, alloc_bh);
2328
2329 bail:
2330         brelse(group_bh);
2331
2332         mlog_exit(status);
2333         return status;
2334 }
2335
2336 int ocfs2_free_suballoc_bits(handle_t *handle,
2337                              struct inode *alloc_inode,
2338                              struct buffer_head *alloc_bh,
2339                              unsigned int start_bit,
2340                              u64 bg_blkno,
2341                              unsigned int count)
2342 {
2343         return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2344                                          start_bit, bg_blkno, count, NULL);
2345 }
2346
2347 int ocfs2_free_dinode(handle_t *handle,
2348                       struct inode *inode_alloc_inode,
2349                       struct buffer_head *inode_alloc_bh,
2350                       struct ocfs2_dinode *di)
2351 {
2352         u64 blk = le64_to_cpu(di->i_blkno);
2353         u16 bit = le16_to_cpu(di->i_suballoc_bit);
2354         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2355
2356         if (di->i_suballoc_loc)
2357                 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2358         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2359                                         inode_alloc_bh, bit, bg_blkno, 1);
2360 }
2361
2362 static int _ocfs2_free_clusters(handle_t *handle,
2363                                 struct inode *bitmap_inode,
2364                                 struct buffer_head *bitmap_bh,
2365                                 u64 start_blk,
2366                                 unsigned int num_clusters,
2367                                 void (*undo_fn)(unsigned int bit,
2368                                                 unsigned long *bitmap))
2369 {
2370         int status;
2371         u16 bg_start_bit;
2372         u64 bg_blkno;
2373         struct ocfs2_dinode *fe;
2374
2375         /* You can't ever have a contiguous set of clusters
2376          * bigger than a block group bitmap so we never have to worry
2377          * about looping on them. */
2378
2379         mlog_entry_void();
2380
2381         /* This is expensive. We can safely remove once this stuff has
2382          * gotten tested really well. */
2383         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2384
2385         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2386
2387         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2388                                      &bg_start_bit);
2389
2390         mlog(0, "want to free %u clusters starting at block %llu\n",
2391              num_clusters, (unsigned long long)start_blk);
2392         mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2393              (unsigned long long)bg_blkno, bg_start_bit);
2394
2395         status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2396                                            bg_start_bit, bg_blkno,
2397                                            num_clusters, undo_fn);
2398         if (status < 0) {
2399                 mlog_errno(status);
2400                 goto out;
2401         }
2402
2403         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2404                                          num_clusters);
2405
2406 out:
2407         mlog_exit(status);
2408         return status;
2409 }
2410
2411 int ocfs2_free_clusters(handle_t *handle,
2412                         struct inode *bitmap_inode,
2413                         struct buffer_head *bitmap_bh,
2414                         u64 start_blk,
2415                         unsigned int num_clusters)
2416 {
2417         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2418                                     start_blk, num_clusters,
2419                                     _ocfs2_set_bit);
2420 }
2421
2422 /*
2423  * Give never-used clusters back to the global bitmap.  We don't need
2424  * to protect these bits in the undo buffer.
2425  */
2426 int ocfs2_release_clusters(handle_t *handle,
2427                            struct inode *bitmap_inode,
2428                            struct buffer_head *bitmap_bh,
2429                            u64 start_blk,
2430                            unsigned int num_clusters)
2431 {
2432         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2433                                     start_blk, num_clusters,
2434                                     _ocfs2_clear_bit);
2435 }
2436
2437 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2438 {
2439         printk("Block Group:\n");
2440         printk("bg_signature:       %s\n", bg->bg_signature);
2441         printk("bg_size:            %u\n", bg->bg_size);
2442         printk("bg_bits:            %u\n", bg->bg_bits);
2443         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2444         printk("bg_chain:           %u\n", bg->bg_chain);
2445         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2446         printk("bg_next_group:      %llu\n",
2447                (unsigned long long)bg->bg_next_group);
2448         printk("bg_parent_dinode:   %llu\n",
2449                (unsigned long long)bg->bg_parent_dinode);
2450         printk("bg_blkno:           %llu\n",
2451                (unsigned long long)bg->bg_blkno);
2452 }
2453
2454 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2455 {
2456         int i;
2457
2458         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2459         printk("i_signature:                  %s\n", fe->i_signature);
2460         printk("i_size:                       %llu\n",
2461                (unsigned long long)fe->i_size);
2462         printk("i_clusters:                   %u\n", fe->i_clusters);
2463         printk("i_generation:                 %u\n",
2464                le32_to_cpu(fe->i_generation));
2465         printk("id1.bitmap1.i_used:           %u\n",
2466                le32_to_cpu(fe->id1.bitmap1.i_used));
2467         printk("id1.bitmap1.i_total:          %u\n",
2468                le32_to_cpu(fe->id1.bitmap1.i_total));
2469         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2470         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2471         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2472         printk("id2.i_chain.cl_next_free_rec: %u\n",
2473                fe->id2.i_chain.cl_next_free_rec);
2474         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2475                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2476                        fe->id2.i_chain.cl_recs[i].c_free);
2477                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2478                        fe->id2.i_chain.cl_recs[i].c_total);
2479                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2480                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2481         }
2482 }
2483
2484 /*
2485  * For a given allocation, determine which allocators will need to be
2486  * accessed, and lock them, reserving the appropriate number of bits.
2487  *
2488  * Sparse file systems call this from ocfs2_write_begin_nolock()
2489  * and ocfs2_allocate_unwritten_extents().
2490  *
2491  * File systems which don't support holes call this from
2492  * ocfs2_extend_allocation().
2493  */
2494 int ocfs2_lock_allocators(struct inode *inode,
2495                           struct ocfs2_extent_tree *et,
2496                           u32 clusters_to_add, u32 extents_to_split,
2497                           struct ocfs2_alloc_context **data_ac,
2498                           struct ocfs2_alloc_context **meta_ac)
2499 {
2500         int ret = 0, num_free_extents;
2501         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2502         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2503
2504         *meta_ac = NULL;
2505         if (data_ac)
2506                 *data_ac = NULL;
2507
2508         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2509
2510         num_free_extents = ocfs2_num_free_extents(osb, et);
2511         if (num_free_extents < 0) {
2512                 ret = num_free_extents;
2513                 mlog_errno(ret);
2514                 goto out;
2515         }
2516
2517         /*
2518          * Sparse allocation file systems need to be more conservative
2519          * with reserving room for expansion - the actual allocation
2520          * happens while we've got a journal handle open so re-taking
2521          * a cluster lock (because we ran out of room for another
2522          * extent) will violate ordering rules.
2523          *
2524          * Most of the time we'll only be seeing this 1 cluster at a time
2525          * anyway.
2526          *
2527          * Always lock for any unwritten extents - we might want to
2528          * add blocks during a split.
2529          */
2530         if (!num_free_extents ||
2531             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2532                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2533                 if (ret < 0) {
2534                         if (ret != -ENOSPC)
2535                                 mlog_errno(ret);
2536                         goto out;
2537                 }
2538         }
2539
2540         if (clusters_to_add == 0)
2541                 goto out;
2542
2543         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2544         if (ret < 0) {
2545                 if (ret != -ENOSPC)
2546                         mlog_errno(ret);
2547                 goto out;
2548         }
2549
2550 out:
2551         if (ret) {
2552                 if (*meta_ac) {
2553                         ocfs2_free_alloc_context(*meta_ac);
2554                         *meta_ac = NULL;
2555                 }
2556
2557                 /*
2558                  * We cannot have an error and a non null *data_ac.
2559                  */
2560         }
2561
2562         return ret;
2563 }
2564
2565 /*
2566  * Read the inode specified by blkno to get suballoc_slot and
2567  * suballoc_bit.
2568  */
2569 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2570                                        u16 *suballoc_slot, u16 *suballoc_bit)
2571 {
2572         int status;
2573         struct buffer_head *inode_bh = NULL;
2574         struct ocfs2_dinode *inode_fe;
2575
2576         mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2577
2578         /* dirty read disk */
2579         status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2580         if (status < 0) {
2581                 mlog(ML_ERROR, "read block %llu failed %d\n",
2582                      (unsigned long long)blkno, status);
2583                 goto bail;
2584         }
2585
2586         inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2587         if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2588                 mlog(ML_ERROR, "invalid inode %llu requested\n",
2589                      (unsigned long long)blkno);
2590                 status = -EINVAL;
2591                 goto bail;
2592         }
2593
2594         if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2595             (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2596                 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2597                      (unsigned long long)blkno,
2598                      (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2599                 status = -EINVAL;
2600                 goto bail;
2601         }
2602
2603         if (suballoc_slot)
2604                 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2605         if (suballoc_bit)
2606                 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2607
2608 bail:
2609         brelse(inode_bh);
2610
2611         mlog_exit(status);
2612         return status;
2613 }
2614
2615 /*
2616  * test whether bit is SET in allocator bitmap or not.  on success, 0
2617  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2618  * is returned and *res is meaningless.  Call this after you have
2619  * cluster locked against suballoc, or you may get a result based on
2620  * non-up2date contents
2621  */
2622 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2623                                    struct inode *suballoc,
2624                                    struct buffer_head *alloc_bh, u64 blkno,
2625                                    u16 bit, int *res)
2626 {
2627         struct ocfs2_dinode *alloc_di;
2628         struct ocfs2_group_desc *group;
2629         struct buffer_head *group_bh = NULL;
2630         u64 bg_blkno;
2631         int status;
2632
2633         mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2634                    (unsigned int)bit);
2635
2636         alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2637         if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2638                 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2639                      (unsigned int)bit,
2640                      ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2641                 status = -EINVAL;
2642                 goto bail;
2643         }
2644
2645         if (alloc_di->i_suballoc_loc)
2646                 bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
2647         else
2648                 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2649         status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2650                                              &group_bh);
2651         if (status < 0) {
2652                 mlog(ML_ERROR, "read group %llu failed %d\n",
2653                      (unsigned long long)bg_blkno, status);
2654                 goto bail;
2655         }
2656
2657         group = (struct ocfs2_group_desc *) group_bh->b_data;
2658         *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2659
2660 bail:
2661         brelse(group_bh);
2662
2663         mlog_exit(status);
2664         return status;
2665 }
2666
2667 /*
2668  * Test if the bit representing this inode (blkno) is set in the
2669  * suballocator.
2670  *
2671  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2672  *
2673  * In the event of failure, a negative value is returned and *res is
2674  * meaningless.
2675  *
2676  * Callers must make sure to hold nfs_sync_lock to prevent
2677  * ocfs2_delete_inode() on another node from accessing the same
2678  * suballocator concurrently.
2679  */
2680 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2681 {
2682         int status;
2683         u16 suballoc_bit = 0, suballoc_slot = 0;
2684         struct inode *inode_alloc_inode;
2685         struct buffer_head *alloc_bh = NULL;
2686
2687         mlog_entry("blkno: %llu", (unsigned long long)blkno);
2688
2689         status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2690                                              &suballoc_bit);
2691         if (status < 0) {
2692                 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2693                 goto bail;
2694         }
2695
2696         inode_alloc_inode =
2697                 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2698                                             suballoc_slot);
2699         if (!inode_alloc_inode) {
2700                 /* the error code could be inaccurate, but we are not able to
2701                  * get the correct one. */
2702                 status = -EINVAL;
2703                 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2704                      (u32)suballoc_slot);
2705                 goto bail;
2706         }
2707
2708         mutex_lock(&inode_alloc_inode->i_mutex);
2709         status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2710         if (status < 0) {
2711                 mutex_unlock(&inode_alloc_inode->i_mutex);
2712                 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2713                      (u32)suballoc_slot, status);
2714                 goto bail;
2715         }
2716
2717         status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2718                                          blkno, suballoc_bit, res);
2719         if (status < 0)
2720                 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2721
2722         ocfs2_inode_unlock(inode_alloc_inode, 0);
2723         mutex_unlock(&inode_alloc_inode->i_mutex);
2724
2725         iput(inode_alloc_inode);
2726         brelse(alloc_bh);
2727 bail:
2728         mlog_exit(status);
2729         return status;
2730 }