btrfs: get fs_info from block group in lookup_free_space_inode
[sfrench/cifs-2.6.git] / fs / btrfs / free-space-cache.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2008 Red Hat.  All rights reserved.
4  */
5
6 #include <linux/pagemap.h>
7 #include <linux/sched.h>
8 #include <linux/sched/signal.h>
9 #include <linux/slab.h>
10 #include <linux/math64.h>
11 #include <linux/ratelimit.h>
12 #include <linux/error-injection.h>
13 #include <linux/sched/mm.h>
14 #include "ctree.h"
15 #include "free-space-cache.h"
16 #include "transaction.h"
17 #include "disk-io.h"
18 #include "extent_io.h"
19 #include "inode-map.h"
20 #include "volumes.h"
21
22 #define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
23 #define MAX_CACHE_BYTES_PER_GIG SZ_32K
24
25 struct btrfs_trim_range {
26         u64 start;
27         u64 bytes;
28         struct list_head list;
29 };
30
31 static int link_free_space(struct btrfs_free_space_ctl *ctl,
32                            struct btrfs_free_space *info);
33 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
34                               struct btrfs_free_space *info);
35 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
36                              struct btrfs_trans_handle *trans,
37                              struct btrfs_io_ctl *io_ctl,
38                              struct btrfs_path *path);
39
40 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
41                                                struct btrfs_path *path,
42                                                u64 offset)
43 {
44         struct btrfs_fs_info *fs_info = root->fs_info;
45         struct btrfs_key key;
46         struct btrfs_key location;
47         struct btrfs_disk_key disk_key;
48         struct btrfs_free_space_header *header;
49         struct extent_buffer *leaf;
50         struct inode *inode = NULL;
51         unsigned nofs_flag;
52         int ret;
53
54         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
55         key.offset = offset;
56         key.type = 0;
57
58         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
59         if (ret < 0)
60                 return ERR_PTR(ret);
61         if (ret > 0) {
62                 btrfs_release_path(path);
63                 return ERR_PTR(-ENOENT);
64         }
65
66         leaf = path->nodes[0];
67         header = btrfs_item_ptr(leaf, path->slots[0],
68                                 struct btrfs_free_space_header);
69         btrfs_free_space_key(leaf, header, &disk_key);
70         btrfs_disk_key_to_cpu(&location, &disk_key);
71         btrfs_release_path(path);
72
73         /*
74          * We are often under a trans handle at this point, so we need to make
75          * sure NOFS is set to keep us from deadlocking.
76          */
77         nofs_flag = memalloc_nofs_save();
78         inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path);
79         btrfs_release_path(path);
80         memalloc_nofs_restore(nofs_flag);
81         if (IS_ERR(inode))
82                 return inode;
83
84         mapping_set_gfp_mask(inode->i_mapping,
85                         mapping_gfp_constraint(inode->i_mapping,
86                         ~(__GFP_FS | __GFP_HIGHMEM)));
87
88         return inode;
89 }
90
91 struct inode *lookup_free_space_inode(
92                 struct btrfs_block_group_cache *block_group,
93                 struct btrfs_path *path)
94 {
95         struct btrfs_fs_info *fs_info = block_group->fs_info;
96         struct inode *inode = NULL;
97         u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
98
99         spin_lock(&block_group->lock);
100         if (block_group->inode)
101                 inode = igrab(block_group->inode);
102         spin_unlock(&block_group->lock);
103         if (inode)
104                 return inode;
105
106         inode = __lookup_free_space_inode(fs_info->tree_root, path,
107                                           block_group->key.objectid);
108         if (IS_ERR(inode))
109                 return inode;
110
111         spin_lock(&block_group->lock);
112         if (!((BTRFS_I(inode)->flags & flags) == flags)) {
113                 btrfs_info(fs_info, "Old style space inode found, converting.");
114                 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
115                         BTRFS_INODE_NODATACOW;
116                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
117         }
118
119         if (!block_group->iref) {
120                 block_group->inode = igrab(inode);
121                 block_group->iref = 1;
122         }
123         spin_unlock(&block_group->lock);
124
125         return inode;
126 }
127
128 static int __create_free_space_inode(struct btrfs_root *root,
129                                      struct btrfs_trans_handle *trans,
130                                      struct btrfs_path *path,
131                                      u64 ino, u64 offset)
132 {
133         struct btrfs_key key;
134         struct btrfs_disk_key disk_key;
135         struct btrfs_free_space_header *header;
136         struct btrfs_inode_item *inode_item;
137         struct extent_buffer *leaf;
138         u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
139         int ret;
140
141         ret = btrfs_insert_empty_inode(trans, root, path, ino);
142         if (ret)
143                 return ret;
144
145         /* We inline crc's for the free disk space cache */
146         if (ino != BTRFS_FREE_INO_OBJECTID)
147                 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
148
149         leaf = path->nodes[0];
150         inode_item = btrfs_item_ptr(leaf, path->slots[0],
151                                     struct btrfs_inode_item);
152         btrfs_item_key(leaf, &disk_key, path->slots[0]);
153         memzero_extent_buffer(leaf, (unsigned long)inode_item,
154                              sizeof(*inode_item));
155         btrfs_set_inode_generation(leaf, inode_item, trans->transid);
156         btrfs_set_inode_size(leaf, inode_item, 0);
157         btrfs_set_inode_nbytes(leaf, inode_item, 0);
158         btrfs_set_inode_uid(leaf, inode_item, 0);
159         btrfs_set_inode_gid(leaf, inode_item, 0);
160         btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
161         btrfs_set_inode_flags(leaf, inode_item, flags);
162         btrfs_set_inode_nlink(leaf, inode_item, 1);
163         btrfs_set_inode_transid(leaf, inode_item, trans->transid);
164         btrfs_set_inode_block_group(leaf, inode_item, offset);
165         btrfs_mark_buffer_dirty(leaf);
166         btrfs_release_path(path);
167
168         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
169         key.offset = offset;
170         key.type = 0;
171         ret = btrfs_insert_empty_item(trans, root, path, &key,
172                                       sizeof(struct btrfs_free_space_header));
173         if (ret < 0) {
174                 btrfs_release_path(path);
175                 return ret;
176         }
177
178         leaf = path->nodes[0];
179         header = btrfs_item_ptr(leaf, path->slots[0],
180                                 struct btrfs_free_space_header);
181         memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
182         btrfs_set_free_space_key(leaf, header, &disk_key);
183         btrfs_mark_buffer_dirty(leaf);
184         btrfs_release_path(path);
185
186         return 0;
187 }
188
189 int create_free_space_inode(struct btrfs_trans_handle *trans,
190                             struct btrfs_block_group_cache *block_group,
191                             struct btrfs_path *path)
192 {
193         int ret;
194         u64 ino;
195
196         ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
197         if (ret < 0)
198                 return ret;
199
200         return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
201                                          ino, block_group->key.objectid);
202 }
203
204 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
205                                        struct btrfs_block_rsv *rsv)
206 {
207         u64 needed_bytes;
208         int ret;
209
210         /* 1 for slack space, 1 for updating the inode */
211         needed_bytes = btrfs_calc_trunc_metadata_size(fs_info, 1) +
212                 btrfs_calc_trans_metadata_size(fs_info, 1);
213
214         spin_lock(&rsv->lock);
215         if (rsv->reserved < needed_bytes)
216                 ret = -ENOSPC;
217         else
218                 ret = 0;
219         spin_unlock(&rsv->lock);
220         return ret;
221 }
222
223 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
224                                     struct btrfs_block_group_cache *block_group,
225                                     struct inode *inode)
226 {
227         struct btrfs_root *root = BTRFS_I(inode)->root;
228         int ret = 0;
229         bool locked = false;
230
231         if (block_group) {
232                 struct btrfs_path *path = btrfs_alloc_path();
233
234                 if (!path) {
235                         ret = -ENOMEM;
236                         goto fail;
237                 }
238                 locked = true;
239                 mutex_lock(&trans->transaction->cache_write_mutex);
240                 if (!list_empty(&block_group->io_list)) {
241                         list_del_init(&block_group->io_list);
242
243                         btrfs_wait_cache_io(trans, block_group, path);
244                         btrfs_put_block_group(block_group);
245                 }
246
247                 /*
248                  * now that we've truncated the cache away, its no longer
249                  * setup or written
250                  */
251                 spin_lock(&block_group->lock);
252                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
253                 spin_unlock(&block_group->lock);
254                 btrfs_free_path(path);
255         }
256
257         btrfs_i_size_write(BTRFS_I(inode), 0);
258         truncate_pagecache(inode, 0);
259
260         /*
261          * We skip the throttling logic for free space cache inodes, so we don't
262          * need to check for -EAGAIN.
263          */
264         ret = btrfs_truncate_inode_items(trans, root, inode,
265                                          0, BTRFS_EXTENT_DATA_KEY);
266         if (ret)
267                 goto fail;
268
269         ret = btrfs_update_inode(trans, root, inode);
270
271 fail:
272         if (locked)
273                 mutex_unlock(&trans->transaction->cache_write_mutex);
274         if (ret)
275                 btrfs_abort_transaction(trans, ret);
276
277         return ret;
278 }
279
280 static void readahead_cache(struct inode *inode)
281 {
282         struct file_ra_state *ra;
283         unsigned long last_index;
284
285         ra = kzalloc(sizeof(*ra), GFP_NOFS);
286         if (!ra)
287                 return;
288
289         file_ra_state_init(ra, inode->i_mapping);
290         last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
291
292         page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
293
294         kfree(ra);
295 }
296
297 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
298                        int write)
299 {
300         int num_pages;
301         int check_crcs = 0;
302
303         num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
304
305         if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
306                 check_crcs = 1;
307
308         /* Make sure we can fit our crcs and generation into the first page */
309         if (write && check_crcs &&
310             (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
311                 return -ENOSPC;
312
313         memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
314
315         io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
316         if (!io_ctl->pages)
317                 return -ENOMEM;
318
319         io_ctl->num_pages = num_pages;
320         io_ctl->fs_info = btrfs_sb(inode->i_sb);
321         io_ctl->check_crcs = check_crcs;
322         io_ctl->inode = inode;
323
324         return 0;
325 }
326 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
327
328 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
329 {
330         kfree(io_ctl->pages);
331         io_ctl->pages = NULL;
332 }
333
334 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
335 {
336         if (io_ctl->cur) {
337                 io_ctl->cur = NULL;
338                 io_ctl->orig = NULL;
339         }
340 }
341
342 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
343 {
344         ASSERT(io_ctl->index < io_ctl->num_pages);
345         io_ctl->page = io_ctl->pages[io_ctl->index++];
346         io_ctl->cur = page_address(io_ctl->page);
347         io_ctl->orig = io_ctl->cur;
348         io_ctl->size = PAGE_SIZE;
349         if (clear)
350                 clear_page(io_ctl->cur);
351 }
352
353 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
354 {
355         int i;
356
357         io_ctl_unmap_page(io_ctl);
358
359         for (i = 0; i < io_ctl->num_pages; i++) {
360                 if (io_ctl->pages[i]) {
361                         ClearPageChecked(io_ctl->pages[i]);
362                         unlock_page(io_ctl->pages[i]);
363                         put_page(io_ctl->pages[i]);
364                 }
365         }
366 }
367
368 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
369                                 int uptodate)
370 {
371         struct page *page;
372         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
373         int i;
374
375         for (i = 0; i < io_ctl->num_pages; i++) {
376                 page = find_or_create_page(inode->i_mapping, i, mask);
377                 if (!page) {
378                         io_ctl_drop_pages(io_ctl);
379                         return -ENOMEM;
380                 }
381                 io_ctl->pages[i] = page;
382                 if (uptodate && !PageUptodate(page)) {
383                         btrfs_readpage(NULL, page);
384                         lock_page(page);
385                         if (!PageUptodate(page)) {
386                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
387                                            "error reading free space cache");
388                                 io_ctl_drop_pages(io_ctl);
389                                 return -EIO;
390                         }
391                 }
392         }
393
394         for (i = 0; i < io_ctl->num_pages; i++) {
395                 clear_page_dirty_for_io(io_ctl->pages[i]);
396                 set_page_extent_mapped(io_ctl->pages[i]);
397         }
398
399         return 0;
400 }
401
402 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
403 {
404         __le64 *val;
405
406         io_ctl_map_page(io_ctl, 1);
407
408         /*
409          * Skip the csum areas.  If we don't check crcs then we just have a
410          * 64bit chunk at the front of the first page.
411          */
412         if (io_ctl->check_crcs) {
413                 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
414                 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
415         } else {
416                 io_ctl->cur += sizeof(u64);
417                 io_ctl->size -= sizeof(u64) * 2;
418         }
419
420         val = io_ctl->cur;
421         *val = cpu_to_le64(generation);
422         io_ctl->cur += sizeof(u64);
423 }
424
425 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
426 {
427         __le64 *gen;
428
429         /*
430          * Skip the crc area.  If we don't check crcs then we just have a 64bit
431          * chunk at the front of the first page.
432          */
433         if (io_ctl->check_crcs) {
434                 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
435                 io_ctl->size -= sizeof(u64) +
436                         (sizeof(u32) * io_ctl->num_pages);
437         } else {
438                 io_ctl->cur += sizeof(u64);
439                 io_ctl->size -= sizeof(u64) * 2;
440         }
441
442         gen = io_ctl->cur;
443         if (le64_to_cpu(*gen) != generation) {
444                 btrfs_err_rl(io_ctl->fs_info,
445                         "space cache generation (%llu) does not match inode (%llu)",
446                                 *gen, generation);
447                 io_ctl_unmap_page(io_ctl);
448                 return -EIO;
449         }
450         io_ctl->cur += sizeof(u64);
451         return 0;
452 }
453
454 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
455 {
456         u32 *tmp;
457         u32 crc = ~(u32)0;
458         unsigned offset = 0;
459
460         if (!io_ctl->check_crcs) {
461                 io_ctl_unmap_page(io_ctl);
462                 return;
463         }
464
465         if (index == 0)
466                 offset = sizeof(u32) * io_ctl->num_pages;
467
468         crc = btrfs_csum_data(io_ctl->orig + offset, crc,
469                               PAGE_SIZE - offset);
470         btrfs_csum_final(crc, (u8 *)&crc);
471         io_ctl_unmap_page(io_ctl);
472         tmp = page_address(io_ctl->pages[0]);
473         tmp += index;
474         *tmp = crc;
475 }
476
477 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
478 {
479         u32 *tmp, val;
480         u32 crc = ~(u32)0;
481         unsigned offset = 0;
482
483         if (!io_ctl->check_crcs) {
484                 io_ctl_map_page(io_ctl, 0);
485                 return 0;
486         }
487
488         if (index == 0)
489                 offset = sizeof(u32) * io_ctl->num_pages;
490
491         tmp = page_address(io_ctl->pages[0]);
492         tmp += index;
493         val = *tmp;
494
495         io_ctl_map_page(io_ctl, 0);
496         crc = btrfs_csum_data(io_ctl->orig + offset, crc,
497                               PAGE_SIZE - offset);
498         btrfs_csum_final(crc, (u8 *)&crc);
499         if (val != crc) {
500                 btrfs_err_rl(io_ctl->fs_info,
501                         "csum mismatch on free space cache");
502                 io_ctl_unmap_page(io_ctl);
503                 return -EIO;
504         }
505
506         return 0;
507 }
508
509 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
510                             void *bitmap)
511 {
512         struct btrfs_free_space_entry *entry;
513
514         if (!io_ctl->cur)
515                 return -ENOSPC;
516
517         entry = io_ctl->cur;
518         entry->offset = cpu_to_le64(offset);
519         entry->bytes = cpu_to_le64(bytes);
520         entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
521                 BTRFS_FREE_SPACE_EXTENT;
522         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
523         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
524
525         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
526                 return 0;
527
528         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
529
530         /* No more pages to map */
531         if (io_ctl->index >= io_ctl->num_pages)
532                 return 0;
533
534         /* map the next page */
535         io_ctl_map_page(io_ctl, 1);
536         return 0;
537 }
538
539 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
540 {
541         if (!io_ctl->cur)
542                 return -ENOSPC;
543
544         /*
545          * If we aren't at the start of the current page, unmap this one and
546          * map the next one if there is any left.
547          */
548         if (io_ctl->cur != io_ctl->orig) {
549                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
550                 if (io_ctl->index >= io_ctl->num_pages)
551                         return -ENOSPC;
552                 io_ctl_map_page(io_ctl, 0);
553         }
554
555         copy_page(io_ctl->cur, bitmap);
556         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
557         if (io_ctl->index < io_ctl->num_pages)
558                 io_ctl_map_page(io_ctl, 0);
559         return 0;
560 }
561
562 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
563 {
564         /*
565          * If we're not on the boundary we know we've modified the page and we
566          * need to crc the page.
567          */
568         if (io_ctl->cur != io_ctl->orig)
569                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
570         else
571                 io_ctl_unmap_page(io_ctl);
572
573         while (io_ctl->index < io_ctl->num_pages) {
574                 io_ctl_map_page(io_ctl, 1);
575                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
576         }
577 }
578
579 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
580                             struct btrfs_free_space *entry, u8 *type)
581 {
582         struct btrfs_free_space_entry *e;
583         int ret;
584
585         if (!io_ctl->cur) {
586                 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
587                 if (ret)
588                         return ret;
589         }
590
591         e = io_ctl->cur;
592         entry->offset = le64_to_cpu(e->offset);
593         entry->bytes = le64_to_cpu(e->bytes);
594         *type = e->type;
595         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
596         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
597
598         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
599                 return 0;
600
601         io_ctl_unmap_page(io_ctl);
602
603         return 0;
604 }
605
606 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
607                               struct btrfs_free_space *entry)
608 {
609         int ret;
610
611         ret = io_ctl_check_crc(io_ctl, io_ctl->index);
612         if (ret)
613                 return ret;
614
615         copy_page(entry->bitmap, io_ctl->cur);
616         io_ctl_unmap_page(io_ctl);
617
618         return 0;
619 }
620
621 /*
622  * Since we attach pinned extents after the fact we can have contiguous sections
623  * of free space that are split up in entries.  This poses a problem with the
624  * tree logging stuff since it could have allocated across what appears to be 2
625  * entries since we would have merged the entries when adding the pinned extents
626  * back to the free space cache.  So run through the space cache that we just
627  * loaded and merge contiguous entries.  This will make the log replay stuff not
628  * blow up and it will make for nicer allocator behavior.
629  */
630 static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
631 {
632         struct btrfs_free_space *e, *prev = NULL;
633         struct rb_node *n;
634
635 again:
636         spin_lock(&ctl->tree_lock);
637         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
638                 e = rb_entry(n, struct btrfs_free_space, offset_index);
639                 if (!prev)
640                         goto next;
641                 if (e->bitmap || prev->bitmap)
642                         goto next;
643                 if (prev->offset + prev->bytes == e->offset) {
644                         unlink_free_space(ctl, prev);
645                         unlink_free_space(ctl, e);
646                         prev->bytes += e->bytes;
647                         kmem_cache_free(btrfs_free_space_cachep, e);
648                         link_free_space(ctl, prev);
649                         prev = NULL;
650                         spin_unlock(&ctl->tree_lock);
651                         goto again;
652                 }
653 next:
654                 prev = e;
655         }
656         spin_unlock(&ctl->tree_lock);
657 }
658
659 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
660                                    struct btrfs_free_space_ctl *ctl,
661                                    struct btrfs_path *path, u64 offset)
662 {
663         struct btrfs_fs_info *fs_info = root->fs_info;
664         struct btrfs_free_space_header *header;
665         struct extent_buffer *leaf;
666         struct btrfs_io_ctl io_ctl;
667         struct btrfs_key key;
668         struct btrfs_free_space *e, *n;
669         LIST_HEAD(bitmaps);
670         u64 num_entries;
671         u64 num_bitmaps;
672         u64 generation;
673         u8 type;
674         int ret = 0;
675
676         /* Nothing in the space cache, goodbye */
677         if (!i_size_read(inode))
678                 return 0;
679
680         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
681         key.offset = offset;
682         key.type = 0;
683
684         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
685         if (ret < 0)
686                 return 0;
687         else if (ret > 0) {
688                 btrfs_release_path(path);
689                 return 0;
690         }
691
692         ret = -1;
693
694         leaf = path->nodes[0];
695         header = btrfs_item_ptr(leaf, path->slots[0],
696                                 struct btrfs_free_space_header);
697         num_entries = btrfs_free_space_entries(leaf, header);
698         num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
699         generation = btrfs_free_space_generation(leaf, header);
700         btrfs_release_path(path);
701
702         if (!BTRFS_I(inode)->generation) {
703                 btrfs_info(fs_info,
704                            "the free space cache file (%llu) is invalid, skip it",
705                            offset);
706                 return 0;
707         }
708
709         if (BTRFS_I(inode)->generation != generation) {
710                 btrfs_err(fs_info,
711                           "free space inode generation (%llu) did not match free space cache generation (%llu)",
712                           BTRFS_I(inode)->generation, generation);
713                 return 0;
714         }
715
716         if (!num_entries)
717                 return 0;
718
719         ret = io_ctl_init(&io_ctl, inode, 0);
720         if (ret)
721                 return ret;
722
723         readahead_cache(inode);
724
725         ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
726         if (ret)
727                 goto out;
728
729         ret = io_ctl_check_crc(&io_ctl, 0);
730         if (ret)
731                 goto free_cache;
732
733         ret = io_ctl_check_generation(&io_ctl, generation);
734         if (ret)
735                 goto free_cache;
736
737         while (num_entries) {
738                 e = kmem_cache_zalloc(btrfs_free_space_cachep,
739                                       GFP_NOFS);
740                 if (!e)
741                         goto free_cache;
742
743                 ret = io_ctl_read_entry(&io_ctl, e, &type);
744                 if (ret) {
745                         kmem_cache_free(btrfs_free_space_cachep, e);
746                         goto free_cache;
747                 }
748
749                 if (!e->bytes) {
750                         kmem_cache_free(btrfs_free_space_cachep, e);
751                         goto free_cache;
752                 }
753
754                 if (type == BTRFS_FREE_SPACE_EXTENT) {
755                         spin_lock(&ctl->tree_lock);
756                         ret = link_free_space(ctl, e);
757                         spin_unlock(&ctl->tree_lock);
758                         if (ret) {
759                                 btrfs_err(fs_info,
760                                         "Duplicate entries in free space cache, dumping");
761                                 kmem_cache_free(btrfs_free_space_cachep, e);
762                                 goto free_cache;
763                         }
764                 } else {
765                         ASSERT(num_bitmaps);
766                         num_bitmaps--;
767                         e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
768                         if (!e->bitmap) {
769                                 kmem_cache_free(
770                                         btrfs_free_space_cachep, e);
771                                 goto free_cache;
772                         }
773                         spin_lock(&ctl->tree_lock);
774                         ret = link_free_space(ctl, e);
775                         ctl->total_bitmaps++;
776                         ctl->op->recalc_thresholds(ctl);
777                         spin_unlock(&ctl->tree_lock);
778                         if (ret) {
779                                 btrfs_err(fs_info,
780                                         "Duplicate entries in free space cache, dumping");
781                                 kmem_cache_free(btrfs_free_space_cachep, e);
782                                 goto free_cache;
783                         }
784                         list_add_tail(&e->list, &bitmaps);
785                 }
786
787                 num_entries--;
788         }
789
790         io_ctl_unmap_page(&io_ctl);
791
792         /*
793          * We add the bitmaps at the end of the entries in order that
794          * the bitmap entries are added to the cache.
795          */
796         list_for_each_entry_safe(e, n, &bitmaps, list) {
797                 list_del_init(&e->list);
798                 ret = io_ctl_read_bitmap(&io_ctl, e);
799                 if (ret)
800                         goto free_cache;
801         }
802
803         io_ctl_drop_pages(&io_ctl);
804         merge_space_tree(ctl);
805         ret = 1;
806 out:
807         io_ctl_free(&io_ctl);
808         return ret;
809 free_cache:
810         io_ctl_drop_pages(&io_ctl);
811         __btrfs_remove_free_space_cache(ctl);
812         goto out;
813 }
814
815 int load_free_space_cache(struct btrfs_fs_info *fs_info,
816                           struct btrfs_block_group_cache *block_group)
817 {
818         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
819         struct inode *inode;
820         struct btrfs_path *path;
821         int ret = 0;
822         bool matched;
823         u64 used = btrfs_block_group_used(&block_group->item);
824
825         /*
826          * If this block group has been marked to be cleared for one reason or
827          * another then we can't trust the on disk cache, so just return.
828          */
829         spin_lock(&block_group->lock);
830         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
831                 spin_unlock(&block_group->lock);
832                 return 0;
833         }
834         spin_unlock(&block_group->lock);
835
836         path = btrfs_alloc_path();
837         if (!path)
838                 return 0;
839         path->search_commit_root = 1;
840         path->skip_locking = 1;
841
842         /*
843          * We must pass a path with search_commit_root set to btrfs_iget in
844          * order to avoid a deadlock when allocating extents for the tree root.
845          *
846          * When we are COWing an extent buffer from the tree root, when looking
847          * for a free extent, at extent-tree.c:find_free_extent(), we can find
848          * block group without its free space cache loaded. When we find one
849          * we must load its space cache which requires reading its free space
850          * cache's inode item from the root tree. If this inode item is located
851          * in the same leaf that we started COWing before, then we end up in
852          * deadlock on the extent buffer (trying to read lock it when we
853          * previously write locked it).
854          *
855          * It's safe to read the inode item using the commit root because
856          * block groups, once loaded, stay in memory forever (until they are
857          * removed) as well as their space caches once loaded. New block groups
858          * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
859          * we will never try to read their inode item while the fs is mounted.
860          */
861         inode = lookup_free_space_inode(block_group, path);
862         if (IS_ERR(inode)) {
863                 btrfs_free_path(path);
864                 return 0;
865         }
866
867         /* We may have converted the inode and made the cache invalid. */
868         spin_lock(&block_group->lock);
869         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
870                 spin_unlock(&block_group->lock);
871                 btrfs_free_path(path);
872                 goto out;
873         }
874         spin_unlock(&block_group->lock);
875
876         ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
877                                       path, block_group->key.objectid);
878         btrfs_free_path(path);
879         if (ret <= 0)
880                 goto out;
881
882         spin_lock(&ctl->tree_lock);
883         matched = (ctl->free_space == (block_group->key.offset - used -
884                                        block_group->bytes_super));
885         spin_unlock(&ctl->tree_lock);
886
887         if (!matched) {
888                 __btrfs_remove_free_space_cache(ctl);
889                 btrfs_warn(fs_info,
890                            "block group %llu has wrong amount of free space",
891                            block_group->key.objectid);
892                 ret = -1;
893         }
894 out:
895         if (ret < 0) {
896                 /* This cache is bogus, make sure it gets cleared */
897                 spin_lock(&block_group->lock);
898                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
899                 spin_unlock(&block_group->lock);
900                 ret = 0;
901
902                 btrfs_warn(fs_info,
903                            "failed to load free space cache for block group %llu, rebuilding it now",
904                            block_group->key.objectid);
905         }
906
907         iput(inode);
908         return ret;
909 }
910
911 static noinline_for_stack
912 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
913                               struct btrfs_free_space_ctl *ctl,
914                               struct btrfs_block_group_cache *block_group,
915                               int *entries, int *bitmaps,
916                               struct list_head *bitmap_list)
917 {
918         int ret;
919         struct btrfs_free_cluster *cluster = NULL;
920         struct btrfs_free_cluster *cluster_locked = NULL;
921         struct rb_node *node = rb_first(&ctl->free_space_offset);
922         struct btrfs_trim_range *trim_entry;
923
924         /* Get the cluster for this block_group if it exists */
925         if (block_group && !list_empty(&block_group->cluster_list)) {
926                 cluster = list_entry(block_group->cluster_list.next,
927                                      struct btrfs_free_cluster,
928                                      block_group_list);
929         }
930
931         if (!node && cluster) {
932                 cluster_locked = cluster;
933                 spin_lock(&cluster_locked->lock);
934                 node = rb_first(&cluster->root);
935                 cluster = NULL;
936         }
937
938         /* Write out the extent entries */
939         while (node) {
940                 struct btrfs_free_space *e;
941
942                 e = rb_entry(node, struct btrfs_free_space, offset_index);
943                 *entries += 1;
944
945                 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
946                                        e->bitmap);
947                 if (ret)
948                         goto fail;
949
950                 if (e->bitmap) {
951                         list_add_tail(&e->list, bitmap_list);
952                         *bitmaps += 1;
953                 }
954                 node = rb_next(node);
955                 if (!node && cluster) {
956                         node = rb_first(&cluster->root);
957                         cluster_locked = cluster;
958                         spin_lock(&cluster_locked->lock);
959                         cluster = NULL;
960                 }
961         }
962         if (cluster_locked) {
963                 spin_unlock(&cluster_locked->lock);
964                 cluster_locked = NULL;
965         }
966
967         /*
968          * Make sure we don't miss any range that was removed from our rbtree
969          * because trimming is running. Otherwise after a umount+mount (or crash
970          * after committing the transaction) we would leak free space and get
971          * an inconsistent free space cache report from fsck.
972          */
973         list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
974                 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
975                                        trim_entry->bytes, NULL);
976                 if (ret)
977                         goto fail;
978                 *entries += 1;
979         }
980
981         return 0;
982 fail:
983         if (cluster_locked)
984                 spin_unlock(&cluster_locked->lock);
985         return -ENOSPC;
986 }
987
988 static noinline_for_stack int
989 update_cache_item(struct btrfs_trans_handle *trans,
990                   struct btrfs_root *root,
991                   struct inode *inode,
992                   struct btrfs_path *path, u64 offset,
993                   int entries, int bitmaps)
994 {
995         struct btrfs_key key;
996         struct btrfs_free_space_header *header;
997         struct extent_buffer *leaf;
998         int ret;
999
1000         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1001         key.offset = offset;
1002         key.type = 0;
1003
1004         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1005         if (ret < 0) {
1006                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1007                                  EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
1008                 goto fail;
1009         }
1010         leaf = path->nodes[0];
1011         if (ret > 0) {
1012                 struct btrfs_key found_key;
1013                 ASSERT(path->slots[0]);
1014                 path->slots[0]--;
1015                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1016                 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1017                     found_key.offset != offset) {
1018                         clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1019                                          inode->i_size - 1,
1020                                          EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
1021                                          NULL);
1022                         btrfs_release_path(path);
1023                         goto fail;
1024                 }
1025         }
1026
1027         BTRFS_I(inode)->generation = trans->transid;
1028         header = btrfs_item_ptr(leaf, path->slots[0],
1029                                 struct btrfs_free_space_header);
1030         btrfs_set_free_space_entries(leaf, header, entries);
1031         btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1032         btrfs_set_free_space_generation(leaf, header, trans->transid);
1033         btrfs_mark_buffer_dirty(leaf);
1034         btrfs_release_path(path);
1035
1036         return 0;
1037
1038 fail:
1039         return -1;
1040 }
1041
1042 static noinline_for_stack int
1043 write_pinned_extent_entries(struct btrfs_fs_info *fs_info,
1044                             struct btrfs_block_group_cache *block_group,
1045                             struct btrfs_io_ctl *io_ctl,
1046                             int *entries)
1047 {
1048         u64 start, extent_start, extent_end, len;
1049         struct extent_io_tree *unpin = NULL;
1050         int ret;
1051
1052         if (!block_group)
1053                 return 0;
1054
1055         /*
1056          * We want to add any pinned extents to our free space cache
1057          * so we don't leak the space
1058          *
1059          * We shouldn't have switched the pinned extents yet so this is the
1060          * right one
1061          */
1062         unpin = fs_info->pinned_extents;
1063
1064         start = block_group->key.objectid;
1065
1066         while (start < block_group->key.objectid + block_group->key.offset) {
1067                 ret = find_first_extent_bit(unpin, start,
1068                                             &extent_start, &extent_end,
1069                                             EXTENT_DIRTY, NULL);
1070                 if (ret)
1071                         return 0;
1072
1073                 /* This pinned extent is out of our range */
1074                 if (extent_start >= block_group->key.objectid +
1075                     block_group->key.offset)
1076                         return 0;
1077
1078                 extent_start = max(extent_start, start);
1079                 extent_end = min(block_group->key.objectid +
1080                                  block_group->key.offset, extent_end + 1);
1081                 len = extent_end - extent_start;
1082
1083                 *entries += 1;
1084                 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1085                 if (ret)
1086                         return -ENOSPC;
1087
1088                 start = extent_end;
1089         }
1090
1091         return 0;
1092 }
1093
1094 static noinline_for_stack int
1095 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1096 {
1097         struct btrfs_free_space *entry, *next;
1098         int ret;
1099
1100         /* Write out the bitmaps */
1101         list_for_each_entry_safe(entry, next, bitmap_list, list) {
1102                 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1103                 if (ret)
1104                         return -ENOSPC;
1105                 list_del_init(&entry->list);
1106         }
1107
1108         return 0;
1109 }
1110
1111 static int flush_dirty_cache(struct inode *inode)
1112 {
1113         int ret;
1114
1115         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1116         if (ret)
1117                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1118                                  EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
1119
1120         return ret;
1121 }
1122
1123 static void noinline_for_stack
1124 cleanup_bitmap_list(struct list_head *bitmap_list)
1125 {
1126         struct btrfs_free_space *entry, *next;
1127
1128         list_for_each_entry_safe(entry, next, bitmap_list, list)
1129                 list_del_init(&entry->list);
1130 }
1131
1132 static void noinline_for_stack
1133 cleanup_write_cache_enospc(struct inode *inode,
1134                            struct btrfs_io_ctl *io_ctl,
1135                            struct extent_state **cached_state)
1136 {
1137         io_ctl_drop_pages(io_ctl);
1138         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1139                              i_size_read(inode) - 1, cached_state);
1140 }
1141
1142 static int __btrfs_wait_cache_io(struct btrfs_root *root,
1143                                  struct btrfs_trans_handle *trans,
1144                                  struct btrfs_block_group_cache *block_group,
1145                                  struct btrfs_io_ctl *io_ctl,
1146                                  struct btrfs_path *path, u64 offset)
1147 {
1148         int ret;
1149         struct inode *inode = io_ctl->inode;
1150
1151         if (!inode)
1152                 return 0;
1153
1154         /* Flush the dirty pages in the cache file. */
1155         ret = flush_dirty_cache(inode);
1156         if (ret)
1157                 goto out;
1158
1159         /* Update the cache item to tell everyone this cache file is valid. */
1160         ret = update_cache_item(trans, root, inode, path, offset,
1161                                 io_ctl->entries, io_ctl->bitmaps);
1162 out:
1163         io_ctl_free(io_ctl);
1164         if (ret) {
1165                 invalidate_inode_pages2(inode->i_mapping);
1166                 BTRFS_I(inode)->generation = 0;
1167                 if (block_group) {
1168 #ifdef DEBUG
1169                         btrfs_err(root->fs_info,
1170                                   "failed to write free space cache for block group %llu",
1171                                   block_group->key.objectid);
1172 #endif
1173                 }
1174         }
1175         btrfs_update_inode(trans, root, inode);
1176
1177         if (block_group) {
1178                 /* the dirty list is protected by the dirty_bgs_lock */
1179                 spin_lock(&trans->transaction->dirty_bgs_lock);
1180
1181                 /* the disk_cache_state is protected by the block group lock */
1182                 spin_lock(&block_group->lock);
1183
1184                 /*
1185                  * only mark this as written if we didn't get put back on
1186                  * the dirty list while waiting for IO.   Otherwise our
1187                  * cache state won't be right, and we won't get written again
1188                  */
1189                 if (!ret && list_empty(&block_group->dirty_list))
1190                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1191                 else if (ret)
1192                         block_group->disk_cache_state = BTRFS_DC_ERROR;
1193
1194                 spin_unlock(&block_group->lock);
1195                 spin_unlock(&trans->transaction->dirty_bgs_lock);
1196                 io_ctl->inode = NULL;
1197                 iput(inode);
1198         }
1199
1200         return ret;
1201
1202 }
1203
1204 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
1205                                     struct btrfs_trans_handle *trans,
1206                                     struct btrfs_io_ctl *io_ctl,
1207                                     struct btrfs_path *path)
1208 {
1209         return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
1210 }
1211
1212 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1213                         struct btrfs_block_group_cache *block_group,
1214                         struct btrfs_path *path)
1215 {
1216         return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1217                                      block_group, &block_group->io_ctl,
1218                                      path, block_group->key.objectid);
1219 }
1220
1221 /**
1222  * __btrfs_write_out_cache - write out cached info to an inode
1223  * @root - the root the inode belongs to
1224  * @ctl - the free space cache we are going to write out
1225  * @block_group - the block_group for this cache if it belongs to a block_group
1226  * @trans - the trans handle
1227  *
1228  * This function writes out a free space cache struct to disk for quick recovery
1229  * on mount.  This will return 0 if it was successful in writing the cache out,
1230  * or an errno if it was not.
1231  */
1232 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1233                                    struct btrfs_free_space_ctl *ctl,
1234                                    struct btrfs_block_group_cache *block_group,
1235                                    struct btrfs_io_ctl *io_ctl,
1236                                    struct btrfs_trans_handle *trans)
1237 {
1238         struct btrfs_fs_info *fs_info = root->fs_info;
1239         struct extent_state *cached_state = NULL;
1240         LIST_HEAD(bitmap_list);
1241         int entries = 0;
1242         int bitmaps = 0;
1243         int ret;
1244         int must_iput = 0;
1245
1246         if (!i_size_read(inode))
1247                 return -EIO;
1248
1249         WARN_ON(io_ctl->pages);
1250         ret = io_ctl_init(io_ctl, inode, 1);
1251         if (ret)
1252                 return ret;
1253
1254         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1255                 down_write(&block_group->data_rwsem);
1256                 spin_lock(&block_group->lock);
1257                 if (block_group->delalloc_bytes) {
1258                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1259                         spin_unlock(&block_group->lock);
1260                         up_write(&block_group->data_rwsem);
1261                         BTRFS_I(inode)->generation = 0;
1262                         ret = 0;
1263                         must_iput = 1;
1264                         goto out;
1265                 }
1266                 spin_unlock(&block_group->lock);
1267         }
1268
1269         /* Lock all pages first so we can lock the extent safely. */
1270         ret = io_ctl_prepare_pages(io_ctl, inode, 0);
1271         if (ret)
1272                 goto out_unlock;
1273
1274         lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1275                          &cached_state);
1276
1277         io_ctl_set_generation(io_ctl, trans->transid);
1278
1279         mutex_lock(&ctl->cache_writeout_mutex);
1280         /* Write out the extent entries in the free space cache */
1281         spin_lock(&ctl->tree_lock);
1282         ret = write_cache_extent_entries(io_ctl, ctl,
1283                                          block_group, &entries, &bitmaps,
1284                                          &bitmap_list);
1285         if (ret)
1286                 goto out_nospc_locked;
1287
1288         /*
1289          * Some spaces that are freed in the current transaction are pinned,
1290          * they will be added into free space cache after the transaction is
1291          * committed, we shouldn't lose them.
1292          *
1293          * If this changes while we are working we'll get added back to
1294          * the dirty list and redo it.  No locking needed
1295          */
1296         ret = write_pinned_extent_entries(fs_info, block_group,
1297                                           io_ctl, &entries);
1298         if (ret)
1299                 goto out_nospc_locked;
1300
1301         /*
1302          * At last, we write out all the bitmaps and keep cache_writeout_mutex
1303          * locked while doing it because a concurrent trim can be manipulating
1304          * or freeing the bitmap.
1305          */
1306         ret = write_bitmap_entries(io_ctl, &bitmap_list);
1307         spin_unlock(&ctl->tree_lock);
1308         mutex_unlock(&ctl->cache_writeout_mutex);
1309         if (ret)
1310                 goto out_nospc;
1311
1312         /* Zero out the rest of the pages just to make sure */
1313         io_ctl_zero_remaining_pages(io_ctl);
1314
1315         /* Everything is written out, now we dirty the pages in the file. */
1316         ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
1317                                 i_size_read(inode), &cached_state);
1318         if (ret)
1319                 goto out_nospc;
1320
1321         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1322                 up_write(&block_group->data_rwsem);
1323         /*
1324          * Release the pages and unlock the extent, we will flush
1325          * them out later
1326          */
1327         io_ctl_drop_pages(io_ctl);
1328
1329         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1330                              i_size_read(inode) - 1, &cached_state);
1331
1332         /*
1333          * at this point the pages are under IO and we're happy,
1334          * The caller is responsible for waiting on them and updating the
1335          * the cache and the inode
1336          */
1337         io_ctl->entries = entries;
1338         io_ctl->bitmaps = bitmaps;
1339
1340         ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1341         if (ret)
1342                 goto out;
1343
1344         return 0;
1345
1346 out:
1347         io_ctl->inode = NULL;
1348         io_ctl_free(io_ctl);
1349         if (ret) {
1350                 invalidate_inode_pages2(inode->i_mapping);
1351                 BTRFS_I(inode)->generation = 0;
1352         }
1353         btrfs_update_inode(trans, root, inode);
1354         if (must_iput)
1355                 iput(inode);
1356         return ret;
1357
1358 out_nospc_locked:
1359         cleanup_bitmap_list(&bitmap_list);
1360         spin_unlock(&ctl->tree_lock);
1361         mutex_unlock(&ctl->cache_writeout_mutex);
1362
1363 out_nospc:
1364         cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1365
1366 out_unlock:
1367         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1368                 up_write(&block_group->data_rwsem);
1369
1370         goto out;
1371 }
1372
1373 int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1374                           struct btrfs_block_group_cache *block_group,
1375                           struct btrfs_path *path)
1376 {
1377         struct btrfs_fs_info *fs_info = trans->fs_info;
1378         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1379         struct inode *inode;
1380         int ret = 0;
1381
1382         spin_lock(&block_group->lock);
1383         if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1384                 spin_unlock(&block_group->lock);
1385                 return 0;
1386         }
1387         spin_unlock(&block_group->lock);
1388
1389         inode = lookup_free_space_inode(block_group, path);
1390         if (IS_ERR(inode))
1391                 return 0;
1392
1393         ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1394                                 block_group, &block_group->io_ctl, trans);
1395         if (ret) {
1396 #ifdef DEBUG
1397                 btrfs_err(fs_info,
1398                           "failed to write free space cache for block group %llu",
1399                           block_group->key.objectid);
1400 #endif
1401                 spin_lock(&block_group->lock);
1402                 block_group->disk_cache_state = BTRFS_DC_ERROR;
1403                 spin_unlock(&block_group->lock);
1404
1405                 block_group->io_ctl.inode = NULL;
1406                 iput(inode);
1407         }
1408
1409         /*
1410          * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1411          * to wait for IO and put the inode
1412          */
1413
1414         return ret;
1415 }
1416
1417 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1418                                           u64 offset)
1419 {
1420         ASSERT(offset >= bitmap_start);
1421         offset -= bitmap_start;
1422         return (unsigned long)(div_u64(offset, unit));
1423 }
1424
1425 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1426 {
1427         return (unsigned long)(div_u64(bytes, unit));
1428 }
1429
1430 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1431                                    u64 offset)
1432 {
1433         u64 bitmap_start;
1434         u64 bytes_per_bitmap;
1435
1436         bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1437         bitmap_start = offset - ctl->start;
1438         bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1439         bitmap_start *= bytes_per_bitmap;
1440         bitmap_start += ctl->start;
1441
1442         return bitmap_start;
1443 }
1444
1445 static int tree_insert_offset(struct rb_root *root, u64 offset,
1446                               struct rb_node *node, int bitmap)
1447 {
1448         struct rb_node **p = &root->rb_node;
1449         struct rb_node *parent = NULL;
1450         struct btrfs_free_space *info;
1451
1452         while (*p) {
1453                 parent = *p;
1454                 info = rb_entry(parent, struct btrfs_free_space, offset_index);
1455
1456                 if (offset < info->offset) {
1457                         p = &(*p)->rb_left;
1458                 } else if (offset > info->offset) {
1459                         p = &(*p)->rb_right;
1460                 } else {
1461                         /*
1462                          * we could have a bitmap entry and an extent entry
1463                          * share the same offset.  If this is the case, we want
1464                          * the extent entry to always be found first if we do a
1465                          * linear search through the tree, since we want to have
1466                          * the quickest allocation time, and allocating from an
1467                          * extent is faster than allocating from a bitmap.  So
1468                          * if we're inserting a bitmap and we find an entry at
1469                          * this offset, we want to go right, or after this entry
1470                          * logically.  If we are inserting an extent and we've
1471                          * found a bitmap, we want to go left, or before
1472                          * logically.
1473                          */
1474                         if (bitmap) {
1475                                 if (info->bitmap) {
1476                                         WARN_ON_ONCE(1);
1477                                         return -EEXIST;
1478                                 }
1479                                 p = &(*p)->rb_right;
1480                         } else {
1481                                 if (!info->bitmap) {
1482                                         WARN_ON_ONCE(1);
1483                                         return -EEXIST;
1484                                 }
1485                                 p = &(*p)->rb_left;
1486                         }
1487                 }
1488         }
1489
1490         rb_link_node(node, parent, p);
1491         rb_insert_color(node, root);
1492
1493         return 0;
1494 }
1495
1496 /*
1497  * searches the tree for the given offset.
1498  *
1499  * fuzzy - If this is set, then we are trying to make an allocation, and we just
1500  * want a section that has at least bytes size and comes at or after the given
1501  * offset.
1502  */
1503 static struct btrfs_free_space *
1504 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1505                    u64 offset, int bitmap_only, int fuzzy)
1506 {
1507         struct rb_node *n = ctl->free_space_offset.rb_node;
1508         struct btrfs_free_space *entry, *prev = NULL;
1509
1510         /* find entry that is closest to the 'offset' */
1511         while (1) {
1512                 if (!n) {
1513                         entry = NULL;
1514                         break;
1515                 }
1516
1517                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1518                 prev = entry;
1519
1520                 if (offset < entry->offset)
1521                         n = n->rb_left;
1522                 else if (offset > entry->offset)
1523                         n = n->rb_right;
1524                 else
1525                         break;
1526         }
1527
1528         if (bitmap_only) {
1529                 if (!entry)
1530                         return NULL;
1531                 if (entry->bitmap)
1532                         return entry;
1533
1534                 /*
1535                  * bitmap entry and extent entry may share same offset,
1536                  * in that case, bitmap entry comes after extent entry.
1537                  */
1538                 n = rb_next(n);
1539                 if (!n)
1540                         return NULL;
1541                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1542                 if (entry->offset != offset)
1543                         return NULL;
1544
1545                 WARN_ON(!entry->bitmap);
1546                 return entry;
1547         } else if (entry) {
1548                 if (entry->bitmap) {
1549                         /*
1550                          * if previous extent entry covers the offset,
1551                          * we should return it instead of the bitmap entry
1552                          */
1553                         n = rb_prev(&entry->offset_index);
1554                         if (n) {
1555                                 prev = rb_entry(n, struct btrfs_free_space,
1556                                                 offset_index);
1557                                 if (!prev->bitmap &&
1558                                     prev->offset + prev->bytes > offset)
1559                                         entry = prev;
1560                         }
1561                 }
1562                 return entry;
1563         }
1564
1565         if (!prev)
1566                 return NULL;
1567
1568         /* find last entry before the 'offset' */
1569         entry = prev;
1570         if (entry->offset > offset) {
1571                 n = rb_prev(&entry->offset_index);
1572                 if (n) {
1573                         entry = rb_entry(n, struct btrfs_free_space,
1574                                         offset_index);
1575                         ASSERT(entry->offset <= offset);
1576                 } else {
1577                         if (fuzzy)
1578                                 return entry;
1579                         else
1580                                 return NULL;
1581                 }
1582         }
1583
1584         if (entry->bitmap) {
1585                 n = rb_prev(&entry->offset_index);
1586                 if (n) {
1587                         prev = rb_entry(n, struct btrfs_free_space,
1588                                         offset_index);
1589                         if (!prev->bitmap &&
1590                             prev->offset + prev->bytes > offset)
1591                                 return prev;
1592                 }
1593                 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1594                         return entry;
1595         } else if (entry->offset + entry->bytes > offset)
1596                 return entry;
1597
1598         if (!fuzzy)
1599                 return NULL;
1600
1601         while (1) {
1602                 if (entry->bitmap) {
1603                         if (entry->offset + BITS_PER_BITMAP *
1604                             ctl->unit > offset)
1605                                 break;
1606                 } else {
1607                         if (entry->offset + entry->bytes > offset)
1608                                 break;
1609                 }
1610
1611                 n = rb_next(&entry->offset_index);
1612                 if (!n)
1613                         return NULL;
1614                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1615         }
1616         return entry;
1617 }
1618
1619 static inline void
1620 __unlink_free_space(struct btrfs_free_space_ctl *ctl,
1621                     struct btrfs_free_space *info)
1622 {
1623         rb_erase(&info->offset_index, &ctl->free_space_offset);
1624         ctl->free_extents--;
1625 }
1626
1627 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1628                               struct btrfs_free_space *info)
1629 {
1630         __unlink_free_space(ctl, info);
1631         ctl->free_space -= info->bytes;
1632 }
1633
1634 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1635                            struct btrfs_free_space *info)
1636 {
1637         int ret = 0;
1638
1639         ASSERT(info->bytes || info->bitmap);
1640         ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1641                                  &info->offset_index, (info->bitmap != NULL));
1642         if (ret)
1643                 return ret;
1644
1645         ctl->free_space += info->bytes;
1646         ctl->free_extents++;
1647         return ret;
1648 }
1649
1650 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1651 {
1652         struct btrfs_block_group_cache *block_group = ctl->private;
1653         u64 max_bytes;
1654         u64 bitmap_bytes;
1655         u64 extent_bytes;
1656         u64 size = block_group->key.offset;
1657         u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1658         u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1659
1660         max_bitmaps = max_t(u64, max_bitmaps, 1);
1661
1662         ASSERT(ctl->total_bitmaps <= max_bitmaps);
1663
1664         /*
1665          * The goal is to keep the total amount of memory used per 1gb of space
1666          * at or below 32k, so we need to adjust how much memory we allow to be
1667          * used by extent based free space tracking
1668          */
1669         if (size < SZ_1G)
1670                 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1671         else
1672                 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
1673
1674         /*
1675          * we want to account for 1 more bitmap than what we have so we can make
1676          * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1677          * we add more bitmaps.
1678          */
1679         bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
1680
1681         if (bitmap_bytes >= max_bytes) {
1682                 ctl->extents_thresh = 0;
1683                 return;
1684         }
1685
1686         /*
1687          * we want the extent entry threshold to always be at most 1/2 the max
1688          * bytes we can have, or whatever is less than that.
1689          */
1690         extent_bytes = max_bytes - bitmap_bytes;
1691         extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1692
1693         ctl->extents_thresh =
1694                 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
1695 }
1696
1697 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1698                                        struct btrfs_free_space *info,
1699                                        u64 offset, u64 bytes)
1700 {
1701         unsigned long start, count;
1702
1703         start = offset_to_bit(info->offset, ctl->unit, offset);
1704         count = bytes_to_bits(bytes, ctl->unit);
1705         ASSERT(start + count <= BITS_PER_BITMAP);
1706
1707         bitmap_clear(info->bitmap, start, count);
1708
1709         info->bytes -= bytes;
1710         if (info->max_extent_size > ctl->unit)
1711                 info->max_extent_size = 0;
1712 }
1713
1714 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1715                               struct btrfs_free_space *info, u64 offset,
1716                               u64 bytes)
1717 {
1718         __bitmap_clear_bits(ctl, info, offset, bytes);
1719         ctl->free_space -= bytes;
1720 }
1721
1722 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1723                             struct btrfs_free_space *info, u64 offset,
1724                             u64 bytes)
1725 {
1726         unsigned long start, count;
1727
1728         start = offset_to_bit(info->offset, ctl->unit, offset);
1729         count = bytes_to_bits(bytes, ctl->unit);
1730         ASSERT(start + count <= BITS_PER_BITMAP);
1731
1732         bitmap_set(info->bitmap, start, count);
1733
1734         info->bytes += bytes;
1735         ctl->free_space += bytes;
1736 }
1737
1738 /*
1739  * If we can not find suitable extent, we will use bytes to record
1740  * the size of the max extent.
1741  */
1742 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1743                          struct btrfs_free_space *bitmap_info, u64 *offset,
1744                          u64 *bytes, bool for_alloc)
1745 {
1746         unsigned long found_bits = 0;
1747         unsigned long max_bits = 0;
1748         unsigned long bits, i;
1749         unsigned long next_zero;
1750         unsigned long extent_bits;
1751
1752         /*
1753          * Skip searching the bitmap if we don't have a contiguous section that
1754          * is large enough for this allocation.
1755          */
1756         if (for_alloc &&
1757             bitmap_info->max_extent_size &&
1758             bitmap_info->max_extent_size < *bytes) {
1759                 *bytes = bitmap_info->max_extent_size;
1760                 return -1;
1761         }
1762
1763         i = offset_to_bit(bitmap_info->offset, ctl->unit,
1764                           max_t(u64, *offset, bitmap_info->offset));
1765         bits = bytes_to_bits(*bytes, ctl->unit);
1766
1767         for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1768                 if (for_alloc && bits == 1) {
1769                         found_bits = 1;
1770                         break;
1771                 }
1772                 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1773                                                BITS_PER_BITMAP, i);
1774                 extent_bits = next_zero - i;
1775                 if (extent_bits >= bits) {
1776                         found_bits = extent_bits;
1777                         break;
1778                 } else if (extent_bits > max_bits) {
1779                         max_bits = extent_bits;
1780                 }
1781                 i = next_zero;
1782         }
1783
1784         if (found_bits) {
1785                 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1786                 *bytes = (u64)(found_bits) * ctl->unit;
1787                 return 0;
1788         }
1789
1790         *bytes = (u64)(max_bits) * ctl->unit;
1791         bitmap_info->max_extent_size = *bytes;
1792         return -1;
1793 }
1794
1795 static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1796 {
1797         if (entry->bitmap)
1798                 return entry->max_extent_size;
1799         return entry->bytes;
1800 }
1801
1802 /* Cache the size of the max extent in bytes */
1803 static struct btrfs_free_space *
1804 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1805                 unsigned long align, u64 *max_extent_size)
1806 {
1807         struct btrfs_free_space *entry;
1808         struct rb_node *node;
1809         u64 tmp;
1810         u64 align_off;
1811         int ret;
1812
1813         if (!ctl->free_space_offset.rb_node)
1814                 goto out;
1815
1816         entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1817         if (!entry)
1818                 goto out;
1819
1820         for (node = &entry->offset_index; node; node = rb_next(node)) {
1821                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1822                 if (entry->bytes < *bytes) {
1823                         *max_extent_size = max(get_max_extent_size(entry),
1824                                                *max_extent_size);
1825                         continue;
1826                 }
1827
1828                 /* make sure the space returned is big enough
1829                  * to match our requested alignment
1830                  */
1831                 if (*bytes >= align) {
1832                         tmp = entry->offset - ctl->start + align - 1;
1833                         tmp = div64_u64(tmp, align);
1834                         tmp = tmp * align + ctl->start;
1835                         align_off = tmp - entry->offset;
1836                 } else {
1837                         align_off = 0;
1838                         tmp = entry->offset;
1839                 }
1840
1841                 if (entry->bytes < *bytes + align_off) {
1842                         *max_extent_size = max(get_max_extent_size(entry),
1843                                                *max_extent_size);
1844                         continue;
1845                 }
1846
1847                 if (entry->bitmap) {
1848                         u64 size = *bytes;
1849
1850                         ret = search_bitmap(ctl, entry, &tmp, &size, true);
1851                         if (!ret) {
1852                                 *offset = tmp;
1853                                 *bytes = size;
1854                                 return entry;
1855                         } else {
1856                                 *max_extent_size =
1857                                         max(get_max_extent_size(entry),
1858                                             *max_extent_size);
1859                         }
1860                         continue;
1861                 }
1862
1863                 *offset = tmp;
1864                 *bytes = entry->bytes - align_off;
1865                 return entry;
1866         }
1867 out:
1868         return NULL;
1869 }
1870
1871 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1872                            struct btrfs_free_space *info, u64 offset)
1873 {
1874         info->offset = offset_to_bitmap(ctl, offset);
1875         info->bytes = 0;
1876         INIT_LIST_HEAD(&info->list);
1877         link_free_space(ctl, info);
1878         ctl->total_bitmaps++;
1879
1880         ctl->op->recalc_thresholds(ctl);
1881 }
1882
1883 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1884                         struct btrfs_free_space *bitmap_info)
1885 {
1886         unlink_free_space(ctl, bitmap_info);
1887         kfree(bitmap_info->bitmap);
1888         kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1889         ctl->total_bitmaps--;
1890         ctl->op->recalc_thresholds(ctl);
1891 }
1892
1893 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1894                               struct btrfs_free_space *bitmap_info,
1895                               u64 *offset, u64 *bytes)
1896 {
1897         u64 end;
1898         u64 search_start, search_bytes;
1899         int ret;
1900
1901 again:
1902         end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1903
1904         /*
1905          * We need to search for bits in this bitmap.  We could only cover some
1906          * of the extent in this bitmap thanks to how we add space, so we need
1907          * to search for as much as it as we can and clear that amount, and then
1908          * go searching for the next bit.
1909          */
1910         search_start = *offset;
1911         search_bytes = ctl->unit;
1912         search_bytes = min(search_bytes, end - search_start + 1);
1913         ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1914                             false);
1915         if (ret < 0 || search_start != *offset)
1916                 return -EINVAL;
1917
1918         /* We may have found more bits than what we need */
1919         search_bytes = min(search_bytes, *bytes);
1920
1921         /* Cannot clear past the end of the bitmap */
1922         search_bytes = min(search_bytes, end - search_start + 1);
1923
1924         bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1925         *offset += search_bytes;
1926         *bytes -= search_bytes;
1927
1928         if (*bytes) {
1929                 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1930                 if (!bitmap_info->bytes)
1931                         free_bitmap(ctl, bitmap_info);
1932
1933                 /*
1934                  * no entry after this bitmap, but we still have bytes to
1935                  * remove, so something has gone wrong.
1936                  */
1937                 if (!next)
1938                         return -EINVAL;
1939
1940                 bitmap_info = rb_entry(next, struct btrfs_free_space,
1941                                        offset_index);
1942
1943                 /*
1944                  * if the next entry isn't a bitmap we need to return to let the
1945                  * extent stuff do its work.
1946                  */
1947                 if (!bitmap_info->bitmap)
1948                         return -EAGAIN;
1949
1950                 /*
1951                  * Ok the next item is a bitmap, but it may not actually hold
1952                  * the information for the rest of this free space stuff, so
1953                  * look for it, and if we don't find it return so we can try
1954                  * everything over again.
1955                  */
1956                 search_start = *offset;
1957                 search_bytes = ctl->unit;
1958                 ret = search_bitmap(ctl, bitmap_info, &search_start,
1959                                     &search_bytes, false);
1960                 if (ret < 0 || search_start != *offset)
1961                         return -EAGAIN;
1962
1963                 goto again;
1964         } else if (!bitmap_info->bytes)
1965                 free_bitmap(ctl, bitmap_info);
1966
1967         return 0;
1968 }
1969
1970 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1971                                struct btrfs_free_space *info, u64 offset,
1972                                u64 bytes)
1973 {
1974         u64 bytes_to_set = 0;
1975         u64 end;
1976
1977         end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1978
1979         bytes_to_set = min(end - offset, bytes);
1980
1981         bitmap_set_bits(ctl, info, offset, bytes_to_set);
1982
1983         /*
1984          * We set some bytes, we have no idea what the max extent size is
1985          * anymore.
1986          */
1987         info->max_extent_size = 0;
1988
1989         return bytes_to_set;
1990
1991 }
1992
1993 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1994                       struct btrfs_free_space *info)
1995 {
1996         struct btrfs_block_group_cache *block_group = ctl->private;
1997         struct btrfs_fs_info *fs_info = block_group->fs_info;
1998         bool forced = false;
1999
2000 #ifdef CONFIG_BTRFS_DEBUG
2001         if (btrfs_should_fragment_free_space(block_group))
2002                 forced = true;
2003 #endif
2004
2005         /*
2006          * If we are below the extents threshold then we can add this as an
2007          * extent, and don't have to deal with the bitmap
2008          */
2009         if (!forced && ctl->free_extents < ctl->extents_thresh) {
2010                 /*
2011                  * If this block group has some small extents we don't want to
2012                  * use up all of our free slots in the cache with them, we want
2013                  * to reserve them to larger extents, however if we have plenty
2014                  * of cache left then go ahead an dadd them, no sense in adding
2015                  * the overhead of a bitmap if we don't have to.
2016                  */
2017                 if (info->bytes <= fs_info->sectorsize * 4) {
2018                         if (ctl->free_extents * 2 <= ctl->extents_thresh)
2019                                 return false;
2020                 } else {
2021                         return false;
2022                 }
2023         }
2024
2025         /*
2026          * The original block groups from mkfs can be really small, like 8
2027          * megabytes, so don't bother with a bitmap for those entries.  However
2028          * some block groups can be smaller than what a bitmap would cover but
2029          * are still large enough that they could overflow the 32k memory limit,
2030          * so allow those block groups to still be allowed to have a bitmap
2031          * entry.
2032          */
2033         if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
2034                 return false;
2035
2036         return true;
2037 }
2038
2039 static const struct btrfs_free_space_op free_space_op = {
2040         .recalc_thresholds      = recalculate_thresholds,
2041         .use_bitmap             = use_bitmap,
2042 };
2043
2044 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2045                               struct btrfs_free_space *info)
2046 {
2047         struct btrfs_free_space *bitmap_info;
2048         struct btrfs_block_group_cache *block_group = NULL;
2049         int added = 0;
2050         u64 bytes, offset, bytes_added;
2051         int ret;
2052
2053         bytes = info->bytes;
2054         offset = info->offset;
2055
2056         if (!ctl->op->use_bitmap(ctl, info))
2057                 return 0;
2058
2059         if (ctl->op == &free_space_op)
2060                 block_group = ctl->private;
2061 again:
2062         /*
2063          * Since we link bitmaps right into the cluster we need to see if we
2064          * have a cluster here, and if so and it has our bitmap we need to add
2065          * the free space to that bitmap.
2066          */
2067         if (block_group && !list_empty(&block_group->cluster_list)) {
2068                 struct btrfs_free_cluster *cluster;
2069                 struct rb_node *node;
2070                 struct btrfs_free_space *entry;
2071
2072                 cluster = list_entry(block_group->cluster_list.next,
2073                                      struct btrfs_free_cluster,
2074                                      block_group_list);
2075                 spin_lock(&cluster->lock);
2076                 node = rb_first(&cluster->root);
2077                 if (!node) {
2078                         spin_unlock(&cluster->lock);
2079                         goto no_cluster_bitmap;
2080                 }
2081
2082                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2083                 if (!entry->bitmap) {
2084                         spin_unlock(&cluster->lock);
2085                         goto no_cluster_bitmap;
2086                 }
2087
2088                 if (entry->offset == offset_to_bitmap(ctl, offset)) {
2089                         bytes_added = add_bytes_to_bitmap(ctl, entry,
2090                                                           offset, bytes);
2091                         bytes -= bytes_added;
2092                         offset += bytes_added;
2093                 }
2094                 spin_unlock(&cluster->lock);
2095                 if (!bytes) {
2096                         ret = 1;
2097                         goto out;
2098                 }
2099         }
2100
2101 no_cluster_bitmap:
2102         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2103                                          1, 0);
2104         if (!bitmap_info) {
2105                 ASSERT(added == 0);
2106                 goto new_bitmap;
2107         }
2108
2109         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
2110         bytes -= bytes_added;
2111         offset += bytes_added;
2112         added = 0;
2113
2114         if (!bytes) {
2115                 ret = 1;
2116                 goto out;
2117         } else
2118                 goto again;
2119
2120 new_bitmap:
2121         if (info && info->bitmap) {
2122                 add_new_bitmap(ctl, info, offset);
2123                 added = 1;
2124                 info = NULL;
2125                 goto again;
2126         } else {
2127                 spin_unlock(&ctl->tree_lock);
2128
2129                 /* no pre-allocated info, allocate a new one */
2130                 if (!info) {
2131                         info = kmem_cache_zalloc(btrfs_free_space_cachep,
2132                                                  GFP_NOFS);
2133                         if (!info) {
2134                                 spin_lock(&ctl->tree_lock);
2135                                 ret = -ENOMEM;
2136                                 goto out;
2137                         }
2138                 }
2139
2140                 /* allocate the bitmap */
2141                 info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
2142                 spin_lock(&ctl->tree_lock);
2143                 if (!info->bitmap) {
2144                         ret = -ENOMEM;
2145                         goto out;
2146                 }
2147                 goto again;
2148         }
2149
2150 out:
2151         if (info) {
2152                 kfree(info->bitmap);
2153                 kmem_cache_free(btrfs_free_space_cachep, info);
2154         }
2155
2156         return ret;
2157 }
2158
2159 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2160                           struct btrfs_free_space *info, bool update_stat)
2161 {
2162         struct btrfs_free_space *left_info;
2163         struct btrfs_free_space *right_info;
2164         bool merged = false;
2165         u64 offset = info->offset;
2166         u64 bytes = info->bytes;
2167
2168         /*
2169          * first we want to see if there is free space adjacent to the range we
2170          * are adding, if there is remove that struct and add a new one to
2171          * cover the entire range
2172          */
2173         right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2174         if (right_info && rb_prev(&right_info->offset_index))
2175                 left_info = rb_entry(rb_prev(&right_info->offset_index),
2176                                      struct btrfs_free_space, offset_index);
2177         else
2178                 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2179
2180         if (right_info && !right_info->bitmap) {
2181                 if (update_stat)
2182                         unlink_free_space(ctl, right_info);
2183                 else
2184                         __unlink_free_space(ctl, right_info);
2185                 info->bytes += right_info->bytes;
2186                 kmem_cache_free(btrfs_free_space_cachep, right_info);
2187                 merged = true;
2188         }
2189
2190         if (left_info && !left_info->bitmap &&
2191             left_info->offset + left_info->bytes == offset) {
2192                 if (update_stat)
2193                         unlink_free_space(ctl, left_info);
2194                 else
2195                         __unlink_free_space(ctl, left_info);
2196                 info->offset = left_info->offset;
2197                 info->bytes += left_info->bytes;
2198                 kmem_cache_free(btrfs_free_space_cachep, left_info);
2199                 merged = true;
2200         }
2201
2202         return merged;
2203 }
2204
2205 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2206                                      struct btrfs_free_space *info,
2207                                      bool update_stat)
2208 {
2209         struct btrfs_free_space *bitmap;
2210         unsigned long i;
2211         unsigned long j;
2212         const u64 end = info->offset + info->bytes;
2213         const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2214         u64 bytes;
2215
2216         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2217         if (!bitmap)
2218                 return false;
2219
2220         i = offset_to_bit(bitmap->offset, ctl->unit, end);
2221         j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2222         if (j == i)
2223                 return false;
2224         bytes = (j - i) * ctl->unit;
2225         info->bytes += bytes;
2226
2227         if (update_stat)
2228                 bitmap_clear_bits(ctl, bitmap, end, bytes);
2229         else
2230                 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2231
2232         if (!bitmap->bytes)
2233                 free_bitmap(ctl, bitmap);
2234
2235         return true;
2236 }
2237
2238 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2239                                        struct btrfs_free_space *info,
2240                                        bool update_stat)
2241 {
2242         struct btrfs_free_space *bitmap;
2243         u64 bitmap_offset;
2244         unsigned long i;
2245         unsigned long j;
2246         unsigned long prev_j;
2247         u64 bytes;
2248
2249         bitmap_offset = offset_to_bitmap(ctl, info->offset);
2250         /* If we're on a boundary, try the previous logical bitmap. */
2251         if (bitmap_offset == info->offset) {
2252                 if (info->offset == 0)
2253                         return false;
2254                 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2255         }
2256
2257         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2258         if (!bitmap)
2259                 return false;
2260
2261         i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2262         j = 0;
2263         prev_j = (unsigned long)-1;
2264         for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2265                 if (j > i)
2266                         break;
2267                 prev_j = j;
2268         }
2269         if (prev_j == i)
2270                 return false;
2271
2272         if (prev_j == (unsigned long)-1)
2273                 bytes = (i + 1) * ctl->unit;
2274         else
2275                 bytes = (i - prev_j) * ctl->unit;
2276
2277         info->offset -= bytes;
2278         info->bytes += bytes;
2279
2280         if (update_stat)
2281                 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2282         else
2283                 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2284
2285         if (!bitmap->bytes)
2286                 free_bitmap(ctl, bitmap);
2287
2288         return true;
2289 }
2290
2291 /*
2292  * We prefer always to allocate from extent entries, both for clustered and
2293  * non-clustered allocation requests. So when attempting to add a new extent
2294  * entry, try to see if there's adjacent free space in bitmap entries, and if
2295  * there is, migrate that space from the bitmaps to the extent.
2296  * Like this we get better chances of satisfying space allocation requests
2297  * because we attempt to satisfy them based on a single cache entry, and never
2298  * on 2 or more entries - even if the entries represent a contiguous free space
2299  * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2300  * ends).
2301  */
2302 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2303                               struct btrfs_free_space *info,
2304                               bool update_stat)
2305 {
2306         /*
2307          * Only work with disconnected entries, as we can change their offset,
2308          * and must be extent entries.
2309          */
2310         ASSERT(!info->bitmap);
2311         ASSERT(RB_EMPTY_NODE(&info->offset_index));
2312
2313         if (ctl->total_bitmaps > 0) {
2314                 bool stole_end;
2315                 bool stole_front = false;
2316
2317                 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2318                 if (ctl->total_bitmaps > 0)
2319                         stole_front = steal_from_bitmap_to_front(ctl, info,
2320                                                                  update_stat);
2321
2322                 if (stole_end || stole_front)
2323                         try_merge_free_space(ctl, info, update_stat);
2324         }
2325 }
2326
2327 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2328                            struct btrfs_free_space_ctl *ctl,
2329                            u64 offset, u64 bytes)
2330 {
2331         struct btrfs_free_space *info;
2332         int ret = 0;
2333
2334         info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2335         if (!info)
2336                 return -ENOMEM;
2337
2338         info->offset = offset;
2339         info->bytes = bytes;
2340         RB_CLEAR_NODE(&info->offset_index);
2341
2342         spin_lock(&ctl->tree_lock);
2343
2344         if (try_merge_free_space(ctl, info, true))
2345                 goto link;
2346
2347         /*
2348          * There was no extent directly to the left or right of this new
2349          * extent then we know we're going to have to allocate a new extent, so
2350          * before we do that see if we need to drop this into a bitmap
2351          */
2352         ret = insert_into_bitmap(ctl, info);
2353         if (ret < 0) {
2354                 goto out;
2355         } else if (ret) {
2356                 ret = 0;
2357                 goto out;
2358         }
2359 link:
2360         /*
2361          * Only steal free space from adjacent bitmaps if we're sure we're not
2362          * going to add the new free space to existing bitmap entries - because
2363          * that would mean unnecessary work that would be reverted. Therefore
2364          * attempt to steal space from bitmaps if we're adding an extent entry.
2365          */
2366         steal_from_bitmap(ctl, info, true);
2367
2368         ret = link_free_space(ctl, info);
2369         if (ret)
2370                 kmem_cache_free(btrfs_free_space_cachep, info);
2371 out:
2372         spin_unlock(&ctl->tree_lock);
2373
2374         if (ret) {
2375                 btrfs_crit(fs_info, "unable to add free space :%d", ret);
2376                 ASSERT(ret != -EEXIST);
2377         }
2378
2379         return ret;
2380 }
2381
2382 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2383                             u64 offset, u64 bytes)
2384 {
2385         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2386         struct btrfs_free_space *info;
2387         int ret;
2388         bool re_search = false;
2389
2390         spin_lock(&ctl->tree_lock);
2391
2392 again:
2393         ret = 0;
2394         if (!bytes)
2395                 goto out_lock;
2396
2397         info = tree_search_offset(ctl, offset, 0, 0);
2398         if (!info) {
2399                 /*
2400                  * oops didn't find an extent that matched the space we wanted
2401                  * to remove, look for a bitmap instead
2402                  */
2403                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2404                                           1, 0);
2405                 if (!info) {
2406                         /*
2407                          * If we found a partial bit of our free space in a
2408                          * bitmap but then couldn't find the other part this may
2409                          * be a problem, so WARN about it.
2410                          */
2411                         WARN_ON(re_search);
2412                         goto out_lock;
2413                 }
2414         }
2415
2416         re_search = false;
2417         if (!info->bitmap) {
2418                 unlink_free_space(ctl, info);
2419                 if (offset == info->offset) {
2420                         u64 to_free = min(bytes, info->bytes);
2421
2422                         info->bytes -= to_free;
2423                         info->offset += to_free;
2424                         if (info->bytes) {
2425                                 ret = link_free_space(ctl, info);
2426                                 WARN_ON(ret);
2427                         } else {
2428                                 kmem_cache_free(btrfs_free_space_cachep, info);
2429                         }
2430
2431                         offset += to_free;
2432                         bytes -= to_free;
2433                         goto again;
2434                 } else {
2435                         u64 old_end = info->bytes + info->offset;
2436
2437                         info->bytes = offset - info->offset;
2438                         ret = link_free_space(ctl, info);
2439                         WARN_ON(ret);
2440                         if (ret)
2441                                 goto out_lock;
2442
2443                         /* Not enough bytes in this entry to satisfy us */
2444                         if (old_end < offset + bytes) {
2445                                 bytes -= old_end - offset;
2446                                 offset = old_end;
2447                                 goto again;
2448                         } else if (old_end == offset + bytes) {
2449                                 /* all done */
2450                                 goto out_lock;
2451                         }
2452                         spin_unlock(&ctl->tree_lock);
2453
2454                         ret = btrfs_add_free_space(block_group, offset + bytes,
2455                                                    old_end - (offset + bytes));
2456                         WARN_ON(ret);
2457                         goto out;
2458                 }
2459         }
2460
2461         ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2462         if (ret == -EAGAIN) {
2463                 re_search = true;
2464                 goto again;
2465         }
2466 out_lock:
2467         spin_unlock(&ctl->tree_lock);
2468 out:
2469         return ret;
2470 }
2471
2472 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2473                            u64 bytes)
2474 {
2475         struct btrfs_fs_info *fs_info = block_group->fs_info;
2476         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2477         struct btrfs_free_space *info;
2478         struct rb_node *n;
2479         int count = 0;
2480
2481         spin_lock(&ctl->tree_lock);
2482         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2483                 info = rb_entry(n, struct btrfs_free_space, offset_index);
2484                 if (info->bytes >= bytes && !block_group->ro)
2485                         count++;
2486                 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2487                            info->offset, info->bytes,
2488                        (info->bitmap) ? "yes" : "no");
2489         }
2490         spin_unlock(&ctl->tree_lock);
2491         btrfs_info(fs_info, "block group has cluster?: %s",
2492                list_empty(&block_group->cluster_list) ? "no" : "yes");
2493         btrfs_info(fs_info,
2494                    "%d blocks of free space at or bigger than bytes is", count);
2495 }
2496
2497 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2498 {
2499         struct btrfs_fs_info *fs_info = block_group->fs_info;
2500         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2501
2502         spin_lock_init(&ctl->tree_lock);
2503         ctl->unit = fs_info->sectorsize;
2504         ctl->start = block_group->key.objectid;
2505         ctl->private = block_group;
2506         ctl->op = &free_space_op;
2507         INIT_LIST_HEAD(&ctl->trimming_ranges);
2508         mutex_init(&ctl->cache_writeout_mutex);
2509
2510         /*
2511          * we only want to have 32k of ram per block group for keeping
2512          * track of free space, and if we pass 1/2 of that we want to
2513          * start converting things over to using bitmaps
2514          */
2515         ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2516 }
2517
2518 /*
2519  * for a given cluster, put all of its extents back into the free
2520  * space cache.  If the block group passed doesn't match the block group
2521  * pointed to by the cluster, someone else raced in and freed the
2522  * cluster already.  In that case, we just return without changing anything
2523  */
2524 static int
2525 __btrfs_return_cluster_to_free_space(
2526                              struct btrfs_block_group_cache *block_group,
2527                              struct btrfs_free_cluster *cluster)
2528 {
2529         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2530         struct btrfs_free_space *entry;
2531         struct rb_node *node;
2532
2533         spin_lock(&cluster->lock);
2534         if (cluster->block_group != block_group)
2535                 goto out;
2536
2537         cluster->block_group = NULL;
2538         cluster->window_start = 0;
2539         list_del_init(&cluster->block_group_list);
2540
2541         node = rb_first(&cluster->root);
2542         while (node) {
2543                 bool bitmap;
2544
2545                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2546                 node = rb_next(&entry->offset_index);
2547                 rb_erase(&entry->offset_index, &cluster->root);
2548                 RB_CLEAR_NODE(&entry->offset_index);
2549
2550                 bitmap = (entry->bitmap != NULL);
2551                 if (!bitmap) {
2552                         try_merge_free_space(ctl, entry, false);
2553                         steal_from_bitmap(ctl, entry, false);
2554                 }
2555                 tree_insert_offset(&ctl->free_space_offset,
2556                                    entry->offset, &entry->offset_index, bitmap);
2557         }
2558         cluster->root = RB_ROOT;
2559
2560 out:
2561         spin_unlock(&cluster->lock);
2562         btrfs_put_block_group(block_group);
2563         return 0;
2564 }
2565
2566 static void __btrfs_remove_free_space_cache_locked(
2567                                 struct btrfs_free_space_ctl *ctl)
2568 {
2569         struct btrfs_free_space *info;
2570         struct rb_node *node;
2571
2572         while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2573                 info = rb_entry(node, struct btrfs_free_space, offset_index);
2574                 if (!info->bitmap) {
2575                         unlink_free_space(ctl, info);
2576                         kmem_cache_free(btrfs_free_space_cachep, info);
2577                 } else {
2578                         free_bitmap(ctl, info);
2579                 }
2580
2581                 cond_resched_lock(&ctl->tree_lock);
2582         }
2583 }
2584
2585 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2586 {
2587         spin_lock(&ctl->tree_lock);
2588         __btrfs_remove_free_space_cache_locked(ctl);
2589         spin_unlock(&ctl->tree_lock);
2590 }
2591
2592 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
2593 {
2594         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2595         struct btrfs_free_cluster *cluster;
2596         struct list_head *head;
2597
2598         spin_lock(&ctl->tree_lock);
2599         while ((head = block_group->cluster_list.next) !=
2600                &block_group->cluster_list) {
2601                 cluster = list_entry(head, struct btrfs_free_cluster,
2602                                      block_group_list);
2603
2604                 WARN_ON(cluster->block_group != block_group);
2605                 __btrfs_return_cluster_to_free_space(block_group, cluster);
2606
2607                 cond_resched_lock(&ctl->tree_lock);
2608         }
2609         __btrfs_remove_free_space_cache_locked(ctl);
2610         spin_unlock(&ctl->tree_lock);
2611
2612 }
2613
2614 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2615                                u64 offset, u64 bytes, u64 empty_size,
2616                                u64 *max_extent_size)
2617 {
2618         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2619         struct btrfs_free_space *entry = NULL;
2620         u64 bytes_search = bytes + empty_size;
2621         u64 ret = 0;
2622         u64 align_gap = 0;
2623         u64 align_gap_len = 0;
2624
2625         spin_lock(&ctl->tree_lock);
2626         entry = find_free_space(ctl, &offset, &bytes_search,
2627                                 block_group->full_stripe_len, max_extent_size);
2628         if (!entry)
2629                 goto out;
2630
2631         ret = offset;
2632         if (entry->bitmap) {
2633                 bitmap_clear_bits(ctl, entry, offset, bytes);
2634                 if (!entry->bytes)
2635                         free_bitmap(ctl, entry);
2636         } else {
2637                 unlink_free_space(ctl, entry);
2638                 align_gap_len = offset - entry->offset;
2639                 align_gap = entry->offset;
2640
2641                 entry->offset = offset + bytes;
2642                 WARN_ON(entry->bytes < bytes + align_gap_len);
2643
2644                 entry->bytes -= bytes + align_gap_len;
2645                 if (!entry->bytes)
2646                         kmem_cache_free(btrfs_free_space_cachep, entry);
2647                 else
2648                         link_free_space(ctl, entry);
2649         }
2650 out:
2651         spin_unlock(&ctl->tree_lock);
2652
2653         if (align_gap_len)
2654                 __btrfs_add_free_space(block_group->fs_info, ctl,
2655                                        align_gap, align_gap_len);
2656         return ret;
2657 }
2658
2659 /*
2660  * given a cluster, put all of its extents back into the free space
2661  * cache.  If a block group is passed, this function will only free
2662  * a cluster that belongs to the passed block group.
2663  *
2664  * Otherwise, it'll get a reference on the block group pointed to by the
2665  * cluster and remove the cluster from it.
2666  */
2667 int btrfs_return_cluster_to_free_space(
2668                                struct btrfs_block_group_cache *block_group,
2669                                struct btrfs_free_cluster *cluster)
2670 {
2671         struct btrfs_free_space_ctl *ctl;
2672         int ret;
2673
2674         /* first, get a safe pointer to the block group */
2675         spin_lock(&cluster->lock);
2676         if (!block_group) {
2677                 block_group = cluster->block_group;
2678                 if (!block_group) {
2679                         spin_unlock(&cluster->lock);
2680                         return 0;
2681                 }
2682         } else if (cluster->block_group != block_group) {
2683                 /* someone else has already freed it don't redo their work */
2684                 spin_unlock(&cluster->lock);
2685                 return 0;
2686         }
2687         atomic_inc(&block_group->count);
2688         spin_unlock(&cluster->lock);
2689
2690         ctl = block_group->free_space_ctl;
2691
2692         /* now return any extents the cluster had on it */
2693         spin_lock(&ctl->tree_lock);
2694         ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2695         spin_unlock(&ctl->tree_lock);
2696
2697         /* finally drop our ref */
2698         btrfs_put_block_group(block_group);
2699         return ret;
2700 }
2701
2702 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2703                                    struct btrfs_free_cluster *cluster,
2704                                    struct btrfs_free_space *entry,
2705                                    u64 bytes, u64 min_start,
2706                                    u64 *max_extent_size)
2707 {
2708         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2709         int err;
2710         u64 search_start = cluster->window_start;
2711         u64 search_bytes = bytes;
2712         u64 ret = 0;
2713
2714         search_start = min_start;
2715         search_bytes = bytes;
2716
2717         err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2718         if (err) {
2719                 *max_extent_size = max(get_max_extent_size(entry),
2720                                        *max_extent_size);
2721                 return 0;
2722         }
2723
2724         ret = search_start;
2725         __bitmap_clear_bits(ctl, entry, ret, bytes);
2726
2727         return ret;
2728 }
2729
2730 /*
2731  * given a cluster, try to allocate 'bytes' from it, returns 0
2732  * if it couldn't find anything suitably large, or a logical disk offset
2733  * if things worked out
2734  */
2735 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2736                              struct btrfs_free_cluster *cluster, u64 bytes,
2737                              u64 min_start, u64 *max_extent_size)
2738 {
2739         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2740         struct btrfs_free_space *entry = NULL;
2741         struct rb_node *node;
2742         u64 ret = 0;
2743
2744         spin_lock(&cluster->lock);
2745         if (bytes > cluster->max_size)
2746                 goto out;
2747
2748         if (cluster->block_group != block_group)
2749                 goto out;
2750
2751         node = rb_first(&cluster->root);
2752         if (!node)
2753                 goto out;
2754
2755         entry = rb_entry(node, struct btrfs_free_space, offset_index);
2756         while (1) {
2757                 if (entry->bytes < bytes)
2758                         *max_extent_size = max(get_max_extent_size(entry),
2759                                                *max_extent_size);
2760
2761                 if (entry->bytes < bytes ||
2762                     (!entry->bitmap && entry->offset < min_start)) {
2763                         node = rb_next(&entry->offset_index);
2764                         if (!node)
2765                                 break;
2766                         entry = rb_entry(node, struct btrfs_free_space,
2767                                          offset_index);
2768                         continue;
2769                 }
2770
2771                 if (entry->bitmap) {
2772                         ret = btrfs_alloc_from_bitmap(block_group,
2773                                                       cluster, entry, bytes,
2774                                                       cluster->window_start,
2775                                                       max_extent_size);
2776                         if (ret == 0) {
2777                                 node = rb_next(&entry->offset_index);
2778                                 if (!node)
2779                                         break;
2780                                 entry = rb_entry(node, struct btrfs_free_space,
2781                                                  offset_index);
2782                                 continue;
2783                         }
2784                         cluster->window_start += bytes;
2785                 } else {
2786                         ret = entry->offset;
2787
2788                         entry->offset += bytes;
2789                         entry->bytes -= bytes;
2790                 }
2791
2792                 if (entry->bytes == 0)
2793                         rb_erase(&entry->offset_index, &cluster->root);
2794                 break;
2795         }
2796 out:
2797         spin_unlock(&cluster->lock);
2798
2799         if (!ret)
2800                 return 0;
2801
2802         spin_lock(&ctl->tree_lock);
2803
2804         ctl->free_space -= bytes;
2805         if (entry->bytes == 0) {
2806                 ctl->free_extents--;
2807                 if (entry->bitmap) {
2808                         kfree(entry->bitmap);
2809                         ctl->total_bitmaps--;
2810                         ctl->op->recalc_thresholds(ctl);
2811                 }
2812                 kmem_cache_free(btrfs_free_space_cachep, entry);
2813         }
2814
2815         spin_unlock(&ctl->tree_lock);
2816
2817         return ret;
2818 }
2819
2820 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2821                                 struct btrfs_free_space *entry,
2822                                 struct btrfs_free_cluster *cluster,
2823                                 u64 offset, u64 bytes,
2824                                 u64 cont1_bytes, u64 min_bytes)
2825 {
2826         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2827         unsigned long next_zero;
2828         unsigned long i;
2829         unsigned long want_bits;
2830         unsigned long min_bits;
2831         unsigned long found_bits;
2832         unsigned long max_bits = 0;
2833         unsigned long start = 0;
2834         unsigned long total_found = 0;
2835         int ret;
2836
2837         i = offset_to_bit(entry->offset, ctl->unit,
2838                           max_t(u64, offset, entry->offset));
2839         want_bits = bytes_to_bits(bytes, ctl->unit);
2840         min_bits = bytes_to_bits(min_bytes, ctl->unit);
2841
2842         /*
2843          * Don't bother looking for a cluster in this bitmap if it's heavily
2844          * fragmented.
2845          */
2846         if (entry->max_extent_size &&
2847             entry->max_extent_size < cont1_bytes)
2848                 return -ENOSPC;
2849 again:
2850         found_bits = 0;
2851         for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2852                 next_zero = find_next_zero_bit(entry->bitmap,
2853                                                BITS_PER_BITMAP, i);
2854                 if (next_zero - i >= min_bits) {
2855                         found_bits = next_zero - i;
2856                         if (found_bits > max_bits)
2857                                 max_bits = found_bits;
2858                         break;
2859                 }
2860                 if (next_zero - i > max_bits)
2861                         max_bits = next_zero - i;
2862                 i = next_zero;
2863         }
2864
2865         if (!found_bits) {
2866                 entry->max_extent_size = (u64)max_bits * ctl->unit;
2867                 return -ENOSPC;
2868         }
2869
2870         if (!total_found) {
2871                 start = i;
2872                 cluster->max_size = 0;
2873         }
2874
2875         total_found += found_bits;
2876
2877         if (cluster->max_size < found_bits * ctl->unit)
2878                 cluster->max_size = found_bits * ctl->unit;
2879
2880         if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2881                 i = next_zero + 1;
2882                 goto again;
2883         }
2884
2885         cluster->window_start = start * ctl->unit + entry->offset;
2886         rb_erase(&entry->offset_index, &ctl->free_space_offset);
2887         ret = tree_insert_offset(&cluster->root, entry->offset,
2888                                  &entry->offset_index, 1);
2889         ASSERT(!ret); /* -EEXIST; Logic error */
2890
2891         trace_btrfs_setup_cluster(block_group, cluster,
2892                                   total_found * ctl->unit, 1);
2893         return 0;
2894 }
2895
2896 /*
2897  * This searches the block group for just extents to fill the cluster with.
2898  * Try to find a cluster with at least bytes total bytes, at least one
2899  * extent of cont1_bytes, and other clusters of at least min_bytes.
2900  */
2901 static noinline int
2902 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2903                         struct btrfs_free_cluster *cluster,
2904                         struct list_head *bitmaps, u64 offset, u64 bytes,
2905                         u64 cont1_bytes, u64 min_bytes)
2906 {
2907         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2908         struct btrfs_free_space *first = NULL;
2909         struct btrfs_free_space *entry = NULL;
2910         struct btrfs_free_space *last;
2911         struct rb_node *node;
2912         u64 window_free;
2913         u64 max_extent;
2914         u64 total_size = 0;
2915
2916         entry = tree_search_offset(ctl, offset, 0, 1);
2917         if (!entry)
2918                 return -ENOSPC;
2919
2920         /*
2921          * We don't want bitmaps, so just move along until we find a normal
2922          * extent entry.
2923          */
2924         while (entry->bitmap || entry->bytes < min_bytes) {
2925                 if (entry->bitmap && list_empty(&entry->list))
2926                         list_add_tail(&entry->list, bitmaps);
2927                 node = rb_next(&entry->offset_index);
2928                 if (!node)
2929                         return -ENOSPC;
2930                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2931         }
2932
2933         window_free = entry->bytes;
2934         max_extent = entry->bytes;
2935         first = entry;
2936         last = entry;
2937
2938         for (node = rb_next(&entry->offset_index); node;
2939              node = rb_next(&entry->offset_index)) {
2940                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2941
2942                 if (entry->bitmap) {
2943                         if (list_empty(&entry->list))
2944                                 list_add_tail(&entry->list, bitmaps);
2945                         continue;
2946                 }
2947
2948                 if (entry->bytes < min_bytes)
2949                         continue;
2950
2951                 last = entry;
2952                 window_free += entry->bytes;
2953                 if (entry->bytes > max_extent)
2954                         max_extent = entry->bytes;
2955         }
2956
2957         if (window_free < bytes || max_extent < cont1_bytes)
2958                 return -ENOSPC;
2959
2960         cluster->window_start = first->offset;
2961
2962         node = &first->offset_index;
2963
2964         /*
2965          * now we've found our entries, pull them out of the free space
2966          * cache and put them into the cluster rbtree
2967          */
2968         do {
2969                 int ret;
2970
2971                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2972                 node = rb_next(&entry->offset_index);
2973                 if (entry->bitmap || entry->bytes < min_bytes)
2974                         continue;
2975
2976                 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2977                 ret = tree_insert_offset(&cluster->root, entry->offset,
2978                                          &entry->offset_index, 0);
2979                 total_size += entry->bytes;
2980                 ASSERT(!ret); /* -EEXIST; Logic error */
2981         } while (node && entry != last);
2982
2983         cluster->max_size = max_extent;
2984         trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2985         return 0;
2986 }
2987
2988 /*
2989  * This specifically looks for bitmaps that may work in the cluster, we assume
2990  * that we have already failed to find extents that will work.
2991  */
2992 static noinline int
2993 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2994                      struct btrfs_free_cluster *cluster,
2995                      struct list_head *bitmaps, u64 offset, u64 bytes,
2996                      u64 cont1_bytes, u64 min_bytes)
2997 {
2998         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2999         struct btrfs_free_space *entry = NULL;
3000         int ret = -ENOSPC;
3001         u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3002
3003         if (ctl->total_bitmaps == 0)
3004                 return -ENOSPC;
3005
3006         /*
3007          * The bitmap that covers offset won't be in the list unless offset
3008          * is just its start offset.
3009          */
3010         if (!list_empty(bitmaps))
3011                 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3012
3013         if (!entry || entry->offset != bitmap_offset) {
3014                 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3015                 if (entry && list_empty(&entry->list))
3016                         list_add(&entry->list, bitmaps);
3017         }
3018
3019         list_for_each_entry(entry, bitmaps, list) {
3020                 if (entry->bytes < bytes)
3021                         continue;
3022                 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3023                                            bytes, cont1_bytes, min_bytes);
3024                 if (!ret)
3025                         return 0;
3026         }
3027
3028         /*
3029          * The bitmaps list has all the bitmaps that record free space
3030          * starting after offset, so no more search is required.
3031          */
3032         return -ENOSPC;
3033 }
3034
3035 /*
3036  * here we try to find a cluster of blocks in a block group.  The goal
3037  * is to find at least bytes+empty_size.
3038  * We might not find them all in one contiguous area.
3039  *
3040  * returns zero and sets up cluster if things worked out, otherwise
3041  * it returns -enospc
3042  */
3043 int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info,
3044                              struct btrfs_block_group_cache *block_group,
3045                              struct btrfs_free_cluster *cluster,
3046                              u64 offset, u64 bytes, u64 empty_size)
3047 {
3048         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3049         struct btrfs_free_space *entry, *tmp;
3050         LIST_HEAD(bitmaps);
3051         u64 min_bytes;
3052         u64 cont1_bytes;
3053         int ret;
3054
3055         /*
3056          * Choose the minimum extent size we'll require for this
3057          * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3058          * For metadata, allow allocates with smaller extents.  For
3059          * data, keep it dense.
3060          */
3061         if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3062                 cont1_bytes = min_bytes = bytes + empty_size;
3063         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3064                 cont1_bytes = bytes;
3065                 min_bytes = fs_info->sectorsize;
3066         } else {
3067                 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3068                 min_bytes = fs_info->sectorsize;
3069         }
3070
3071         spin_lock(&ctl->tree_lock);
3072
3073         /*
3074          * If we know we don't have enough space to make a cluster don't even
3075          * bother doing all the work to try and find one.
3076          */
3077         if (ctl->free_space < bytes) {
3078                 spin_unlock(&ctl->tree_lock);
3079                 return -ENOSPC;
3080         }
3081
3082         spin_lock(&cluster->lock);
3083
3084         /* someone already found a cluster, hooray */
3085         if (cluster->block_group) {
3086                 ret = 0;
3087                 goto out;
3088         }
3089
3090         trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3091                                  min_bytes);
3092
3093         ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3094                                       bytes + empty_size,
3095                                       cont1_bytes, min_bytes);
3096         if (ret)
3097                 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3098                                            offset, bytes + empty_size,
3099                                            cont1_bytes, min_bytes);
3100
3101         /* Clear our temporary list */
3102         list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3103                 list_del_init(&entry->list);
3104
3105         if (!ret) {
3106                 atomic_inc(&block_group->count);
3107                 list_add_tail(&cluster->block_group_list,
3108                               &block_group->cluster_list);
3109                 cluster->block_group = block_group;
3110         } else {
3111                 trace_btrfs_failed_cluster_setup(block_group);
3112         }
3113 out:
3114         spin_unlock(&cluster->lock);
3115         spin_unlock(&ctl->tree_lock);
3116
3117         return ret;
3118 }
3119
3120 /*
3121  * simple code to zero out a cluster
3122  */
3123 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3124 {
3125         spin_lock_init(&cluster->lock);
3126         spin_lock_init(&cluster->refill_lock);
3127         cluster->root = RB_ROOT;
3128         cluster->max_size = 0;