btrfs: get fs_info from trans in btrfs_write_out_cache
fs/btrfs/free-space-cache.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2008 Red Hat.  All rights reserved.
4  */
5
6 #include <linux/pagemap.h>
7 #include <linux/sched.h>
8 #include <linux/sched/signal.h>
9 #include <linux/slab.h>
10 #include <linux/math64.h>
11 #include <linux/ratelimit.h>
12 #include <linux/error-injection.h>
13 #include <linux/sched/mm.h>
14 #include "ctree.h"
15 #include "free-space-cache.h"
16 #include "transaction.h"
17 #include "disk-io.h"
18 #include "extent_io.h"
19 #include "inode-map.h"
20 #include "volumes.h"
21
22 #define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
23 #define MAX_CACHE_BYTES_PER_GIG SZ_32K
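/*
 * Example: with 4K pages BITS_PER_BITMAP is 32768, so one bitmap entry at a
 * 4K allocation unit covers 32768 * 4K = 128MiB of a block group (the
 * figures assume 4K pages and sectors).
 */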
24
25 struct btrfs_trim_range {
26         u64 start;
27         u64 bytes;
28         struct list_head list;
29 };
30
31 static int link_free_space(struct btrfs_free_space_ctl *ctl,
32                            struct btrfs_free_space *info);
33 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
34                               struct btrfs_free_space *info);
35 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
36                              struct btrfs_trans_handle *trans,
37                              struct btrfs_io_ctl *io_ctl,
38                              struct btrfs_path *path);
39
40 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
41                                                struct btrfs_path *path,
42                                                u64 offset)
43 {
44         struct btrfs_fs_info *fs_info = root->fs_info;
45         struct btrfs_key key;
46         struct btrfs_key location;
47         struct btrfs_disk_key disk_key;
48         struct btrfs_free_space_header *header;
49         struct extent_buffer *leaf;
50         struct inode *inode = NULL;
51         unsigned nofs_flag;
52         int ret;
53
54         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
55         key.offset = offset;
56         key.type = 0;
57
58         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
59         if (ret < 0)
60                 return ERR_PTR(ret);
61         if (ret > 0) {
62                 btrfs_release_path(path);
63                 return ERR_PTR(-ENOENT);
64         }
65
66         leaf = path->nodes[0];
67         header = btrfs_item_ptr(leaf, path->slots[0],
68                                 struct btrfs_free_space_header);
69         btrfs_free_space_key(leaf, header, &disk_key);
70         btrfs_disk_key_to_cpu(&location, &disk_key);
71         btrfs_release_path(path);
72
73         /*
74          * We are often under a trans handle at this point, so we need to make
75          * sure NOFS is set to keep us from deadlocking.
76          */
77         nofs_flag = memalloc_nofs_save();
78         inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path);
79         btrfs_release_path(path);
80         memalloc_nofs_restore(nofs_flag);
81         if (IS_ERR(inode))
82                 return inode;
83
84         mapping_set_gfp_mask(inode->i_mapping,
85                         mapping_gfp_constraint(inode->i_mapping,
86                         ~(__GFP_FS | __GFP_HIGHMEM)));
87
88         return inode;
89 }
90
91 struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info,
92                                       struct btrfs_block_group_cache
93                                       *block_group, struct btrfs_path *path)
94 {
95         struct inode *inode = NULL;
96         u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
97
98         spin_lock(&block_group->lock);
99         if (block_group->inode)
100                 inode = igrab(block_group->inode);
101         spin_unlock(&block_group->lock);
102         if (inode)
103                 return inode;
104
105         inode = __lookup_free_space_inode(fs_info->tree_root, path,
106                                           block_group->key.objectid);
107         if (IS_ERR(inode))
108                 return inode;
109
110         spin_lock(&block_group->lock);
111         if (!((BTRFS_I(inode)->flags & flags) == flags)) {
112                 btrfs_info(fs_info, "Old style space inode found, converting.");
113                 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
114                         BTRFS_INODE_NODATACOW;
115                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
116         }
117
118         if (!block_group->iref) {
119                 block_group->inode = igrab(inode);
120                 block_group->iref = 1;
121         }
122         spin_unlock(&block_group->lock);
123
124         return inode;
125 }
126
127 static int __create_free_space_inode(struct btrfs_root *root,
128                                      struct btrfs_trans_handle *trans,
129                                      struct btrfs_path *path,
130                                      u64 ino, u64 offset)
131 {
132         struct btrfs_key key;
133         struct btrfs_disk_key disk_key;
134         struct btrfs_free_space_header *header;
135         struct btrfs_inode_item *inode_item;
136         struct extent_buffer *leaf;
137         u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
138         int ret;
139
140         ret = btrfs_insert_empty_inode(trans, root, path, ino);
141         if (ret)
142                 return ret;
143
144         /* We inline crcs for the free disk space cache */
145         if (ino != BTRFS_FREE_INO_OBJECTID)
146                 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
147
148         leaf = path->nodes[0];
149         inode_item = btrfs_item_ptr(leaf, path->slots[0],
150                                     struct btrfs_inode_item);
151         btrfs_item_key(leaf, &disk_key, path->slots[0]);
152         memzero_extent_buffer(leaf, (unsigned long)inode_item,
153                              sizeof(*inode_item));
154         btrfs_set_inode_generation(leaf, inode_item, trans->transid);
155         btrfs_set_inode_size(leaf, inode_item, 0);
156         btrfs_set_inode_nbytes(leaf, inode_item, 0);
157         btrfs_set_inode_uid(leaf, inode_item, 0);
158         btrfs_set_inode_gid(leaf, inode_item, 0);
159         btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
160         btrfs_set_inode_flags(leaf, inode_item, flags);
161         btrfs_set_inode_nlink(leaf, inode_item, 1);
162         btrfs_set_inode_transid(leaf, inode_item, trans->transid);
163         btrfs_set_inode_block_group(leaf, inode_item, offset);
164         btrfs_mark_buffer_dirty(leaf);
165         btrfs_release_path(path);
166
167         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
168         key.offset = offset;
169         key.type = 0;
170         ret = btrfs_insert_empty_item(trans, root, path, &key,
171                                       sizeof(struct btrfs_free_space_header));
172         if (ret < 0) {
173                 btrfs_release_path(path);
174                 return ret;
175         }
176
177         leaf = path->nodes[0];
178         header = btrfs_item_ptr(leaf, path->slots[0],
179                                 struct btrfs_free_space_header);
180         memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
181         btrfs_set_free_space_key(leaf, header, &disk_key);
182         btrfs_mark_buffer_dirty(leaf);
183         btrfs_release_path(path);
184
185         return 0;
186 }
187
188 int create_free_space_inode(struct btrfs_trans_handle *trans,
189                             struct btrfs_block_group_cache *block_group,
190                             struct btrfs_path *path)
191 {
192         int ret;
193         u64 ino;
194
195         ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
196         if (ret < 0)
197                 return ret;
198
199         return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
200                                          ino, block_group->key.objectid);
201 }
202
203 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
204                                        struct btrfs_block_rsv *rsv)
205 {
206         u64 needed_bytes;
207         int ret;
208
209         /* 1 for slack space, 1 for updating the inode */
210         needed_bytes = btrfs_calc_trunc_metadata_size(fs_info, 1) +
211                 btrfs_calc_trans_metadata_size(fs_info, 1);
212
213         spin_lock(&rsv->lock);
214         if (rsv->reserved < needed_bytes)
215                 ret = -ENOSPC;
216         else
217                 ret = 0;
218         spin_unlock(&rsv->lock);
219         return ret;
220 }
221
222 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
223                                     struct btrfs_block_group_cache *block_group,
224                                     struct inode *inode)
225 {
226         struct btrfs_root *root = BTRFS_I(inode)->root;
227         int ret = 0;
228         bool locked = false;
229
230         if (block_group) {
231                 struct btrfs_path *path = btrfs_alloc_path();
232
233                 if (!path) {
234                         ret = -ENOMEM;
235                         goto fail;
236                 }
237                 locked = true;
238                 mutex_lock(&trans->transaction->cache_write_mutex);
239                 if (!list_empty(&block_group->io_list)) {
240                         list_del_init(&block_group->io_list);
241
242                         btrfs_wait_cache_io(trans, block_group, path);
243                         btrfs_put_block_group(block_group);
244                 }
245
246                 /*
247                  * Now that we've truncated the cache away, it's no longer
248                  * set up or written
249                  */
250                 spin_lock(&block_group->lock);
251                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
252                 spin_unlock(&block_group->lock);
253                 btrfs_free_path(path);
254         }
255
256         btrfs_i_size_write(BTRFS_I(inode), 0);
257         truncate_pagecache(inode, 0);
258
259         /*
260          * We skip the throttling logic for free space cache inodes, so we don't
261          * need to check for -EAGAIN.
262          */
263         ret = btrfs_truncate_inode_items(trans, root, inode,
264                                          0, BTRFS_EXTENT_DATA_KEY);
265         if (ret)
266                 goto fail;
267
268         ret = btrfs_update_inode(trans, root, inode);
269
270 fail:
271         if (locked)
272                 mutex_unlock(&trans->transaction->cache_write_mutex);
273         if (ret)
274                 btrfs_abort_transaction(trans, ret);
275
276         return ret;
277 }
278
279 static void readahead_cache(struct inode *inode)
280 {
281         struct file_ra_state *ra;
282         unsigned long last_index;
283
284         ra = kzalloc(sizeof(*ra), GFP_NOFS);
285         if (!ra)
286                 return;
287
288         file_ra_state_init(ra, inode->i_mapping);
289         last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
290
291         page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
292
293         kfree(ra);
294 }
295
296 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
297                        int write)
298 {
299         int num_pages;
300         int check_crcs = 0;
301
302         num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
303
304         if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
305                 check_crcs = 1;
306
307         /* Make sure we can fit our crcs and generation into the first page */
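        /*
         * Worked example, assuming 4K pages: num_pages * 4 + 8 must fit in
         * one page, so at most 1022 pages of cache data (roughly 4MiB) can
         * have their crcs inlined here.
         */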
308         if (write && check_crcs &&
309             (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
310                 return -ENOSPC;
311
312         memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
313
314         io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
315         if (!io_ctl->pages)
316                 return -ENOMEM;
317
318         io_ctl->num_pages = num_pages;
319         io_ctl->fs_info = btrfs_sb(inode->i_sb);
320         io_ctl->check_crcs = check_crcs;
321         io_ctl->inode = inode;
322
323         return 0;
324 }
325 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
326
327 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
328 {
329         kfree(io_ctl->pages);
330         io_ctl->pages = NULL;
331 }
332
333 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
334 {
335         if (io_ctl->cur) {
336                 io_ctl->cur = NULL;
337                 io_ctl->orig = NULL;
338         }
339 }
340
341 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
342 {
343         ASSERT(io_ctl->index < io_ctl->num_pages);
344         io_ctl->page = io_ctl->pages[io_ctl->index++];
345         io_ctl->cur = page_address(io_ctl->page);
346         io_ctl->orig = io_ctl->cur;
347         io_ctl->size = PAGE_SIZE;
348         if (clear)
349                 clear_page(io_ctl->cur);
350 }
351
352 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
353 {
354         int i;
355
356         io_ctl_unmap_page(io_ctl);
357
358         for (i = 0; i < io_ctl->num_pages; i++) {
359                 if (io_ctl->pages[i]) {
360                         ClearPageChecked(io_ctl->pages[i]);
361                         unlock_page(io_ctl->pages[i]);
362                         put_page(io_ctl->pages[i]);
363                 }
364         }
365 }
366
367 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
368                                 int uptodate)
369 {
370         struct page *page;
371         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
372         int i;
373
374         for (i = 0; i < io_ctl->num_pages; i++) {
375                 page = find_or_create_page(inode->i_mapping, i, mask);
376                 if (!page) {
377                         io_ctl_drop_pages(io_ctl);
378                         return -ENOMEM;
379                 }
380                 io_ctl->pages[i] = page;
381                 if (uptodate && !PageUptodate(page)) {
382                         btrfs_readpage(NULL, page);
383                         lock_page(page);
384                         if (!PageUptodate(page)) {
385                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
386                                            "error reading free space cache");
387                                 io_ctl_drop_pages(io_ctl);
388                                 return -EIO;
389                         }
390                 }
391         }
392
393         for (i = 0; i < io_ctl->num_pages; i++) {
394                 clear_page_dirty_for_io(io_ctl->pages[i]);
395                 set_page_extent_mapped(io_ctl->pages[i]);
396         }
397
398         return 0;
399 }
400
401 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
402 {
403         __le64 *val;
404
405         io_ctl_map_page(io_ctl, 1);
406
407         /*
408          * Skip the csum areas.  If we don't check crcs then we just have a
409          * 64bit chunk at the front of the first page.
410          */
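        /*
         * Resulting first-page layout when check_crcs is set (a sketch, not
         * to scale): [u32 crc per page][u64 generation][entries ...].
         * Without crcs only the u64 generation precedes the entries.
         */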
411         if (io_ctl->check_crcs) {
412                 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
413                 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
414         } else {
415                 io_ctl->cur += sizeof(u64);
416                 io_ctl->size -= sizeof(u64) * 2;
417         }
418
419         val = io_ctl->cur;
420         *val = cpu_to_le64(generation);
421         io_ctl->cur += sizeof(u64);
422 }
423
424 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
425 {
426         __le64 *gen;
427
428         /*
429          * Skip the crc area.  If we don't check crcs then we just have a 64bit
430          * chunk at the front of the first page.
431          */
432         if (io_ctl->check_crcs) {
433                 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
434                 io_ctl->size -= sizeof(u64) +
435                         (sizeof(u32) * io_ctl->num_pages);
436         } else {
437                 io_ctl->cur += sizeof(u64);
438                 io_ctl->size -= sizeof(u64) * 2;
439         }
440
441         gen = io_ctl->cur;
442         if (le64_to_cpu(*gen) != generation) {
443                 btrfs_err_rl(io_ctl->fs_info,
444                         "space cache generation (%llu) does not match inode (%llu)",
445                                 *gen, generation);
446                 io_ctl_unmap_page(io_ctl);
447                 return -EIO;
448         }
449         io_ctl->cur += sizeof(u64);
450         return 0;
451 }
452
453 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
454 {
455         u32 *tmp;
456         u32 crc = ~(u32)0;
457         unsigned offset = 0;
458
459         if (!io_ctl->check_crcs) {
460                 io_ctl_unmap_page(io_ctl);
461                 return;
462         }
463
464         if (index == 0)
465                 offset = sizeof(u32) * io_ctl->num_pages;
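        /*
         * Page 0's crc is computed over everything after the crc slots, so no
         * stored crc covers its own area.
         */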
466
467         crc = btrfs_csum_data(io_ctl->orig + offset, crc,
468                               PAGE_SIZE - offset);
469         btrfs_csum_final(crc, (u8 *)&crc);
470         io_ctl_unmap_page(io_ctl);
471         tmp = page_address(io_ctl->pages[0]);
472         tmp += index;
473         *tmp = crc;
474 }
475
476 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
477 {
478         u32 *tmp, val;
479         u32 crc = ~(u32)0;
480         unsigned offset = 0;
481
482         if (!io_ctl->check_crcs) {
483                 io_ctl_map_page(io_ctl, 0);
484                 return 0;
485         }
486
487         if (index == 0)
488                 offset = sizeof(u32) * io_ctl->num_pages;
489
490         tmp = page_address(io_ctl->pages[0]);
491         tmp += index;
492         val = *tmp;
493
494         io_ctl_map_page(io_ctl, 0);
495         crc = btrfs_csum_data(io_ctl->orig + offset, crc,
496                               PAGE_SIZE - offset);
497         btrfs_csum_final(crc, (u8 *)&crc);
498         if (val != crc) {
499                 btrfs_err_rl(io_ctl->fs_info,
500                         "csum mismatch on free space cache");
501                 io_ctl_unmap_page(io_ctl);
502                 return -EIO;
503         }
504
505         return 0;
506 }
507
508 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
509                             void *bitmap)
510 {
511         struct btrfs_free_space_entry *entry;
512
513         if (!io_ctl->cur)
514                 return -ENOSPC;
515
516         entry = io_ctl->cur;
517         entry->offset = cpu_to_le64(offset);
518         entry->bytes = cpu_to_le64(bytes);
519         entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
520                 BTRFS_FREE_SPACE_EXTENT;
521         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
522         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
523
524         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
525                 return 0;
526
527         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
528
529         /* No more pages to map */
530         if (io_ctl->index >= io_ctl->num_pages)
531                 return 0;
532
533         /* map the next page */
534         io_ctl_map_page(io_ctl, 1);
535         return 0;
536 }
537
538 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
539 {
540         if (!io_ctl->cur)
541                 return -ENOSPC;
542
543         /*
544          * If we aren't at the start of the current page, unmap this one and
545          * map the next one if there is any left.
546          */
547         if (io_ctl->cur != io_ctl->orig) {
548                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
549                 if (io_ctl->index >= io_ctl->num_pages)
550                         return -ENOSPC;
551                 io_ctl_map_page(io_ctl, 0);
552         }
553
554         copy_page(io_ctl->cur, bitmap);
555         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
556         if (io_ctl->index < io_ctl->num_pages)
557                 io_ctl_map_page(io_ctl, 0);
558         return 0;
559 }
560
561 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
562 {
563         /*
564          * If we're not on the boundary we know we've modified the page and we
565          * need to crc the page.
566          */
567         if (io_ctl->cur != io_ctl->orig)
568                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
569         else
570                 io_ctl_unmap_page(io_ctl);
571
572         while (io_ctl->index < io_ctl->num_pages) {
573                 io_ctl_map_page(io_ctl, 1);
574                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
575         }
576 }
577
578 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
579                             struct btrfs_free_space *entry, u8 *type)
580 {
581         struct btrfs_free_space_entry *e;
582         int ret;
583
584         if (!io_ctl->cur) {
585                 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
586                 if (ret)
587                         return ret;
588         }
589
590         e = io_ctl->cur;
591         entry->offset = le64_to_cpu(e->offset);
592         entry->bytes = le64_to_cpu(e->bytes);
593         *type = e->type;
594         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
595         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
596
597         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
598                 return 0;
599
600         io_ctl_unmap_page(io_ctl);
601
602         return 0;
603 }
604
605 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
606                               struct btrfs_free_space *entry)
607 {
608         int ret;
609
610         ret = io_ctl_check_crc(io_ctl, io_ctl->index);
611         if (ret)
612                 return ret;
613
614         copy_page(entry->bitmap, io_ctl->cur);
615         io_ctl_unmap_page(io_ctl);
616
617         return 0;
618 }
619
620 /*
621  * Since we attach pinned extents after the fact we can have contiguous sections
622  * of free space that are split up in entries.  This poses a problem for the
623  * tree logging code, since it could have allocated across what now appears to
624  * be two entries, because we would have merged those entries when adding the
625  * pinned extents back to the free space cache.  So run through the space cache
626  * that we just loaded and merge contiguous entries.  This keeps log replay from
627  * blowing up and makes for nicer allocator behavior.
628  */
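/*
 * Example (offsets illustrative): if the cache was loaded with adjacent
 * extent entries [1M, 1M+64K) and [1M+64K, 1M+128K), the loop below unlinks
 * both, folds the second into the first and relinks a single [1M, 1M+128K)
 * entry.
 */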
629 static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
630 {
631         struct btrfs_free_space *e, *prev = NULL;
632         struct rb_node *n;
633
634 again:
635         spin_lock(&ctl->tree_lock);
636         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
637                 e = rb_entry(n, struct btrfs_free_space, offset_index);
638                 if (!prev)
639                         goto next;
640                 if (e->bitmap || prev->bitmap)
641                         goto next;
642                 if (prev->offset + prev->bytes == e->offset) {
643                         unlink_free_space(ctl, prev);
644                         unlink_free_space(ctl, e);
645                         prev->bytes += e->bytes;
646                         kmem_cache_free(btrfs_free_space_cachep, e);
647                         link_free_space(ctl, prev);
648                         prev = NULL;
649                         spin_unlock(&ctl->tree_lock);
650                         goto again;
651                 }
652 next:
653                 prev = e;
654         }
655         spin_unlock(&ctl->tree_lock);
656 }
657
658 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
659                                    struct btrfs_free_space_ctl *ctl,
660                                    struct btrfs_path *path, u64 offset)
661 {
662         struct btrfs_fs_info *fs_info = root->fs_info;
663         struct btrfs_free_space_header *header;
664         struct extent_buffer *leaf;
665         struct btrfs_io_ctl io_ctl;
666         struct btrfs_key key;
667         struct btrfs_free_space *e, *n;
668         LIST_HEAD(bitmaps);
669         u64 num_entries;
670         u64 num_bitmaps;
671         u64 generation;
672         u8 type;
673         int ret = 0;
674
675         /* Nothing in the space cache, goodbye */
676         if (!i_size_read(inode))
677                 return 0;
678
679         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
680         key.offset = offset;
681         key.type = 0;
682
683         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
684         if (ret < 0)
685                 return 0;
686         else if (ret > 0) {
687                 btrfs_release_path(path);
688                 return 0;
689         }
690
691         ret = -1;
692
693         leaf = path->nodes[0];
694         header = btrfs_item_ptr(leaf, path->slots[0],
695                                 struct btrfs_free_space_header);
696         num_entries = btrfs_free_space_entries(leaf, header);
697         num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
698         generation = btrfs_free_space_generation(leaf, header);
699         btrfs_release_path(path);
700
701         if (!BTRFS_I(inode)->generation) {
702                 btrfs_info(fs_info,
703                            "the free space cache file (%llu) is invalid, skip it",
704                            offset);
705                 return 0;
706         }
707
708         if (BTRFS_I(inode)->generation != generation) {
709                 btrfs_err(fs_info,
710                           "free space inode generation (%llu) did not match free space cache generation (%llu)",
711                           BTRFS_I(inode)->generation, generation);
712                 return 0;
713         }
714
715         if (!num_entries)
716                 return 0;
717
718         ret = io_ctl_init(&io_ctl, inode, 0);
719         if (ret)
720                 return ret;
721
722         readahead_cache(inode);
723
724         ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
725         if (ret)
726                 goto out;
727
728         ret = io_ctl_check_crc(&io_ctl, 0);
729         if (ret)
730                 goto free_cache;
731
732         ret = io_ctl_check_generation(&io_ctl, generation);
733         if (ret)
734                 goto free_cache;
735
736         while (num_entries) {
737                 e = kmem_cache_zalloc(btrfs_free_space_cachep,
738                                       GFP_NOFS);
739                 if (!e)
740                         goto free_cache;
741
742                 ret = io_ctl_read_entry(&io_ctl, e, &type);
743                 if (ret) {
744                         kmem_cache_free(btrfs_free_space_cachep, e);
745                         goto free_cache;
746                 }
747
748                 if (!e->bytes) {
749                         kmem_cache_free(btrfs_free_space_cachep, e);
750                         goto free_cache;
751                 }
752
753                 if (type == BTRFS_FREE_SPACE_EXTENT) {
754                         spin_lock(&ctl->tree_lock);
755                         ret = link_free_space(ctl, e);
756                         spin_unlock(&ctl->tree_lock);
757                         if (ret) {
758                                 btrfs_err(fs_info,
759                                         "Duplicate entries in free space cache, dumping");
760                                 kmem_cache_free(btrfs_free_space_cachep, e);
761                                 goto free_cache;
762                         }
763                 } else {
764                         ASSERT(num_bitmaps);
765                         num_bitmaps--;
766                         e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
767                         if (!e->bitmap) {
768                                 kmem_cache_free(
769                                         btrfs_free_space_cachep, e);
770                                 goto free_cache;
771                         }
772                         spin_lock(&ctl->tree_lock);
773                         ret = link_free_space(ctl, e);
774                         ctl->total_bitmaps++;
775                         ctl->op->recalc_thresholds(ctl);
776                         spin_unlock(&ctl->tree_lock);
777                         if (ret) {
778                                 btrfs_err(fs_info,
779                                         "Duplicate entries in free space cache, dumping");
780                                 kmem_cache_free(btrfs_free_space_cachep, e);
781                                 goto free_cache;
782                         }
783                         list_add_tail(&e->list, &bitmaps);
784                 }
785
786                 num_entries--;
787         }
788
789         io_ctl_unmap_page(&io_ctl);
790
791         /*
792          * The bitmap data is stored after all the entries, in the same order
793          * that the bitmap entries were added to the cache above.
794          */
795         list_for_each_entry_safe(e, n, &bitmaps, list) {
796                 list_del_init(&e->list);
797                 ret = io_ctl_read_bitmap(&io_ctl, e);
798                 if (ret)
799                         goto free_cache;
800         }
801
802         io_ctl_drop_pages(&io_ctl);
803         merge_space_tree(ctl);
804         ret = 1;
805 out:
806         io_ctl_free(&io_ctl);
807         return ret;
808 free_cache:
809         io_ctl_drop_pages(&io_ctl);
810         __btrfs_remove_free_space_cache(ctl);
811         goto out;
812 }
813
814 int load_free_space_cache(struct btrfs_fs_info *fs_info,
815                           struct btrfs_block_group_cache *block_group)
816 {
817         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
818         struct inode *inode;
819         struct btrfs_path *path;
820         int ret = 0;
821         bool matched;
822         u64 used = btrfs_block_group_used(&block_group->item);
823
824         /*
825          * If this block group has been marked to be cleared for one reason or
826          * another then we can't trust the on disk cache, so just return.
827          */
828         spin_lock(&block_group->lock);
829         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
830                 spin_unlock(&block_group->lock);
831                 return 0;
832         }
833         spin_unlock(&block_group->lock);
834
835         path = btrfs_alloc_path();
836         if (!path)
837                 return 0;
838         path->search_commit_root = 1;
839         path->skip_locking = 1;
840
841         /*
842          * We must pass a path with search_commit_root set to btrfs_iget in
843          * order to avoid a deadlock when allocating extents for the tree root.
844          *
845          * When we are COWing an extent buffer from the tree root, when looking
846          * for a free extent, at extent-tree.c:find_free_extent(), we can find a
847          * block group without its free space cache loaded. When we find one
848          * we must load its space cache which requires reading its free space
849          * cache's inode item from the root tree. If this inode item is located
850          * in the same leaf that we started COWing before, then we end up in a
851          * deadlock on the extent buffer (trying to read lock it when we
852          * previously write locked it).
853          *
854          * It's safe to read the inode item using the commit root because
855          * block groups, once loaded, stay in memory forever (until they are
856          * removed) as well as their space caches once loaded. New block groups
857          * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
858          * we will never try to read their inode item while the fs is mounted.
859          */
860         inode = lookup_free_space_inode(fs_info, block_group, path);
861         if (IS_ERR(inode)) {
862                 btrfs_free_path(path);
863                 return 0;
864         }
865
866         /* We may have converted the inode and made the cache invalid. */
867         spin_lock(&block_group->lock);
868         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
869                 spin_unlock(&block_group->lock);
870                 btrfs_free_path(path);
871                 goto out;
872         }
873         spin_unlock(&block_group->lock);
874
875         ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
876                                       path, block_group->key.objectid);
877         btrfs_free_path(path);
878         if (ret <= 0)
879                 goto out;
880
881         spin_lock(&ctl->tree_lock);
882         matched = (ctl->free_space == (block_group->key.offset - used -
883                                        block_group->bytes_super));
884         spin_unlock(&ctl->tree_lock);
885
886         if (!matched) {
887                 __btrfs_remove_free_space_cache(ctl);
888                 btrfs_warn(fs_info,
889                            "block group %llu has wrong amount of free space",
890                            block_group->key.objectid);
891                 ret = -1;
892         }
893 out:
894         if (ret < 0) {
895                 /* This cache is bogus, make sure it gets cleared */
896                 spin_lock(&block_group->lock);
897                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
898                 spin_unlock(&block_group->lock);
899                 ret = 0;
900
901                 btrfs_warn(fs_info,
902                            "failed to load free space cache for block group %llu, rebuilding it now",
903                            block_group->key.objectid);
904         }
905
906         iput(inode);
907         return ret;
908 }
909
910 static noinline_for_stack
911 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
912                               struct btrfs_free_space_ctl *ctl,
913                               struct btrfs_block_group_cache *block_group,
914                               int *entries, int *bitmaps,
915                               struct list_head *bitmap_list)
916 {
917         int ret;
918         struct btrfs_free_cluster *cluster = NULL;
919         struct btrfs_free_cluster *cluster_locked = NULL;
920         struct rb_node *node = rb_first(&ctl->free_space_offset);
921         struct btrfs_trim_range *trim_entry;
922
923         /* Get the cluster for this block_group if it exists */
924         if (block_group && !list_empty(&block_group->cluster_list)) {
925                 cluster = list_entry(block_group->cluster_list.next,
926                                      struct btrfs_free_cluster,
927                                      block_group_list);
928         }
929
930         if (!node && cluster) {
931                 cluster_locked = cluster;
932                 spin_lock(&cluster_locked->lock);
933                 node = rb_first(&cluster->root);
934                 cluster = NULL;
935         }
936
937         /* Write out the extent entries */
938         while (node) {
939                 struct btrfs_free_space *e;
940
941                 e = rb_entry(node, struct btrfs_free_space, offset_index);
942                 *entries += 1;
943
944                 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
945                                        e->bitmap);
946                 if (ret)
947                         goto fail;
948
949                 if (e->bitmap) {
950                         list_add_tail(&e->list, bitmap_list);
951                         *bitmaps += 1;
952                 }
953                 node = rb_next(node);
954                 if (!node && cluster) {
955                         node = rb_first(&cluster->root);
956                         cluster_locked = cluster;
957                         spin_lock(&cluster_locked->lock);
958                         cluster = NULL;
959                 }
960         }
961         if (cluster_locked) {
962                 spin_unlock(&cluster_locked->lock);
963                 cluster_locked = NULL;
964         }
965
966         /*
967          * Make sure we don't miss any range that was removed from our rbtree
968          * because trimming is running. Otherwise after a umount+mount (or crash
969          * after committing the transaction) we would leak free space and get
970          * an inconsistent free space cache report from fsck.
971          */
972         list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
973                 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
974                                        trim_entry->bytes, NULL);
975                 if (ret)
976                         goto fail;
977                 *entries += 1;
978         }
979
980         return 0;
981 fail:
982         if (cluster_locked)
983                 spin_unlock(&cluster_locked->lock);
984         return -ENOSPC;
985 }
986
987 static noinline_for_stack int
988 update_cache_item(struct btrfs_trans_handle *trans,
989                   struct btrfs_root *root,
990                   struct inode *inode,
991                   struct btrfs_path *path, u64 offset,
992                   int entries, int bitmaps)
993 {
994         struct btrfs_key key;
995         struct btrfs_free_space_header *header;
996         struct extent_buffer *leaf;
997         int ret;
998
999         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1000         key.offset = offset;
1001         key.type = 0;
1002
1003         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1004         if (ret < 0) {
1005                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1006                                  EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
1007                 goto fail;
1008         }
1009         leaf = path->nodes[0];
1010         if (ret > 0) {
1011                 struct btrfs_key found_key;
1012                 ASSERT(path->slots[0]);
1013                 path->slots[0]--;
1014                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1015                 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1016                     found_key.offset != offset) {
1017                         clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1018                                          inode->i_size - 1,
1019                                          EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
1020                                          NULL);
1021                         btrfs_release_path(path);
1022                         goto fail;
1023                 }
1024         }
1025
1026         BTRFS_I(inode)->generation = trans->transid;
1027         header = btrfs_item_ptr(leaf, path->slots[0],
1028                                 struct btrfs_free_space_header);
1029         btrfs_set_free_space_entries(leaf, header, entries);
1030         btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1031         btrfs_set_free_space_generation(leaf, header, trans->transid);
1032         btrfs_mark_buffer_dirty(leaf);
1033         btrfs_release_path(path);
1034
1035         return 0;
1036
1037 fail:
1038         return -1;
1039 }
1040
1041 static noinline_for_stack int
1042 write_pinned_extent_entries(struct btrfs_fs_info *fs_info,
1043                             struct btrfs_block_group_cache *block_group,
1044                             struct btrfs_io_ctl *io_ctl,
1045                             int *entries)
1046 {
1047         u64 start, extent_start, extent_end, len;
1048         struct extent_io_tree *unpin = NULL;
1049         int ret;
1050
1051         if (!block_group)
1052                 return 0;
1053
1054         /*
1055          * We want to add any pinned extents to our free space cache
1056          * so we don't leak the space
1057          *
1058          * We shouldn't have switched the pinned extents yet so this is the
1059          * right one
1060          */
1061         unpin = fs_info->pinned_extents;
1062
1063         start = block_group->key.objectid;
1064
1065         while (start < block_group->key.objectid + block_group->key.offset) {
1066                 ret = find_first_extent_bit(unpin, start,
1067                                             &extent_start, &extent_end,
1068                                             EXTENT_DIRTY, NULL);
1069                 if (ret)
1070                         return 0;
1071
1072                 /* This pinned extent is out of our range */
1073                 if (extent_start >= block_group->key.objectid +
1074                     block_group->key.offset)
1075                         return 0;
1076
1077                 extent_start = max(extent_start, start);
1078                 extent_end = min(block_group->key.objectid +
1079                                  block_group->key.offset, extent_end + 1);
1080                 len = extent_end - extent_start;
1081
1082                 *entries += 1;
1083                 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1084                 if (ret)
1085                         return -ENOSPC;
1086
1087                 start = extent_end;
1088         }
1089
1090         return 0;
1091 }
1092
1093 static noinline_for_stack int
1094 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1095 {
1096         struct btrfs_free_space *entry, *next;
1097         int ret;
1098
1099         /* Write out the bitmaps */
1100         list_for_each_entry_safe(entry, next, bitmap_list, list) {
1101                 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1102                 if (ret)
1103                         return -ENOSPC;
1104                 list_del_init(&entry->list);
1105         }
1106
1107         return 0;
1108 }
1109
1110 static int flush_dirty_cache(struct inode *inode)
1111 {
1112         int ret;
1113
1114         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1115         if (ret)
1116                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1117                                  EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
1118
1119         return ret;
1120 }
1121
1122 static void noinline_for_stack
1123 cleanup_bitmap_list(struct list_head *bitmap_list)
1124 {
1125         struct btrfs_free_space *entry, *next;
1126
1127         list_for_each_entry_safe(entry, next, bitmap_list, list)
1128                 list_del_init(&entry->list);
1129 }
1130
1131 static void noinline_for_stack
1132 cleanup_write_cache_enospc(struct inode *inode,
1133                            struct btrfs_io_ctl *io_ctl,
1134                            struct extent_state **cached_state)
1135 {
1136         io_ctl_drop_pages(io_ctl);
1137         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1138                              i_size_read(inode) - 1, cached_state);
1139 }
1140
1141 static int __btrfs_wait_cache_io(struct btrfs_root *root,
1142                                  struct btrfs_trans_handle *trans,
1143                                  struct btrfs_block_group_cache *block_group,
1144                                  struct btrfs_io_ctl *io_ctl,
1145                                  struct btrfs_path *path, u64 offset)
1146 {
1147         int ret;
1148         struct inode *inode = io_ctl->inode;
1149
1150         if (!inode)
1151                 return 0;
1152
1153         /* Flush the dirty pages in the cache file. */
1154         ret = flush_dirty_cache(inode);
1155         if (ret)
1156                 goto out;
1157
1158         /* Update the cache item to tell everyone this cache file is valid. */
1159         ret = update_cache_item(trans, root, inode, path, offset,
1160                                 io_ctl->entries, io_ctl->bitmaps);
1161 out:
1162         io_ctl_free(io_ctl);
1163         if (ret) {
1164                 invalidate_inode_pages2(inode->i_mapping);
1165                 BTRFS_I(inode)->generation = 0;
1166                 if (block_group) {
1167 #ifdef DEBUG
1168                         btrfs_err(root->fs_info,
1169                                   "failed to write free space cache for block group %llu",
1170                                   block_group->key.objectid);
1171 #endif
1172                 }
1173         }
1174         btrfs_update_inode(trans, root, inode);
1175
1176         if (block_group) {
1177                 /* the dirty list is protected by the dirty_bgs_lock */
1178                 spin_lock(&trans->transaction->dirty_bgs_lock);
1179
1180                 /* the disk_cache_state is protected by the block group lock */
1181                 spin_lock(&block_group->lock);
1182
1183                 /*
1184                  * Only mark this as written if we didn't get put back on
1185                  * the dirty list while waiting for IO. Otherwise our
1186                  * cache state won't be right, and we won't get written again.
1187                  */
1188                 if (!ret && list_empty(&block_group->dirty_list))
1189                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1190                 else if (ret)
1191                         block_group->disk_cache_state = BTRFS_DC_ERROR;
1192
1193                 spin_unlock(&block_group->lock);
1194                 spin_unlock(&trans->transaction->dirty_bgs_lock);
1195                 io_ctl->inode = NULL;
1196                 iput(inode);
1197         }
1198
1199         return ret;
1200
1201 }
1202
1203 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
1204                                     struct btrfs_trans_handle *trans,
1205                                     struct btrfs_io_ctl *io_ctl,
1206                                     struct btrfs_path *path)
1207 {
1208         return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
1209 }
1210
1211 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1212                         struct btrfs_block_group_cache *block_group,
1213                         struct btrfs_path *path)
1214 {
1215         return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1216                                      block_group, &block_group->io_ctl,
1217                                      path, block_group->key.objectid);
1218 }
1219
1220 /**
1221  * __btrfs_write_out_cache - write out cached info to an inode
1222  * @root - the root the inode belongs to
1223  * @ctl - the free space cache we are going to write out
1224  * @block_group - the block_group for this cache if it belongs to a block_group
1225  * @trans - the trans handle
1226  *
1227  * This function writes out a free space cache struct to disk for quick recovery
1228  * on mount.  This will return 0 if it was successful in writing the cache out,
1229  * or an errno if it was not.
1230  */
1231 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1232                                    struct btrfs_free_space_ctl *ctl,
1233                                    struct btrfs_block_group_cache *block_group,
1234                                    struct btrfs_io_ctl *io_ctl,
1235                                    struct btrfs_trans_handle *trans)
1236 {
1237         struct btrfs_fs_info *fs_info = root->fs_info;
1238         struct extent_state *cached_state = NULL;
1239         LIST_HEAD(bitmap_list);
1240         int entries = 0;
1241         int bitmaps = 0;
1242         int ret;
1243         int must_iput = 0;
1244
1245         if (!i_size_read(inode))
1246                 return -EIO;
1247
1248         WARN_ON(io_ctl->pages);
1249         ret = io_ctl_init(io_ctl, inode, 1);
1250         if (ret)
1251                 return ret;
1252
1253         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1254                 down_write(&block_group->data_rwsem);
1255                 spin_lock(&block_group->lock);
1256                 if (block_group->delalloc_bytes) {
1257                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1258                         spin_unlock(&block_group->lock);
1259                         up_write(&block_group->data_rwsem);
1260                         BTRFS_I(inode)->generation = 0;
1261                         ret = 0;
1262                         must_iput = 1;
1263                         goto out;
1264                 }
1265                 spin_unlock(&block_group->lock);
1266         }
1267
1268         /* Lock all pages first so we can lock the extent safely. */
1269         ret = io_ctl_prepare_pages(io_ctl, inode, 0);
1270         if (ret)
1271                 goto out_unlock;
1272
1273         lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1274                          &cached_state);
1275
1276         io_ctl_set_generation(io_ctl, trans->transid);
1277
1278         mutex_lock(&ctl->cache_writeout_mutex);
1279         /* Write out the extent entries in the free space cache */
1280         spin_lock(&ctl->tree_lock);
1281         ret = write_cache_extent_entries(io_ctl, ctl,
1282                                          block_group, &entries, &bitmaps,
1283                                          &bitmap_list);
1284         if (ret)
1285                 goto out_nospc_locked;
1286
1287         /*
1288          * Some spaces that are freed in the current transaction are pinned;
1289          * they will be added into the free space cache after the transaction is
1290          * committed, so we shouldn't lose them.
1291          *
1292          * If this changes while we are working we'll get added back to
1293          * the dirty list and redo it.  No locking needed
1294          */
1295         ret = write_pinned_extent_entries(fs_info, block_group,
1296                                           io_ctl, &entries);
1297         if (ret)
1298                 goto out_nospc_locked;
1299
1300         /*
1301          * At last, we write out all the bitmaps and keep cache_writeout_mutex
1302          * locked while doing it because a concurrent trim can be manipulating
1303          * or freeing the bitmap.
1304          */
1305         ret = write_bitmap_entries(io_ctl, &bitmap_list);
1306         spin_unlock(&ctl->tree_lock);
1307         mutex_unlock(&ctl->cache_writeout_mutex);
1308         if (ret)
1309                 goto out_nospc;
1310
1311         /* Zero out the rest of the pages just to make sure */
1312         io_ctl_zero_remaining_pages(io_ctl);
1313
1314         /* Everything is written out, now we dirty the pages in the file. */
1315         ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
1316                                 i_size_read(inode), &cached_state);
1317         if (ret)
1318                 goto out_nospc;
1319
1320         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1321                 up_write(&block_group->data_rwsem);
1322         /*
1323          * Release the pages and unlock the extent, we will flush
1324          * them out later
1325          */
1326         io_ctl_drop_pages(io_ctl);
1327
1328         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1329                              i_size_read(inode) - 1, &cached_state);
1330
1331         /*
1332          * At this point the pages are under IO and we're happy. The caller
1333          * is responsible for waiting on them and updating the cache and
1334          * the inode.
1335          */
1336         io_ctl->entries = entries;
1337         io_ctl->bitmaps = bitmaps;
1338
1339         ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1340         if (ret)
1341                 goto out;
1342
1343         return 0;
1344
1345 out:
1346         io_ctl->inode = NULL;
1347         io_ctl_free(io_ctl);
1348         if (ret) {
1349                 invalidate_inode_pages2(inode->i_mapping);
1350                 BTRFS_I(inode)->generation = 0;
1351         }
1352         btrfs_update_inode(trans, root, inode);
1353         if (must_iput)
1354                 iput(inode);
1355         return ret;
1356
1357 out_nospc_locked:
1358         cleanup_bitmap_list(&bitmap_list);
1359         spin_unlock(&ctl->tree_lock);
1360         mutex_unlock(&ctl->cache_writeout_mutex);
1361
1362 out_nospc:
1363         cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1364
1365 out_unlock:
1366         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1367                 up_write(&block_group->data_rwsem);
1368
1369         goto out;
1370 }
1371
1372 int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1373                           struct btrfs_block_group_cache *block_group,
1374                           struct btrfs_path *path)
1375 {
1376         struct btrfs_fs_info *fs_info = trans->fs_info;
1377         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1378         struct inode *inode;
1379         int ret = 0;
1380
1381         spin_lock(&block_group->lock);
1382         if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1383                 spin_unlock(&block_group->lock);
1384                 return 0;
1385         }
1386         spin_unlock(&block_group->lock);
1387
1388         inode = lookup_free_space_inode(fs_info, block_group, path);
1389         if (IS_ERR(inode))
1390                 return 0;
1391
1392         ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1393                                 block_group, &block_group->io_ctl, trans);
1394         if (ret) {
1395 #ifdef DEBUG
1396                 btrfs_err(fs_info,
1397                           "failed to write free space cache for block group %llu",
1398                           block_group->key.objectid);
1399 #endif
1400                 spin_lock(&block_group->lock);
1401                 block_group->disk_cache_state = BTRFS_DC_ERROR;
1402                 spin_unlock(&block_group->lock);
1403
1404                 block_group->io_ctl.inode = NULL;
1405                 iput(inode);
1406         }
1407
1408         /*
1409          * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1410          * to wait for IO and put the inode
1411          */
1412
1413         return ret;
1414 }
1415
1416 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1417                                           u64 offset)
1418 {
1419         ASSERT(offset >= bitmap_start);
1420         offset -= bitmap_start;
1421         return (unsigned long)(div_u64(offset, unit));
1422 }
1423
1424 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1425 {
1426         return (unsigned long)(div_u64(bytes, unit));
1427 }
1428
1429 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1430                                    u64 offset)
1431 {
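        /*
         * Rounds 'offset' down to the start of the bitmap that would cover it,
         * e.g. (illustrative) with ctl->start == 1G and 128MiB per bitmap, an
         * offset of 1G + 200MiB gives a bitmap_start of 1G + 128MiB.
         */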
1432         u64 bitmap_start;
1433         u64 bytes_per_bitmap;
1434
1435         bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1436         bitmap_start = offset - ctl->start;
1437         bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1438         bitmap_start *= bytes_per_bitmap;
1439         bitmap_start += ctl->start;
1440
1441         return bitmap_start;
1442 }
1443
1444 static int tree_insert_offset(struct rb_root *root, u64 offset,
1445                               struct rb_node *node, int bitmap)
1446 {
1447         struct rb_node **p = &root->rb_node;
1448         struct rb_node *parent = NULL;
1449         struct btrfs_free_space *info;
1450
1451         while (*p) {
1452                 parent = *p;
1453                 info = rb_entry(parent, struct btrfs_free_space, offset_index);
1454
1455                 if (offset < info->offset) {
1456                         p = &(*p)->rb_left;
1457                 } else if (offset > info->offset) {
1458                         p = &(*p)->rb_right;
1459                 } else {
1460                         /*
1461                          * we could have a bitmap entry and an extent entry
1462                          * share the same offset.  If this is the case, we want
1463                          * the extent entry to always be found first if we do a
1464                          * linear search through the tree, since we want to have
1465                          * the quickest allocation time, and allocating from an
1466                          * extent is faster than allocating from a bitmap.  So
1467                          * if we're inserting a bitmap and we find an entry at
1468                          * this offset, we want to go right, or after this entry
1469                          * logically.  If we are inserting an extent and we've
1470                          * found a bitmap, we want to go left, or before
1471                          * logically.
1472                          */
1473                         if (bitmap) {
1474                                 if (info->bitmap) {
1475                                         WARN_ON_ONCE(1);
1476                                         return -EEXIST;
1477                                 }
1478                                 p = &(*p)->rb_right;
1479                         } else {
1480                                 if (!info->bitmap) {
1481                                         WARN_ON_ONCE(1);
1482                                         return -EEXIST;
1483                                 }
1484                                 p = &(*p)->rb_left;
1485                         }
1486                 }
1487         }
1488
1489         rb_link_node(node, parent, p);
1490         rb_insert_color(node, root);
1491
1492         return 0;
1493 }
1494
1495 /*
1496  * searches the tree for the given offset.
1497  *
1498  * fuzzy - If this is set, then we are trying to make an allocation, and we
1499  * just want a section that is at least 'bytes' in size and comes at or after
1500  * the given offset.
1501  */
1502 static struct btrfs_free_space *
1503 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1504                    u64 offset, int bitmap_only, int fuzzy)
1505 {
1506         struct rb_node *n = ctl->free_space_offset.rb_node;
1507         struct btrfs_free_space *entry, *prev = NULL;
1508
1509         /* find entry that is closest to the 'offset' */
1510         while (1) {
1511                 if (!n) {
1512                         entry = NULL;
1513                         break;
1514                 }
1515
1516                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1517                 prev = entry;
1518
1519                 if (offset < entry->offset)
1520                         n = n->rb_left;
1521                 else if (offset > entry->offset)
1522                         n = n->rb_right;
1523                 else
1524                         break;
1525         }
1526
1527         if (bitmap_only) {
1528                 if (!entry)
1529                         return NULL;
1530                 if (entry->bitmap)
1531                         return entry;
1532
1533                 /*
1534                  * a bitmap entry and an extent entry may share the same offset;
1535                  * in that case, the bitmap entry comes after the extent entry.
1536                  */
1537                 n = rb_next(n);
1538                 if (!n)
1539                         return NULL;
1540                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1541                 if (entry->offset != offset)
1542                         return NULL;
1543
1544                 WARN_ON(!entry->bitmap);
1545                 return entry;
1546         } else if (entry) {
1547                 if (entry->bitmap) {
1548                         /*
1549                          * if previous extent entry covers the offset,
1550                          * we should return it instead of the bitmap entry
1551                          */
1552                         n = rb_prev(&entry->offset_index);
1553                         if (n) {
1554                                 prev = rb_entry(n, struct btrfs_free_space,
1555                                                 offset_index);
1556                                 if (!prev->bitmap &&
1557                                     prev->offset + prev->bytes > offset)
1558                                         entry = prev;
1559                         }
1560                 }
1561                 return entry;
1562         }
1563
1564         if (!prev)
1565                 return NULL;
1566
1567         /* find last entry before the 'offset' */
1568         entry = prev;
1569         if (entry->offset > offset) {
1570                 n = rb_prev(&entry->offset_index);
1571                 if (n) {
1572                         entry = rb_entry(n, struct btrfs_free_space,
1573                                         offset_index);
1574                         ASSERT(entry->offset <= offset);
1575                 } else {
1576                         if (fuzzy)
1577                                 return entry;
1578                         else
1579                                 return NULL;
1580                 }
1581         }
1582
1583         if (entry->bitmap) {
1584                 n = rb_prev(&entry->offset_index);
1585                 if (n) {
1586                         prev = rb_entry(n, struct btrfs_free_space,
1587                                         offset_index);
1588                         if (!prev->bitmap &&
1589                             prev->offset + prev->bytes > offset)
1590                                 return prev;
1591                 }
1592                 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1593                         return entry;
1594         } else if (entry->offset + entry->bytes > offset)
1595                 return entry;
1596
1597         if (!fuzzy)
1598                 return NULL;
1599
1600         while (1) {
1601                 if (entry->bitmap) {
1602                         if (entry->offset + BITS_PER_BITMAP *
1603                             ctl->unit > offset)
1604                                 break;
1605                 } else {
1606                         if (entry->offset + entry->bytes > offset)
1607                                 break;
1608                 }
1609
1610                 n = rb_next(&entry->offset_index);
1611                 if (!n)
1612                         return NULL;
1613                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1614         }
1615         return entry;
1616 }
1617
1618 static inline void
1619 __unlink_free_space(struct btrfs_free_space_ctl *ctl,
1620                     struct btrfs_free_space *info)
1621 {
1622         rb_erase(&info->offset_index, &ctl->free_space_offset);
1623         ctl->free_extents--;
1624 }
1625
1626 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1627                               struct btrfs_free_space *info)
1628 {
1629         __unlink_free_space(ctl, info);
1630         ctl->free_space -= info->bytes;
1631 }
1632
1633 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1634                            struct btrfs_free_space *info)
1635 {
1636         int ret = 0;
1637
1638         ASSERT(info->bytes || info->bitmap);
1639         ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1640                                  &info->offset_index, (info->bitmap != NULL));
1641         if (ret)
1642                 return ret;
1643
1644         ctl->free_space += info->bytes;
1645         ctl->free_extents++;
1646         return ret;
1647 }
1648
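/*
 * Worked example, assuming a 4K sectorsize: for a 1G block group
 * max_bytes is 32K.  With one bitmap already in use we account for
 * (1 + 1) * 4K = 8K of bitmap memory, extent_bytes becomes
 * min(32K - 8K, 32K / 2) = 16K, and extents_thresh allows
 * 16K / sizeof(struct btrfs_free_space) extent entries before new free
 * space starts going into bitmaps instead.
 */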
1649 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1650 {
1651         struct btrfs_block_group_cache *block_group = ctl->private;
1652         u64 max_bytes;
1653         u64 bitmap_bytes;
1654         u64 extent_bytes;
1655         u64 size = block_group->key.offset;
1656         u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1657         u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1658
1659         max_bitmaps = max_t(u64, max_bitmaps, 1);
1660
1661         ASSERT(ctl->total_bitmaps <= max_bitmaps);
1662
1663         /*
1664          * The goal is to keep the total amount of memory used per 1gb of space
1665          * at or below 32k, so we need to adjust how much memory we allow to be
1666          * used by extent-based free space tracking.
1667          */
1668         if (size < SZ_1G)
1669                 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1670         else
1671                 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
1672
1673         /*
1674          * we want to account for 1 more bitmap than what we have so we can make
1675          * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1676          * we add more bitmaps.
1677          */
1678         bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
1679
1680         if (bitmap_bytes >= max_bytes) {
1681                 ctl->extents_thresh = 0;
1682                 return;
1683         }
1684
1685         /*
1686          * we want the extent entry threshold to be at most 1/2 the max bytes we
1687          * can have, or whatever is left after the bitmaps, whichever is less.
1688          */
1689         extent_bytes = max_bytes - bitmap_bytes;
1690         extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1691
1692         ctl->extents_thresh =
1693                 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
1694 }
1695
1696 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1697                                        struct btrfs_free_space *info,
1698                                        u64 offset, u64 bytes)
1699 {
1700         unsigned long start, count;
1701
1702         start = offset_to_bit(info->offset, ctl->unit, offset);
1703         count = bytes_to_bits(bytes, ctl->unit);
1704         ASSERT(start + count <= BITS_PER_BITMAP);
1705
1706         bitmap_clear(info->bitmap, start, count);
1707
1708         info->bytes -= bytes;
1709         if (info->max_extent_size > ctl->unit)
1710                 info->max_extent_size = 0;
1711 }
1712
1713 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1714                               struct btrfs_free_space *info, u64 offset,
1715                               u64 bytes)
1716 {
1717         __bitmap_clear_bits(ctl, info, offset, bytes);
1718         ctl->free_space -= bytes;
1719 }
1720
1721 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1722                             struct btrfs_free_space *info, u64 offset,
1723                             u64 bytes)
1724 {
1725         unsigned long start, count;
1726
1727         start = offset_to_bit(info->offset, ctl->unit, offset);
1728         count = bytes_to_bits(bytes, ctl->unit);
1729         ASSERT(start + count <= BITS_PER_BITMAP);
1730
1731         bitmap_set(info->bitmap, start, count);
1732
1733         info->bytes += bytes;
1734         ctl->free_space += bytes;
1735 }
1736
1737 /*
1738  * If we cannot find a suitable extent, we will use 'bytes' to record
1739  * the size of the largest free extent we found.
1740  */
1741 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1742                          struct btrfs_free_space *bitmap_info, u64 *offset,
1743                          u64 *bytes, bool for_alloc)
1744 {
1745         unsigned long found_bits = 0;
1746         unsigned long max_bits = 0;
1747         unsigned long bits, i;
1748         unsigned long next_zero;
1749         unsigned long extent_bits;
1750
1751         /*
1752          * Skip searching the bitmap if we don't have a contiguous section that
1753          * is large enough for this allocation.
1754          */
1755         if (for_alloc &&
1756             bitmap_info->max_extent_size &&
1757             bitmap_info->max_extent_size < *bytes) {
1758                 *bytes = bitmap_info->max_extent_size;
1759                 return -1;
1760         }
1761
1762         i = offset_to_bit(bitmap_info->offset, ctl->unit,
1763                           max_t(u64, *offset, bitmap_info->offset));
1764         bits = bytes_to_bits(*bytes, ctl->unit);
1765
1766         for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1767                 if (for_alloc && bits == 1) {
1768                         found_bits = 1;
1769                         break;
1770                 }
1771                 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1772                                                BITS_PER_BITMAP, i);
1773                 extent_bits = next_zero - i;
1774                 if (extent_bits >= bits) {
1775                         found_bits = extent_bits;
1776                         break;
1777                 } else if (extent_bits > max_bits) {
1778                         max_bits = extent_bits;
1779                 }
1780                 i = next_zero;
1781         }
1782
1783         if (found_bits) {
1784                 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1785                 *bytes = (u64)(found_bits) * ctl->unit;
1786                 return 0;
1787         }
1788
1789         *bytes = (u64)(max_bits) * ctl->unit;
1790         bitmap_info->max_extent_size = *bytes;
1791         return -1;
1792 }
1793
1794 static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1795 {
1796         if (entry->bitmap)
1797                 return entry->max_extent_size;
1798         return entry->bytes;
1799 }
1800
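/*
 * find_free_space() walks the offset-sorted tree for the first entry that
 * can satisfy 'bytes' at the requested alignment (the alignment is only
 * applied when *bytes >= align).  The rounding below is relative to
 * ctl->start; e.g. with ctl->start = 0, entry->offset = 6K and align = 4K,
 * tmp becomes 8K and align_off = 2K, so the entry must hold bytes + 2K to
 * be usable.
 */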
1801 /* Cache the size of the max extent in bytes */
1802 static struct btrfs_free_space *
1803 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1804                 unsigned long align, u64 *max_extent_size)
1805 {
1806         struct btrfs_free_space *entry;
1807         struct rb_node *node;
1808         u64 tmp;
1809         u64 align_off;
1810         int ret;
1811
1812         if (!ctl->free_space_offset.rb_node)
1813                 goto out;
1814
1815         entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1816         if (!entry)
1817                 goto out;
1818
1819         for (node = &entry->offset_index; node; node = rb_next(node)) {
1820                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1821                 if (entry->bytes < *bytes) {
1822                         *max_extent_size = max(get_max_extent_size(entry),
1823                                                *max_extent_size);
1824                         continue;
1825                 }
1826
1827                 /* make sure the space returned is big enough
1828                  * to match our requested alignment
1829                  */
1830                 if (*bytes >= align) {
1831                         tmp = entry->offset - ctl->start + align - 1;
1832                         tmp = div64_u64(tmp, align);
1833                         tmp = tmp * align + ctl->start;
1834                         align_off = tmp - entry->offset;
1835                 } else {
1836                         align_off = 0;
1837                         tmp = entry->offset;
1838                 }
1839
1840                 if (entry->bytes < *bytes + align_off) {
1841                         *max_extent_size = max(get_max_extent_size(entry),
1842                                                *max_extent_size);
1843                         continue;
1844                 }
1845
1846                 if (entry->bitmap) {
1847                         u64 size = *bytes;
1848
1849                         ret = search_bitmap(ctl, entry, &tmp, &size, true);
1850                         if (!ret) {
1851                                 *offset = tmp;
1852                                 *bytes = size;
1853                                 return entry;
1854                         } else {
1855                                 *max_extent_size =
1856                                         max(get_max_extent_size(entry),
1857                                             *max_extent_size);
1858                         }
1859                         continue;
1860                 }
1861
1862                 *offset = tmp;
1863                 *bytes = entry->bytes - align_off;
1864                 return entry;
1865         }
1866 out:
1867         return NULL;
1868 }
1869
1870 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1871                            struct btrfs_free_space *info, u64 offset)
1872 {
1873         info->offset = offset_to_bitmap(ctl, offset);
1874         info->bytes = 0;
1875         INIT_LIST_HEAD(&info->list);
1876         link_free_space(ctl, info);
1877         ctl->total_bitmaps++;
1878
1879         ctl->op->recalc_thresholds(ctl);
1880 }
1881
1882 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1883                         struct btrfs_free_space *bitmap_info)
1884 {
1885         unlink_free_space(ctl, bitmap_info);
1886         kfree(bitmap_info->bitmap);
1887         kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1888         ctl->total_bitmaps--;
1889         ctl->op->recalc_thresholds(ctl);
1890 }
1891
1892 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1893                               struct btrfs_free_space *bitmap_info,
1894                               u64 *offset, u64 *bytes)
1895 {
1896         u64 end;
1897         u64 search_start, search_bytes;
1898         int ret;
1899
1900 again:
1901         end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1902
1903         /*
1904          * We need to search for bits in this bitmap.  We could only cover some
1905          * of the extent in this bitmap thanks to how we add space, so we need
1906          * to search for as much of it as we can and clear that amount, and then
1907          * go searching for the next bit.
1908          */
1909         search_start = *offset;
1910         search_bytes = ctl->unit;
1911         search_bytes = min(search_bytes, end - search_start + 1);
1912         ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1913                             false);
1914         if (ret < 0 || search_start != *offset)
1915                 return -EINVAL;
1916
1917         /* We may have found more bits than what we need */
1918         search_bytes = min(search_bytes, *bytes);
1919
1920         /* Cannot clear past the end of the bitmap */
1921         search_bytes = min(search_bytes, end - search_start + 1);
1922
1923         bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1924         *offset += search_bytes;
1925         *bytes -= search_bytes;
1926
1927         if (*bytes) {
1928                 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1929                 if (!bitmap_info->bytes)
1930                         free_bitmap(ctl, bitmap_info);
1931
1932                 /*
1933                  * no entry after this bitmap, but we still have bytes to
1934                  * remove, so something has gone wrong.
1935                  */
1936                 if (!next)
1937                         return -EINVAL;
1938
1939                 bitmap_info = rb_entry(next, struct btrfs_free_space,
1940                                        offset_index);
1941
1942                 /*
1943                  * if the next entry isn't a bitmap we need to return to let the
1944                  * extent stuff do its work.
1945                  */
1946                 if (!bitmap_info->bitmap)
1947                         return -EAGAIN;
1948
1949                 /*
1950                  * Ok the next item is a bitmap, but it may not actually hold
1951                  * the information for the rest of this free space stuff, so
1952                  * look for it, and if we don't find it return so we can try
1953                  * everything over again.
1954                  */
1955                 search_start = *offset;
1956                 search_bytes = ctl->unit;
1957                 ret = search_bitmap(ctl, bitmap_info, &search_start,
1958                                     &search_bytes, false);
1959                 if (ret < 0 || search_start != *offset)
1960                         return -EAGAIN;
1961
1962                 goto again;
1963         } else if (!bitmap_info->bytes)
1964                 free_bitmap(ctl, bitmap_info);
1965
1966         return 0;
1967 }
1968
1969 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1970                                struct btrfs_free_space *info, u64 offset,
1971                                u64 bytes)
1972 {
1973         u64 bytes_to_set = 0;
1974         u64 end;
1975
1976         end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1977
1978         bytes_to_set = min(end - offset, bytes);
1979
1980         bitmap_set_bits(ctl, info, offset, bytes_to_set);
1981
1982         /*
1983          * We set some bytes, we have no idea what the max extent size is
1984          * anymore.
1985          */
1986         info->max_extent_size = 0;
1987
1988         return bytes_to_set;
1989
1990 }
1991
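/*
 * use_bitmap() decides whether a new chunk of free space should go into a
 * bitmap instead of an extent entry.  Illustrative numbers, assuming 4K
 * pages and a 4K sectorsize: small extents (at most 4 sectors, 16K) only
 * stay as extent entries while no more than half of the extent slots are
 * in use, larger extents stay as extent entries whenever we are below the
 * threshold, and block groups smaller than 64M (half of what one bitmap
 * covers) never use bitmap entries at all.
 */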
1992 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1993                       struct btrfs_free_space *info)
1994 {
1995         struct btrfs_block_group_cache *block_group = ctl->private;
1996         struct btrfs_fs_info *fs_info = block_group->fs_info;
1997         bool forced = false;
1998
1999 #ifdef CONFIG_BTRFS_DEBUG
2000         if (btrfs_should_fragment_free_space(block_group))
2001                 forced = true;
2002 #endif
2003
2004         /*
2005          * If we are below the extents threshold then we can add this as an
2006          * extent, and don't have to deal with the bitmap
2007          */
2008         if (!forced && ctl->free_extents < ctl->extents_thresh) {
2009                 /*
2010                  * If this block group has some small extents we don't want to
2011                  * use up all of our free slots in the cache with them, we want
2012                  * to reserve them for larger extents, however if we have plenty
2013                  * of cache left then go ahead and add them, no sense in adding
2014                  * the overhead of a bitmap if we don't have to.
2015                  */
2016                 if (info->bytes <= fs_info->sectorsize * 4) {
2017                         if (ctl->free_extents * 2 <= ctl->extents_thresh)
2018                                 return false;
2019                 } else {
2020                         return false;
2021                 }
2022         }
2023
2024         /*
2025          * The original block groups from mkfs can be really small, like 8
2026          * megabytes, so don't bother with a bitmap for those entries.  However
2027          * some block groups can be smaller than what a bitmap would cover but
2028          * are still large enough that they could overflow the 32k memory limit,
2029          * so still allow those block groups to have a bitmap
2030          * entry.
2031          */
2032         if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
2033                 return false;
2034
2035         return true;
2036 }
2037
2038 static const struct btrfs_free_space_op free_space_op = {
2039         .recalc_thresholds      = recalculate_thresholds,
2040         .use_bitmap             = use_bitmap,
2041 };
2042
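/*
 * insert_into_bitmap() is called with ctl->tree_lock held.  It returns 1
 * if all of the free space was added to bitmap entries, 0 if the caller
 * should link an extent entry instead, and a negative errno on allocation
 * failure.  Note that the tree lock is dropped and re-taken around the
 * GFP_NOFS allocations of a new info/bitmap page, which is why the code
 * loops back to 'again' and re-searches the tree afterwards.
 */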
2043 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2044                               struct btrfs_free_space *info)
2045 {
2046         struct btrfs_free_space *bitmap_info;
2047         struct btrfs_block_group_cache *block_group = NULL;
2048         int added = 0;
2049         u64 bytes, offset, bytes_added;
2050         int ret;
2051
2052         bytes = info->bytes;
2053         offset = info->offset;
2054
2055         if (!ctl->op->use_bitmap(ctl, info))
2056                 return 0;
2057
2058         if (ctl->op == &free_space_op)
2059                 block_group = ctl->private;
2060 again:
2061         /*
2062          * Since we link bitmaps right into the cluster we need to see if we
2063          * have a cluster here, and if so and it has our bitmap we need to add
2064          * the free space to that bitmap.
2065          */
2066         if (block_group && !list_empty(&block_group->cluster_list)) {
2067                 struct btrfs_free_cluster *cluster;
2068                 struct rb_node *node;
2069                 struct btrfs_free_space *entry;
2070
2071                 cluster = list_entry(block_group->cluster_list.next,
2072                                      struct btrfs_free_cluster,
2073                                      block_group_list);
2074                 spin_lock(&cluster->lock);
2075                 node = rb_first(&cluster->root);
2076                 if (!node) {
2077                         spin_unlock(&cluster->lock);
2078                         goto no_cluster_bitmap;
2079                 }
2080
2081                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2082                 if (!entry->bitmap) {
2083                         spin_unlock(&cluster->lock);
2084                         goto no_cluster_bitmap;
2085                 }
2086
2087                 if (entry->offset == offset_to_bitmap(ctl, offset)) {
2088                         bytes_added = add_bytes_to_bitmap(ctl, entry,
2089                                                           offset, bytes);
2090                         bytes -= bytes_added;
2091                         offset += bytes_added;
2092                 }
2093                 spin_unlock(&cluster->lock);
2094                 if (!bytes) {
2095                         ret = 1;
2096                         goto out;
2097                 }
2098         }
2099
2100 no_cluster_bitmap:
2101         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2102                                          1, 0);
2103         if (!bitmap_info) {
2104                 ASSERT(added == 0);
2105                 goto new_bitmap;
2106         }
2107
2108         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
2109         bytes -= bytes_added;
2110         offset += bytes_added;
2111         added = 0;
2112
2113         if (!bytes) {
2114                 ret = 1;
2115                 goto out;
2116         } else
2117                 goto again;
2118
2119 new_bitmap:
2120         if (info && info->bitmap) {
2121                 add_new_bitmap(ctl, info, offset);
2122                 added = 1;
2123                 info = NULL;
2124                 goto again;
2125         } else {
2126                 spin_unlock(&ctl->tree_lock);
2127
2128                 /* no pre-allocated info, allocate a new one */
2129                 if (!info) {
2130                         info = kmem_cache_zalloc(btrfs_free_space_cachep,
2131                                                  GFP_NOFS);
2132                         if (!info) {
2133                                 spin_lock(&ctl->tree_lock);
2134                                 ret = -ENOMEM;
2135                                 goto out;
2136                         }
2137                 }
2138
2139                 /* allocate the bitmap */
2140                 info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
2141                 spin_lock(&ctl->tree_lock);
2142                 if (!info->bitmap) {
2143                         ret = -ENOMEM;
2144                         goto out;
2145                 }
2146                 goto again;
2147         }
2148
2149 out:
2150         if (info) {
2151                 kfree(info->bitmap);
2152                 kmem_cache_free(btrfs_free_space_cachep, info);
2153         }
2154
2155         return ret;
2156 }
2157
2158 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2159                           struct btrfs_free_space *info, bool update_stat)
2160 {
2161         struct btrfs_free_space *left_info;
2162         struct btrfs_free_space *right_info;
2163         bool merged = false;
2164         u64 offset = info->offset;
2165         u64 bytes = info->bytes;
2166
2167         /*
2168          * first we want to see if there is free space adjacent to the range we
2169          * are adding, if there is remove that struct and add a new one to
2170          * cover the entire range
2171          */
2172         right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2173         if (right_info && rb_prev(&right_info->offset_index))
2174                 left_info = rb_entry(rb_prev(&right_info->offset_index),
2175                                      struct btrfs_free_space, offset_index);
2176         else
2177                 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2178
2179         if (right_info && !right_info->bitmap) {
2180                 if (update_stat)
2181                         unlink_free_space(ctl, right_info);
2182                 else
2183                         __unlink_free_space(ctl, right_info);
2184                 info->bytes += right_info->bytes;
2185                 kmem_cache_free(btrfs_free_space_cachep, right_info);
2186                 merged = true;
2187         }
2188
2189         if (left_info && !left_info->bitmap &&
2190             left_info->offset + left_info->bytes == offset) {
2191                 if (update_stat)
2192                         unlink_free_space(ctl, left_info);
2193                 else
2194                         __unlink_free_space(ctl, left_info);
2195                 info->offset = left_info->offset;
2196                 info->bytes += left_info->bytes;
2197                 kmem_cache_free(btrfs_free_space_cachep, left_info);
2198                 merged = true;
2199         }
2200
2201         return merged;
2202 }
2203
2204 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2205                                      struct btrfs_free_space *info,
2206                                      bool update_stat)
2207 {
2208         struct btrfs_free_space *bitmap;
2209         unsigned long i;
2210         unsigned long j;
2211         const u64 end = info->offset + info->bytes;
2212         const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2213         u64 bytes;
2214
2215         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2216         if (!bitmap)
2217                 return false;
2218
2219         i = offset_to_bit(bitmap->offset, ctl->unit, end);
2220         j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2221         if (j == i)
2222                 return false;
2223         bytes = (j - i) * ctl->unit;
2224         info->bytes += bytes;
2225
2226         if (update_stat)
2227                 bitmap_clear_bits(ctl, bitmap, end, bytes);
2228         else
2229                 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2230
2231         if (!bitmap->bytes)
2232                 free_bitmap(ctl, bitmap);
2233
2234         return true;
2235 }
2236
2237 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2238                                        struct btrfs_free_space *info,
2239                                        bool update_stat)
2240 {
2241         struct btrfs_free_space *bitmap;
2242         u64 bitmap_offset;
2243         unsigned long i;
2244         unsigned long j;
2245         unsigned long prev_j;
2246         u64 bytes;
2247
2248         bitmap_offset = offset_to_bitmap(ctl, info->offset);
2249         /* If we're on a boundary, try the previous logical bitmap. */
2250         if (bitmap_offset == info->offset) {
2251                 if (info->offset == 0)
2252                         return false;
2253                 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2254         }
2255
2256         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2257         if (!bitmap)
2258                 return false;
2259
2260         i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2261         j = 0;
2262         prev_j = (unsigned long)-1;
2263         for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2264                 if (j > i)
2265                         break;
2266                 prev_j = j;
2267         }
2268         if (prev_j == i)
2269                 return false;
2270
2271         if (prev_j == (unsigned long)-1)
2272                 bytes = (i + 1) * ctl->unit;
2273         else
2274                 bytes = (i - prev_j) * ctl->unit;
2275
2276         info->offset -= bytes;
2277         info->bytes += bytes;
2278
2279         if (update_stat)
2280                 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2281         else
2282                 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2283
2284         if (!bitmap->bytes)
2285                 free_bitmap(ctl, bitmap);
2286
2287         return true;
2288 }
2289
2290 /*
2291  * We prefer always to allocate from extent entries, both for clustered and
2292  * non-clustered allocation requests. So when attempting to add a new extent
2293  * entry, try to see if there's adjacent free space in bitmap entries, and if
2294  * there is, migrate that space from the bitmaps to the extent.
2295  * Like this we get better chances of satisfying space allocation requests
2296  * because we attempt to satisfy them based on a single cache entry, and never
2297  * on 2 or more entries - even if the entries represent a contiguous free space
2298  * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2299  * ends).
2300  */
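/*
 * For example, a newly added extent entry for [8M, 12M) sitting next to a
 * bitmap with bits set for [12M, 13M) ends up as a single [8M, 13M) extent
 * entry, with those bits cleared from the bitmap.
 */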
2301 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2302                               struct btrfs_free_space *info,
2303                               bool update_stat)
2304 {
2305         /*
2306          * Only work with disconnected entries, as we can change their offset,
2307          * and they must be extent entries.
2308          */
2309         ASSERT(!info->bitmap);
2310         ASSERT(RB_EMPTY_NODE(&info->offset_index));
2311
2312         if (ctl->total_bitmaps > 0) {
2313                 bool stole_end;
2314                 bool stole_front = false;
2315
2316                 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2317                 if (ctl->total_bitmaps > 0)
2318                         stole_front = steal_from_bitmap_to_front(ctl, info,
2319                                                                  update_stat);
2320
2321                 if (stole_end || stole_front)
2322                         try_merge_free_space(ctl, info, update_stat);
2323         }
2324 }
2325
2326 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2327                            struct btrfs_free_space_ctl *ctl,
2328                            u64 offset, u64 bytes)
2329 {
2330         struct btrfs_free_space *info;
2331         int ret = 0;
2332
2333         info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2334         if (!info)
2335                 return -ENOMEM;
2336
2337         info->offset = offset;
2338         info->bytes = bytes;
2339         RB_CLEAR_NODE(&info->offset_index);
2340
2341         spin_lock(&ctl->tree_lock);
2342
2343         if (try_merge_free_space(ctl, info, true))
2344                 goto link;
2345
2346         /*
2347          * There was no extent directly to the left or right of this new
2348          * extent, so we know we're going to have to allocate a new extent;
2349          * before we do that, see if we need to drop this into a bitmap.
2350          */
2351         ret = insert_into_bitmap(ctl, info);
2352         if (ret < 0) {
2353                 goto out;
2354         } else if (ret) {
2355                 ret = 0;
2356                 goto out;
2357         }
2358 link:
2359         /*
2360          * Only steal free space from adjacent bitmaps if we're sure we're not
2361          * going to add the new free space to existing bitmap entries - because
2362          * that would mean unnecessary work that would be reverted. Therefore
2363          * attempt to steal space from bitmaps if we're adding an extent entry.
2364          */
2365         steal_from_bitmap(ctl, info, true);
2366
2367         ret = link_free_space(ctl, info);
2368         if (ret)
2369                 kmem_cache_free(btrfs_free_space_cachep, info);
2370 out:
2371         spin_unlock(&ctl->tree_lock);
2372
2373         if (ret) {
2374                 btrfs_crit(fs_info, "unable to add free space :%d", ret);
2375                 ASSERT(ret != -EEXIST);
2376         }
2377
2378         return ret;
2379 }
2380
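/*
 * Remove [offset, offset + bytes) from the free space cache.  The range
 * may span extent and bitmap entries; if it falls in the middle of an
 * extent entry, that entry is split: the front piece is re-linked here
 * and the tail beyond the removed range is re-added through
 * btrfs_add_free_space().
 */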
2381 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2382                             u64 offset, u64 bytes)
2383 {
2384         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2385         struct btrfs_free_space *info;
2386         int ret;
2387         bool re_search = false;
2388
2389         spin_lock(&ctl->tree_lock);
2390
2391 again:
2392         ret = 0;
2393         if (!bytes)
2394                 goto out_lock;
2395
2396         info = tree_search_offset(ctl, offset, 0, 0);
2397         if (!info) {
2398                 /*
2399                  * oops didn't find an extent that matched the space we wanted
2400                  * to remove, look for a bitmap instead
2401                  */
2402                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2403                                           1, 0);
2404                 if (!info) {
2405                         /*
2406                          * If we found a partial bit of our free space in a
2407                          * bitmap but then couldn't find the other part this may
2408                          * be a problem, so WARN about it.
2409                          */
2410                         WARN_ON(re_search);
2411                         goto out_lock;
2412                 }
2413         }
2414
2415         re_search = false;
2416         if (!info->bitmap) {
2417                 unlink_free_space(ctl, info);
2418                 if (offset == info->offset) {
2419                         u64 to_free = min(bytes, info->bytes);
2420
2421                         info->bytes -= to_free;
2422                         info->offset += to_free;
2423                         if (info->bytes) {
2424                                 ret = link_free_space(ctl, info);
2425                                 WARN_ON(ret);
2426                         } else {
2427                                 kmem_cache_free(btrfs_free_space_cachep, info);
2428                         }
2429
2430                         offset += to_free;
2431                         bytes -= to_free;
2432                         goto again;
2433                 } else {
2434                         u64 old_end = info->bytes + info->offset;
2435
2436                         info->bytes = offset - info->offset;
2437                         ret = link_free_space(ctl, info);
2438                         WARN_ON(ret);
2439                         if (ret)
2440                                 goto out_lock;
2441
2442                         /* Not enough bytes in this entry to satisfy us */
2443                         if (old_end < offset + bytes) {
2444                                 bytes -= old_end - offset;
2445                                 offset = old_end;
2446                                 goto again;
2447                         } else if (old_end == offset + bytes) {
2448                                 /* all done */
2449                                 goto out_lock;
2450                         }
2451                         spin_unlock(&ctl->tree_lock);
2452
2453                         ret = btrfs_add_free_space(block_group, offset + bytes,
2454                                                    old_end - (offset + bytes));
2455                         WARN_ON(ret);
2456                         goto out;
2457                 }
2458         }
2459
2460         ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2461         if (ret == -EAGAIN) {
2462                 re_search = true;
2463                 goto again;
2464         }
2465 out_lock:
2466         spin_unlock(&ctl->tree_lock);
2467 out:
2468         return ret;
2469 }
2470
2471 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2472                            u64 bytes)
2473 {
2474         struct btrfs_fs_info *fs_info = block_group->fs_info;
2475         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2476         struct btrfs_free_space *info;
2477         struct rb_node *n;
2478         int count = 0;
2479
2480         spin_lock(&ctl->tree_lock);
2481         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2482                 info = rb_entry(n, struct btrfs_free_space, offset_index);
2483                 if (info->bytes >= bytes && !block_group->ro)
2484                         count++;
2485                 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2486                            info->offset, info->bytes,
2487                        (info->bitmap) ? "yes" : "no");
2488         }
2489         spin_unlock(&ctl->tree_lock);
2490         btrfs_info(fs_info, "block group has cluster?: %s",
2491                list_empty(&block_group->cluster_list) ? "no" : "yes");
2492         btrfs_info(fs_info,
2493                    "%d blocks of free space at or bigger than bytes is", count);
2494 }
2495
2496 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2497 {
2498         struct btrfs_fs_info *fs_info = block_group->fs_info;
2499         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2500
2501         spin_lock_init(&ctl->tree_lock);
2502         ctl->unit = fs_info->sectorsize;
2503         ctl->start = block_group->key.objectid;
2504         ctl->private = block_group;
2505         ctl->op = &free_space_op;
2506         INIT_LIST_HEAD(&ctl->trimming_ranges);
2507         mutex_init(&ctl->cache_writeout_mutex);
2508
2509         /*
2510          * we only want to have 32k of ram per block group for keeping
2511          * track of free space, and if we pass 1/2 of that we want to
2512          * start converting things over to using bitmaps
2513          */
2514         ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2515 }
2516
2517 /*
2518  * for a given cluster, put all of its extents back into the free
2519  * space cache.  If the block group passed doesn't match the block group
2520  * pointed to by the cluster, someone else raced in and freed the
2521  * cluster already.  In that case, we just return without changing anything
2522  */
2523 static int
2524 __btrfs_return_cluster_to_free_space(
2525                              struct btrfs_block_group_cache *block_group,
2526                              struct btrfs_free_cluster *cluster)
2527 {
2528         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2529         struct btrfs_free_space *entry;
2530         struct rb_node *node;
2531
2532         spin_lock(&cluster->lock);
2533         if (cluster->block_group != block_group)
2534                 goto out;
2535
2536         cluster->block_group = NULL;
2537         cluster->window_start = 0;
2538         list_del_init(&cluster->block_group_list);
2539
2540         node = rb_first(&cluster->root);
2541         while (node) {
2542                 bool bitmap;
2543
2544                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2545                 node = rb_next(&entry->offset_index);
2546                 rb_erase(&entry->offset_index, &cluster->root);
2547                 RB_CLEAR_NODE(&entry->offset_index);
2548
2549                 bitmap = (entry->bitmap != NULL);
2550                 if (!bitmap) {
2551                         try_merge_free_space(ctl, entry, false);
2552                         steal_from_bitmap(ctl, entry, false);
2553                 }
2554                 tree_insert_offset(&ctl->free_space_offset,
2555                                    entry->offset, &entry->offset_index, bitmap);
2556         }
2557         cluster->root = RB_ROOT;
2558
2559 out:
2560         spin_unlock(&cluster->lock);
2561         btrfs_put_block_group(block_group);
2562         return 0;
2563 }
2564
2565 static void __btrfs_remove_free_space_cache_locked(
2566                                 struct btrfs_free_space_ctl *ctl)
2567 {
2568         struct btrfs_free_space *info;
2569         struct rb_node *node;
2570
2571         while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2572                 info = rb_entry(node, struct btrfs_free_space, offset_index);
2573                 if (!info->bitmap) {
2574                         unlink_free_space(ctl, info);
2575                         kmem_cache_free(btrfs_free_space_cachep, info);
2576                 } else {
2577                         free_bitmap(ctl, info);
2578                 }
2579
2580                 cond_resched_lock(&ctl->tree_lock);
2581         }
2582 }
2583
2584 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2585 {
2586         spin_lock(&ctl->tree_lock);
2587         __btrfs_remove_free_space_cache_locked(ctl);
2588         spin_unlock(&ctl->tree_lock);
2589 }
2590
2591 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
2592 {
2593         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2594         struct btrfs_free_cluster *cluster;
2595         struct list_head *head;
2596
2597         spin_lock(&ctl->tree_lock);
2598         while ((head = block_group->cluster_list.next) !=
2599                &block_group->cluster_list) {
2600                 cluster = list_entry(head, struct btrfs_free_cluster,
2601                                      block_group_list);
2602
2603                 WARN_ON(cluster->block_group != block_group);
2604                 __btrfs_return_cluster_to_free_space(block_group, cluster);
2605
2606                 cond_resched_lock(&ctl->tree_lock);
2607         }
2608         __btrfs_remove_free_space_cache_locked(ctl);
2609         spin_unlock(&ctl->tree_lock);
2610
2611 }
2612
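/*
 * Allocate 'bytes' (searching for bytes + empty_size) from the block
 * group's free space, aligned to the full stripe length.  If the aligned
 * offset lands inside an extent entry, the skipped-over gap at the front
 * (align_gap, align_gap_len) is handed back to the free space cache once
 * the tree lock has been dropped.
 */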
2613 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2614                                u64 offset, u64 bytes, u64 empty_size,
2615                                u64 *max_extent_size)
2616 {
2617         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2618         struct btrfs_free_space *entry = NULL;
2619         u64 bytes_search = bytes + empty_size;
2620         u64 ret = 0;
2621         u64 align_gap = 0;
2622         u64 align_gap_len = 0;
2623
2624         spin_lock(&ctl->tree_lock);
2625         entry = find_free_space(ctl, &offset, &bytes_search,
2626                                 block_group->full_stripe_len, max_extent_size);
2627         if (!entry)
2628                 goto out;
2629
2630         ret = offset;
2631         if (entry->bitmap) {
2632                 bitmap_clear_bits(ctl, entry, offset, bytes);
2633                 if (!entry->bytes)
2634                         free_bitmap(ctl, entry);
2635         } else {
2636                 unlink_free_space(ctl, entry);
2637                 align_gap_len = offset - entry->offset;
2638                 align_gap = entry->offset;
2639
2640                 entry->offset = offset + bytes;
2641                 WARN_ON(entry->bytes < bytes + align_gap_len);
2642
2643                 entry->bytes -= bytes + align_gap_len;
2644                 if (!entry->bytes)
2645                         kmem_cache_free(btrfs_free_space_cachep, entry);
2646                 else
2647                         link_free_space(ctl, entry);
2648         }
2649 out:
2650         spin_unlock(&ctl->tree_lock);
2651
2652         if (align_gap_len)
2653                 __btrfs_add_free_space(block_group->fs_info, ctl,
2654                                        align_gap, align_gap_len);
2655         return ret;
2656 }
2657
2658 /*
2659  * given a cluster, put all of its extents back into the free space
2660  * cache.  If a block group is passed, this function will only free
2661  * a cluster that belongs to the passed block group.
2662  *
2663  * Otherwise, it'll get a reference on the block group pointed to by the
2664  * cluster and remove the cluster from it.
2665  */
2666 int btrfs_return_cluster_to_free_space(
2667                                struct btrfs_block_group_cache *block_group,
2668                                struct btrfs_free_cluster *cluster)
2669 {
2670         struct btrfs_free_space_ctl *ctl;
2671         int ret;
2672
2673         /* first, get a safe pointer to the block group */
2674         spin_lock(&cluster->lock);
2675         if (!block_group) {
2676                 block_group = cluster->block_group;
2677                 if (!block_group) {
2678                         spin_unlock(&cluster->lock);
2679                         return 0;
2680                 }
2681         } else if (cluster->block_group != block_group) {
2682                 /* someone else has already freed it, don't redo their work */
2683                 spin_unlock(&cluster->lock);
2684                 return 0;
2685         }
2686         atomic_inc(&block_group->count);
2687         spin_unlock(&cluster->lock);
2688
2689         ctl = block_group->free_space_ctl;
2690
2691         /* now return any extents the cluster had on it */
2692         spin_lock(&ctl->tree_lock);
2693         ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2694         spin_unlock(&ctl->tree_lock);
2695
2696         /* finally drop our ref */
2697         btrfs_put_block_group(block_group);
2698         return ret;
2699 }
2700
2701 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2702                                    struct btrfs_free_cluster *cluster,
2703                                    struct btrfs_free_space *entry,
2704                                    u64 bytes, u64 min_start,
2705                                    u64 *max_extent_size)
2706 {
2707         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2708         int err;
2709         u64 search_start = cluster->window_start;
2710         u64 search_bytes = bytes;
2711         u64 ret = 0;
2712
2713         search_start = min_start;
2714         search_bytes = bytes;
2715
2716         err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2717         if (err) {
2718                 *max_extent_size = max(get_max_extent_size(entry),
2719                                        *max_extent_size);
2720                 return 0;
2721         }
2722
2723         ret = search_start;
2724         __bitmap_clear_bits(ctl, entry, ret, bytes);
2725
2726         return ret;
2727 }
2728
2729 /*
2730  * given a cluster, try to allocate 'bytes' from it, returns 0
2731  * if it couldn't find anything suitably large, or a logical disk offset
2732  * if things worked out
2733  */
2734 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2735                              struct btrfs_free_cluster *cluster, u64 bytes,
2736                              u64 min_start, u64 *max_extent_size)
2737 {
2738         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2739         struct btrfs_free_space *entry = NULL;
2740         struct rb_node *node;
2741         u64 ret = 0;
2742
2743         spin_lock(&cluster->lock);
2744         if (bytes > cluster->max_size)
2745                 goto out;
2746
2747         if (cluster->block_group != block_group)
2748                 goto out;
2749
2750         node = rb_first(&cluster->root);
2751         if (!node)
2752                 goto out;
2753
2754         entry = rb_entry(node, struct btrfs_free_space, offset_index);
2755         while (1) {
2756                 if (entry->bytes < bytes)
2757                         *max_extent_size = max(get_max_extent_size(entry),
2758                                                *max_extent_size);
2759
2760                 if (entry->bytes < bytes ||
2761                     (!entry->bitmap && entry->offset < min_start)) {
2762                         node = rb_next(&entry->offset_index);
2763                         if (!node)
2764                                 break;
2765                         entry = rb_entry(node, struct btrfs_free_space,
2766                                          offset_index);
2767                         continue;
2768                 }
2769
2770                 if (entry->bitmap) {
2771                         ret = btrfs_alloc_from_bitmap(block_group,
2772                                                       cluster, entry, bytes,
2773                                                       cluster->window_start,
2774                                                       max_extent_size);
2775                         if (ret == 0) {
2776                                 node = rb_next(&entry->offset_index);
2777                                 if (!node)
2778                                         break;
2779                                 entry = rb_entry(node, struct btrfs_free_space,
2780                                                  offset_index);
2781                                 continue;
2782                         }
2783                         cluster->window_start += bytes;
2784                 } else {
2785                         ret = entry->offset;
2786
2787                         entry->offset += bytes;
2788                         entry->bytes -= bytes;
2789                 }
2790
2791                 if (entry->bytes == 0)
2792                         rb_erase(&entry->offset_index, &cluster->root);
2793                 break;
2794         }
2795 out:
2796         spin_unlock(&cluster->lock);
2797
2798         if (!ret)
2799                 return 0;
2800
2801         spin_lock(&ctl->tree_lock);
2802
2803         ctl->free_space -= bytes;
2804         if (entry->bytes == 0) {
2805                 ctl->free_extents--;
2806                 if (entry->bitmap) {
2807                         kfree(entry->bitmap);
2808                         ctl->total_bitmaps--;
2809                         ctl->op->recalc_thresholds(ctl);
2810                 }
2811                 kmem_cache_free(btrfs_free_space_cachep, entry);
2812         }
2813
2814         spin_unlock(&ctl->tree_lock);
2815
2816         return ret;
2817 }
2818
2819 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2820                                 struct btrfs_free_space *entry,
2821                                 struct btrfs_free_cluster *cluster,
2822                                 u64 offset, u64 bytes,
2823                                 u64 cont1_bytes, u64 min_bytes)
2824 {
2825         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2826         unsigned long next_zero;
2827         unsigned long i;
2828         unsigned long want_bits;
2829         unsigned long min_bits;
2830         unsigned long found_bits;
2831         unsigned long max_bits = 0;
2832         unsigned long start = 0;
2833         unsigned long total_found = 0;
2834         int ret;
2835
2836         i = offset_to_bit(entry->offset, ctl->unit,
2837                           max_t(u64, offset, entry->offset));
2838         want_bits = bytes_to_bits(bytes, ctl->unit);
2839         min_bits = bytes_to_bits(min_bytes, ctl->unit);
2840
2841         /*
2842          * Don't bother looking for a cluster in this bitmap if it's heavily
2843          * fragmented.
2844          */
2845         if (entry->max_extent_size &&
2846             entry->max_extent_size < cont1_bytes)
2847                 return -ENOSPC;
2848 again:
2849         found_bits = 0;
2850         for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2851                 next_zero = find_next_zero_bit(entry->bitmap,
2852                                                BITS_PER_BITMAP, i);
2853                 if (next_zero - i >= min_bits) {
2854                         found_bits = next_zero - i;
2855                         if (found_bits > max_bits)
2856                                 max_bits = found_bits;
2857                         break;
2858                 }
2859                 if (next_zero - i > max_bits)
2860                         max_bits = next_zero - i;
2861                 i = next_zero;
2862         }
2863
2864         if (!found_bits) {
2865                 entry->max_extent_size = (u64)max_bits * ctl->unit;
2866                 return -ENOSPC;
2867         }
2868
2869         if (!total_found) {
2870                 start = i;
2871                 cluster->max_size = 0;
2872         }
2873
2874         total_found += found_bits;
2875
2876         if (cluster->max_size < found_bits * ctl->unit)
2877                 cluster->max_size = found_bits * ctl->unit;
2878
2879         if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2880                 i = next_zero + 1;
2881                 goto again;
2882         }
2883
2884         cluster->window_start = start * ctl->unit + entry->offset;
2885         rb_erase(&entry->offset_index, &ctl->free_space_offset);
2886         ret = tree_insert_offset(&cluster->root, entry->offset,
2887                                  &entry->offset_index, 1);
2888         ASSERT(!ret); /* -EEXIST; Logic error */
2889
2890         trace_btrfs_setup_cluster(block_group, cluster,
2891                                   total_found * ctl->unit, 1);
2892         return 0;
2893 }
2894
2895 /*
2896  * This searches the block group for just extents to fill the cluster with.
2897  * Try to find a cluster with at least bytes total bytes, at least one
2898  * extent of cont1_bytes, and other extents of at least min_bytes.
2899  */
2900 static noinline int
2901 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2902                         struct btrfs_free_cluster *cluster,
2903                         struct list_head *bitmaps, u64 offset, u64 bytes,
2904                         u64 cont1_bytes, u64 min_bytes)
2905 {
2906         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2907         struct btrfs_free_space *first = NULL;
2908         struct btrfs_free_space *entry = NULL;
2909         struct btrfs_free_space *last;
2910         struct rb_node *node;
2911         u64 window_free;
2912         u64 max_extent;
2913         u64 total_size = 0;
2914
2915         entry = tree_search_offset(ctl, offset, 0, 1);
2916         if (!entry)
2917                 return -ENOSPC;
2918
2919         /*
2920          * We don't want bitmaps, so just move along until we find a normal
2921          * extent entry.
2922          */
2923         while (entry->bitmap || entry->bytes < min_bytes) {
2924                 if (entry->bitmap && list_empty(&entry->list))
2925                         list_add_tail(&entry->list, bitmaps);
2926                 node = rb_next(&entry->offset_index);
2927                 if (!node)
2928                         return -ENOSPC;
2929                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2930         }
2931
2932         window_free = entry->bytes;
2933         max_extent = entry->bytes;
2934         first = entry;
2935         last = entry;
2936
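        /*
         * Walk the remaining entries, growing the window total and tracking
         * the largest single extent, while deferring any bitmaps we pass to
         * the caller's list for setup_cluster_bitmap().
         */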
2937         for (node = rb_next(&entry->offset_index); node;
2938              node = rb_next(&entry->offset_index)) {
2939                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2940
2941                 if (entry->bitmap) {
2942                         if (list_empty(&entry->list))
2943                                 list_add_tail(&entry->list, bitmaps);
2944                         continue;
2945                 }
2946
2947                 if (entry->bytes < min_bytes)
2948                         continue;
2949
2950                 last = entry;
2951                 window_free += entry->bytes;
2952                 if (entry->bytes > max_extent)
2953                         max_extent = entry->bytes;
2954         }
2955
2956         if (window_free < bytes || max_extent < cont1_bytes)
2957                 return -ENOSPC;
2958
2959         cluster->window_start = first->offset;
2960
2961         node = &first->offset_index;
2962
2963         /*
2964          * Now that we've found our entries, pull them out of the free space
2965          * cache and put them into the cluster rbtree.
2966          */
2967         do {
2968                 int ret;
2969
2970                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2971                 node = rb_next(&entry->offset_index);
2972                 if (entry->bitmap || entry->bytes < min_bytes)
2973                         continue;
2974
2975                 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2976                 ret = tree_insert_offset(&cluster->root, entry->offset,
2977                                          &entry->offset_index, 0);
2978                 total_size += entry->bytes;
2979                 ASSERT(!ret); /* -EEXIST; Logic error */
2980         } while (node && entry != last);
2981
2982         cluster->max_size = max_extent;
2983         trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2984         return 0;
2985 }
2986
2987 /*
2988  * This specifically looks for bitmaps that may work in the cluster; we assume
2989  * that we have already failed to find extents that will work.
2990  */
2991 static noinline int
2992 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2993                      struct btrfs_free_cluster *cluster,
2994                      struct list_head *bitmaps, u64 offset, u64 bytes,
2995                      u64 cont1_bytes, u64 min_bytes)
2996 {
2997         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2998         struct btrfs_free_space *entry = NULL;
2999         int ret = -ENOSPC;
3000         u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3001
3002         if (ctl->total_bitmaps == 0)
3003                 return -ENOSPC;
3004
3005         /*
3006          * The bitmap that covers offset won't be in the list unless offset
3007          * is just its start offset.
3008          */
3009         if (!list_empty(bitmaps))
3010                 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3011
3012         if (!entry || entry->offset != bitmap_offset) {
3013                 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3014                 if (entry && list_empty(&entry->list))
3015                         list_add(&entry->list, bitmaps);
3016         }
3017
3018         list_for_each_entry(entry, bitmaps, list) {
3019                 if (entry->bytes < bytes)
3020                         continue;
3021                 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3022                                            bytes, cont1_bytes, min_bytes);
3023                 if (!ret)
3024                         return 0;
3025         }
3026
3027         /*
3028          * The bitmaps list has all the bitmaps that record free space
3029          * starting after offset, so no more search is required.
3030          */
3031         return -ENOSPC;
3032 }
3033
3034 /*
3035  * Here we try to find a cluster of blocks in a block group.  The goal
3036  * is to find at least bytes + empty_size bytes of space.
3037  * We might not find it all in one contiguous area.
3038  *
3039  * Returns zero and sets up the cluster if things worked out, otherwise
3040  * it returns -ENOSPC.
3041  */
3042 int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info,
3043                              struct btrfs_block_group_cache *block_group,
3044                              struct btrfs_free_cluster *cluster,
3045                              u64 offset, u64 bytes, u64 empty_size)
3046 {
3047         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3048         struct btrfs_free_space *entry, *tmp;
3049         LIST_HEAD(bitmaps);
3050         u64 min_bytes;
3051         u64 cont1_bytes;
3052         int ret;
3053
3054         /*
3055          * Choose the minimum extent size we'll require for this
3056          * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3057          * For metadata, allow allocations with smaller extents.  For
3058          * data, keep it dense.
3059          */
3060         if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3061                 cont1_bytes = min_bytes = bytes + empty_size;
3062         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3063                 cont1_bytes = bytes;
3064                 min_bytes = fs_info->sectorsize;
3065         } else {
3066                 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3067                 min_bytes = fs_info->sectorsize;
3068         }
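        /*
         * For example, a 128K data allocation with a 64K empty_size gives
         * cont1_bytes = max(128K, (128K + 64K) >> 2) = 128K and
         * min_bytes = sectorsize.
         */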
3069
3070         spin_lock(&ctl->tree_lock);
3071
3072         /*
3073          * If we know we don't have enough space to make a cluster, don't even
3074          * bother doing all the work to try and find one.
3075          */
3076         if (ctl->free_space < bytes) {
3077                 spin_unlock(&ctl->tree_lock);
3078                 return -ENOSPC;
3079         }
3080
3081         spin_lock(&cluster->lock);
3082
3083         /* someone already found a cluster, hooray */
3084         if (cluster->block_group) {
3085                 ret = 0;
3086                 goto out;
3087         }
3088
3089         trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3090                                  min_bytes);
3091
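        /*
         * Prefer building the cluster from plain extent entries; only fall
         * back to bitmap entries if that fails.
         */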
3092         ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3093                                       bytes + empty_size,
3094                                       cont1_bytes, min_bytes);
3095         if (ret)
3096                 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3097                                            offset, bytes + empty_size,
3098                                            cont1_bytes, min_bytes);
3099
3100         /* Clear our temporary list */
3101         list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3102                 list_del_init(&entry->list);
3103
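        /*
         * On success, take a reference on the block group and record the
         * association between the cluster and the group.
         */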
3104         if (!ret) {
3105                 atomic_inc(&block_group->count);
3106                 list_add_tail(&cluster->block_group_list,
3107                               &block_group->cluster_list);
3108                 cluster->block_group = block_group;
3109         } else {
3110                 trace_btrfs_failed_cluster_setup(block_group);
3111         }
3112 out:
3113         spin_unlock(&cluster->lock);
3114         spin_unlock(&ctl->tree_lock);
3115
3116         return ret;
3117 }
3118
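/*
 * A minimal usage sketch of the cluster API above (search_start, num_bytes,
 * empty_size and max_extent_size are placeholder variables; the real caller
 * in the extent allocator also takes cluster->refill_lock and handles
 * fallback paths):
 *
 *      btrfs_init_free_cluster(cluster);
 *      ret = btrfs_find_space_cluster(fs_info, block_group, cluster,
 *                                     search_start, num_bytes, empty_size);
 *      if (!ret)
 *              offset = btrfs_alloc_from_cluster(block_group, cluster,
 *                                                num_bytes, search_start,
 *                                                &max_extent_size);
 */
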
3119 /*
3120  * Simple helper to initialize a cluster to its empty state
3121  */
3122 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3123 {
3124         spin_lock_init(&cluster->lock);
3125         spin_lock_init(&cluster->refill_lock);
3126         cluster->root = RB_ROOT;
3127         cluster->max_size = 0;