1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright 2012 Google, Inc.
4  *
5  * Foreground allocator code: allocate buckets from freelist, and allocate in
6  * sector granularity from writepoints.
7  *
8  * bch2_bucket_alloc() allocates a single bucket from a specific device.
9  *
10  * bch2_bucket_alloc_set_trans() allocates one or more buckets from different devices
11  * in a given filesystem.
12  */
13
14 #include "bcachefs.h"
15 #include "alloc_background.h"
16 #include "alloc_foreground.h"
17 #include "backpointers.h"
18 #include "btree_iter.h"
19 #include "btree_update.h"
20 #include "btree_gc.h"
21 #include "buckets.h"
22 #include "buckets_waiting_for_journal.h"
23 #include "clock.h"
24 #include "debug.h"
25 #include "disk_groups.h"
26 #include "ec.h"
27 #include "error.h"
28 #include "io_write.h"
29 #include "journal.h"
30 #include "movinggc.h"
31 #include "nocow_locking.h"
32 #include "trace.h"
33
34 #include <linux/math64.h>
35 #include <linux/rculist.h>
36 #include <linux/rcupdate.h>
37
38 static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
39                                            struct mutex *lock)
40 {
41         if (!mutex_trylock(lock)) {
42                 bch2_trans_unlock(trans);
43                 mutex_lock(lock);
44         }
45 }
46
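/*
 * Human-readable watermark names, generated from the BCH_WATERMARKS() x-macro
 * and indexed by enum bch_watermark (used by the bucket_alloc tracepoints
 * below).
 */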
47 const char * const bch2_watermarks[] = {
48 #define x(t) #t,
49         BCH_WATERMARKS()
50 #undef x
51         NULL
52 };
53
54 /*
55  * Open buckets represent buckets that are currently being allocated from.  They
56  * serve two purposes:
57  *
58  *  - They track buckets that have been partially allocated, allowing for
59  *    sub-bucket sized allocations - they're used by the sector allocator below
60  *
61  *  - They provide a reference to the buckets they own that mark and sweep GC
62  *    can find, until the new allocation has a pointer to it inserted into the
63  *    btree
64  *
65  * When allocating some space with the sector allocator, the allocation comes
66  * with a reference to an open bucket - the caller is required to put that
67  * reference _after_ doing the index update that makes its allocation reachable.
68  */
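
/*
 * Illustrative sketch of that lifecycle (not a real caller: the write itself
 * and the btree update are elided, and error handling is abbreviated):
 *
 *	struct open_bucket *ob =
 *		bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl);
 *
 *	if (!IS_ERR(ob)) {
 *		... write to the bucket, insert the new pointer into the
 *		... btree so the allocation is reachable, and only then:
 *
 *		bch2_open_bucket_put(c, ob);
 *	}
 */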
69
70 void bch2_reset_alloc_cursors(struct bch_fs *c)
71 {
72         rcu_read_lock();
73         for_each_member_device_rcu(c, ca, NULL)
74                 ca->alloc_cursor = 0;
75         rcu_read_unlock();
76 }
77
78 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
79 {
80         open_bucket_idx_t idx = ob - c->open_buckets;
81         open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
82
83         ob->hash = *slot;
84         *slot = idx;
85 }
86
87 static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
88 {
89         open_bucket_idx_t idx = ob - c->open_buckets;
90         open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
91
92         while (*slot != idx) {
93                 BUG_ON(!*slot);
94                 slot = &c->open_buckets[*slot].hash;
95         }
96
97         *slot = ob->hash;
98         ob->hash = 0;
99 }
100
101 void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
102 {
103         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
104
105         if (ob->ec) {
106                 ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
107                 return;
108         }
109
110         percpu_down_read(&c->mark_lock);
111         spin_lock(&ob->lock);
112
113         ob->valid = false;
114         ob->data_type = 0;
115
116         spin_unlock(&ob->lock);
117         percpu_up_read(&c->mark_lock);
118
119         spin_lock(&c->freelist_lock);
120         bch2_open_bucket_hash_remove(c, ob);
121
122         ob->freelist = c->open_buckets_freelist;
123         c->open_buckets_freelist = ob - c->open_buckets;
124
125         c->open_buckets_nr_free++;
126         ca->nr_open_buckets--;
127         spin_unlock(&c->freelist_lock);
128
129         closure_wake_up(&c->open_buckets_wait);
130 }
131
132 void bch2_open_bucket_write_error(struct bch_fs *c,
133                                   struct open_buckets *obs,
134                                   unsigned dev)
135 {
136         struct open_bucket *ob;
137         unsigned i;
138
139         open_bucket_for_each(c, obs, ob, i)
140                 if (ob->dev == dev && ob->ec)
141                         bch2_ec_bucket_cancel(c, ob);
142 }
143
144 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
145 {
146         struct open_bucket *ob;
147
148         BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
149
150         ob = c->open_buckets + c->open_buckets_freelist;
151         c->open_buckets_freelist = ob->freelist;
152         atomic_set(&ob->pin, 1);
153         ob->data_type = 0;
154
155         c->open_buckets_nr_free--;
156         return ob;
157 }
158
159 static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
160 {
161         BUG_ON(c->open_buckets_partial_nr >=
162                ARRAY_SIZE(c->open_buckets_partial));
163
164         spin_lock(&c->freelist_lock);
165         ob->on_partial_list = true;
166         c->open_buckets_partial[c->open_buckets_partial_nr++] =
167                 ob - c->open_buckets;
168         spin_unlock(&c->freelist_lock);
169
170         closure_wake_up(&c->open_buckets_wait);
171         closure_wake_up(&c->freelist_wait);
172 }
173
174 /* _only_ for allocating the journal on a new device: */
175 long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
176 {
177         while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
178                 u64 b = ca->new_fs_bucket_idx++;
179
180                 if (!is_superblock_bucket(ca, b) &&
181                     (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
182                         return b;
183         }
184
185         return -1;
186 }
187
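/*
 * How much of the open_buckets array a given watermark must leave in reserve:
 * __try_alloc_bucket() fails with -BCH_ERR_open_buckets_empty once
 * open_buckets_nr_free drops to this value.  E.g. a BCH_WATERMARK_reclaim
 * allocation may use all but 1/6 of the array, while a default allocation
 * fails once only half of it remains free - the remainder is kept for the
 * allocations that free up space.
 */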
188 static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
189 {
190         switch (watermark) {
191         case BCH_WATERMARK_interior_updates:
192                 return 0;
193         case BCH_WATERMARK_reclaim:
194                 return OPEN_BUCKETS_COUNT / 6;
195         case BCH_WATERMARK_btree:
196         case BCH_WATERMARK_btree_copygc:
197                 return OPEN_BUCKETS_COUNT / 4;
198         case BCH_WATERMARK_copygc:
199                 return OPEN_BUCKETS_COUNT / 3;
200         default:
201                 return OPEN_BUCKETS_COUNT / 2;
202         }
203 }
204
205 static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
206                                               u64 bucket,
207                                               enum bch_watermark watermark,
208                                               const struct bch_alloc_v4 *a,
209                                               struct bucket_alloc_state *s,
210                                               struct closure *cl)
211 {
212         struct open_bucket *ob;
213
214         if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
215                 s->skipped_nouse++;
216                 return NULL;
217         }
218
219         if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
220                 s->skipped_open++;
221                 return NULL;
222         }
223
224         if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
225                         c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
226                 s->skipped_need_journal_commit++;
227                 return NULL;
228         }
229
230         if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
231                 s->skipped_nocow++;
232                 return NULL;
233         }
234
235         spin_lock(&c->freelist_lock);
236
237         if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
238                 if (cl)
239                         closure_wait(&c->open_buckets_wait, cl);
240
241                 track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
242                 spin_unlock(&c->freelist_lock);
243                 return ERR_PTR(-BCH_ERR_open_buckets_empty);
244         }
245
246         /* Recheck under lock: */
247         if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
248                 spin_unlock(&c->freelist_lock);
249                 s->skipped_open++;
250                 return NULL;
251         }
252
253         ob = bch2_open_bucket_alloc(c);
254
255         spin_lock(&ob->lock);
256
257         ob->valid       = true;
258         ob->sectors_free = ca->mi.bucket_size;
259         ob->dev         = ca->dev_idx;
260         ob->gen         = a->gen;
261         ob->bucket      = bucket;
262         spin_unlock(&ob->lock);
263
264         ca->nr_open_buckets++;
265         bch2_open_bucket_hash_add(c, ob);
266
267         track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
268         track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
269
270         spin_unlock(&c->freelist_lock);
271         return ob;
272 }
273
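/*
 * Keys in the freespace btree encode both the bucket and its generation bits
 * in their offset: the low 56 bits are the bucket number and the high 8 bits
 * are genbits (cf. alloc_freespace_genbits()), which is why free_entry is
 * split apart below.
 */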
274 static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
275                                             enum bch_watermark watermark, u64 free_entry,
276                                             struct bucket_alloc_state *s,
277                                             struct bkey_s_c freespace_k,
278                                             struct closure *cl)
279 {
280         struct bch_fs *c = trans->c;
281         struct btree_iter iter = { NULL };
282         struct bkey_s_c k;
283         struct open_bucket *ob;
284         struct bch_alloc_v4 a_convert;
285         const struct bch_alloc_v4 *a;
286         u64 b = free_entry & ~(~0ULL << 56);
287         unsigned genbits = free_entry >> 56;
288         struct printbuf buf = PRINTBUF;
289         int ret;
290
291         if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
292                 prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
293                        "  freespace key ",
294                         ca->mi.first_bucket, ca->mi.nbuckets);
295                 bch2_bkey_val_to_text(&buf, c, freespace_k);
296                 bch2_trans_inconsistent(trans, "%s", buf.buf);
297                 ob = ERR_PTR(-EIO);
298                 goto err;
299         }
300
301         k = bch2_bkey_get_iter(trans, &iter,
302                                BTREE_ID_alloc, POS(ca->dev_idx, b),
303                                BTREE_ITER_CACHED);
304         ret = bkey_err(k);
305         if (ret) {
306                 ob = ERR_PTR(ret);
307                 goto err;
308         }
309
310         a = bch2_alloc_to_v4(k, &a_convert);
311
312         if (a->data_type != BCH_DATA_free) {
313                 if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
314                         ob = NULL;
315                         goto err;
316                 }
317
318                 prt_printf(&buf, "non free bucket in freespace btree\n"
319                        "  freespace key ");
320                 bch2_bkey_val_to_text(&buf, c, freespace_k);
321                 prt_printf(&buf, "\n  ");
322                 bch2_bkey_val_to_text(&buf, c, k);
323                 bch2_trans_inconsistent(trans, "%s", buf.buf);
324                 ob = ERR_PTR(-EIO);
325                 goto err;
326         }
327
328         if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
329             c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
330                 prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
331                        "  freespace key ",
332                        genbits, alloc_freespace_genbits(*a) >> 56);
333                 bch2_bkey_val_to_text(&buf, c, freespace_k);
334                 prt_printf(&buf, "\n  ");
335                 bch2_bkey_val_to_text(&buf, c, k);
336                 bch2_trans_inconsistent(trans, "%s", buf.buf);
337                 ob = ERR_PTR(-EIO);
338                 goto err;
339         }
340
341         if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
342                 struct bch_backpointer bp;
343                 struct bpos bp_pos = POS_MIN;
344
345                 ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
346                                                 &bp_pos, &bp,
347                                                 BTREE_ITER_NOPRESERVE);
348                 if (ret) {
349                         ob = ERR_PTR(ret);
350                         goto err;
351                 }
352
353                 if (!bkey_eq(bp_pos, POS_MAX)) {
354                         /*
355                          * Bucket may have data in it - we don't call
356                          * bch2_trans_inconsistent() because fsck hasn't
357                          * finished yet
358                          */
359                         ob = NULL;
360                         goto err;
361                 }
362         }
363
364         ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
365         if (!ob)
366                 set_btree_iter_dontneed(&iter);
367 err:
368         if (iter.path)
369                 set_btree_iter_dontneed(&iter);
370         bch2_trans_iter_exit(trans, &iter);
371         printbuf_exit(&buf);
372         return ob;
373 }
374
375 /*
376  * This path is for before the freespace btree is initialized:
377  *
378  * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
379  * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
380  */
381 static noinline struct open_bucket *
382 bch2_bucket_alloc_early(struct btree_trans *trans,
383                         struct bch_dev *ca,
384                         enum bch_watermark watermark,
385                         struct bucket_alloc_state *s,
386                         struct closure *cl)
387 {
388         struct btree_iter iter, citer;
389         struct bkey_s_c k, ck;
390         struct open_bucket *ob = NULL;
391         u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
392         u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
393         u64 alloc_cursor = alloc_start;
394         int ret;
395
396         /*
397          * Scan with an uncached iterator to avoid polluting the key cache. An
398          * uncached iter will return a cached key if one exists, but if not
399          * there is no other underlying protection for the associated key cache
400          * slot. To avoid racing bucket allocations, look up the cached key slot
401          * of any likely allocation candidate before attempting to proceed with
402          * the allocation. This provides proper exclusion on the associated
403          * bucket.
404          */
405 again:
406         for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
407                            BTREE_ITER_SLOTS, k, ret) {
408                 struct bch_alloc_v4 a_convert;
409                 const struct bch_alloc_v4 *a;
410
411                 if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
412                         break;
413
414                 if (ca->new_fs_bucket_idx &&
415                     is_superblock_bucket(ca, k.k->p.offset))
416                         continue;
417
418                 a = bch2_alloc_to_v4(k, &a_convert);
419                 if (a->data_type != BCH_DATA_free)
420                         continue;
421
422                 /* now check the cached key to serialize concurrent allocs of the bucket */
423                 ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
424                 ret = bkey_err(ck);
425                 if (ret)
426                         break;
427
428                 a = bch2_alloc_to_v4(ck, &a_convert);
429                 if (a->data_type != BCH_DATA_free)
430                         goto next;
431
432                 s->buckets_seen++;
433
434                 ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
435 next:
436                 set_btree_iter_dontneed(&citer);
437                 bch2_trans_iter_exit(trans, &citer);
438                 if (ob)
439                         break;
440         }
441         bch2_trans_iter_exit(trans, &iter);
442
443         alloc_cursor = iter.pos.offset;
444         ca->alloc_cursor = alloc_cursor;
445
446         if (!ob && ret)
447                 ob = ERR_PTR(ret);
448
449         if (!ob && alloc_start > first_bucket) {
450                 alloc_cursor = alloc_start = first_bucket;
451                 goto again;
452         }
453
454         return ob;
455 }
456
457 static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
458                                                    struct bch_dev *ca,
459                                                    enum bch_watermark watermark,
460                                                    struct bucket_alloc_state *s,
461                                                    struct closure *cl)
462 {
463         struct btree_iter iter;
464         struct bkey_s_c k;
465         struct open_bucket *ob = NULL;
466         u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
467         u64 alloc_cursor = alloc_start;
468         int ret;
469
470         BUG_ON(ca->new_fs_bucket_idx);
471 again:
472         for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
473                                      POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
474                 if (k.k->p.inode != ca->dev_idx)
475                         break;
476
477                 for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
478                      alloc_cursor < k.k->p.offset;
479                      alloc_cursor++) {
480                         ret = btree_trans_too_many_iters(trans);
481                         if (ret) {
482                                 ob = ERR_PTR(ret);
483                                 break;
484                         }
485
486                         s->buckets_seen++;
487
488                         ob = try_alloc_bucket(trans, ca, watermark,
489                                               alloc_cursor, s, k, cl);
490                         if (ob) {
491                                 set_btree_iter_dontneed(&iter);
492                                 break;
493                         }
494                 }
495
496                 if (ob || ret)
497                         break;
498         }
499         bch2_trans_iter_exit(trans, &iter);
500
501         ca->alloc_cursor = alloc_cursor;
502
503         if (!ob && ret)
504                 ob = ERR_PTR(ret);
505
506         if (!ob && alloc_start > ca->mi.first_bucket) {
507                 alloc_cursor = alloc_start = ca->mi.first_bucket;
508                 goto again;
509         }
510
511         return ob;
512 }
513
514 /**
515  * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
516  * @trans:      transaction object
517  * @ca:         device to allocate from
518  * @watermark:  how important is this allocation?
519  * @cl:         if not NULL, closure to be used to wait if buckets not available
520  * @usage:      the current device usage is also returned here
521  *
522  * Returns:     an open_bucket on success, or an ERR_PTR() on failure.
523  */
524 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
525                                       struct bch_dev *ca,
526                                       enum bch_watermark watermark,
527                                       struct closure *cl,
528                                       struct bch_dev_usage *usage)
529 {
530         struct bch_fs *c = trans->c;
531         struct open_bucket *ob = NULL;
532         bool freespace = READ_ONCE(ca->mi.freespace_initialized);
533         u64 avail;
534         struct bucket_alloc_state s = { 0 };
535         bool waiting = false;
536 again:
537         bch2_dev_usage_read_fast(ca, usage);
538         avail = dev_buckets_free(ca, *usage, watermark);
539
540         if (usage->d[BCH_DATA_need_discard].buckets > avail)
541                 bch2_do_discards(c);
542
543         if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
544                 bch2_do_gc_gens(c);
545
546         if (should_invalidate_buckets(ca, *usage))
547                 bch2_do_invalidates(c);
548
549         if (!avail) {
550                 if (cl && !waiting) {
551                         closure_wait(&c->freelist_wait, cl);
552                         waiting = true;
553                         goto again;
554                 }
555
556                 track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
557
558                 ob = ERR_PTR(-BCH_ERR_freelist_empty);
559                 goto err;
560         }
561
562         if (waiting)
563                 closure_wake_up(&c->freelist_wait);
564 alloc:
565         ob = likely(freespace)
566                 ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
567                 : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
568
569         if (s.skipped_need_journal_commit * 2 > avail)
570                 bch2_journal_flush_async(&c->journal, NULL);
571
572         if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
573                 freespace = false;
574                 goto alloc;
575         }
576 err:
577         if (!ob)
578                 ob = ERR_PTR(-BCH_ERR_no_buckets_found);
579
580         if (!IS_ERR(ob))
581                 trace_and_count(c, bucket_alloc, ca,
582                                 bch2_watermarks[watermark],
583                                 ob->bucket,
584                                 usage->d[BCH_DATA_free].buckets,
585                                 avail,
586                                 bch2_copygc_wait_amount(c),
587                                 c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
588                                 &s,
589                                 cl == NULL,
590                                 "");
591         else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
592                 trace_and_count(c, bucket_alloc_fail, ca,
593                                 bch2_watermarks[watermark],
594                                 0,
595                                 usage->d[BCH_DATA_free].buckets,
596                                 avail,
597                                 bch2_copygc_wait_amount(c),
598                                 c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
599                                 &s,
600                                 cl == NULL,
601                                 bch2_err_str(PTR_ERR(ob)));
602
603         return ob;
604 }
605
606 struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
607                                       enum bch_watermark watermark,
608                                       struct closure *cl)
609 {
610         struct bch_dev_usage usage;
611         struct open_bucket *ob;
612
613         bch2_trans_do(c, NULL, NULL, 0,
614                       PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
615                                                         cl, &usage)));
616         return ob;
617 }
618
619 static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
620                             unsigned l, unsigned r)
621 {
622         return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
623                 (stripe->next_alloc[l] < stripe->next_alloc[r]));
624 }
625
626 #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
627
628 struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
629                                           struct dev_stripe_state *stripe,
630                                           struct bch_devs_mask *devs)
631 {
632         struct dev_alloc_list ret = { .nr = 0 };
633         unsigned i;
634
635         for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
636                 ret.devs[ret.nr++] = i;
637
638         bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
639         return ret;
640 }
641
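/*
 * Stripe weighting, in brief: each allocation from a device adds roughly
 * 2^48 / free_space to that device's next_alloc counter, so devices with more
 * free space accumulate "cost" more slowly and sort earlier in
 * bch2_dev_alloc_list() - e.g. a device with twice the free space is charged
 * half as much per allocation.  All counters are then shifted down together
 * so they stay bounded.
 */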
642 static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
643                                struct dev_stripe_state *stripe,
644                                struct bch_dev_usage *usage)
645 {
646         u64 *v = stripe->next_alloc + ca->dev_idx;
647         u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
648         u64 free_space_inv = free_space
649                 ? div64_u64(1ULL << 48, free_space)
650                 : 1ULL << 48;
651         u64 scale = *v / 4;
652
653         if (*v + free_space_inv >= *v)
654                 *v += free_space_inv;
655         else
656                 *v = U64_MAX;
657
658         for (v = stripe->next_alloc;
659              v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
660                 *v = *v < scale ? 0 : *v - scale;
661 }
662
663 void bch2_dev_stripe_increment(struct bch_dev *ca,
664                                struct dev_stripe_state *stripe)
665 {
666         struct bch_dev_usage usage;
667
668         bch2_dev_usage_read_fast(ca, &usage);
669         bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
670 }
671
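/*
 * Account a newly allocated open bucket towards the current allocation: its
 * durability counts towards *nr_effective, a zero durability (cache) device
 * sets *have_cache, and a nonzero return tells the caller to stop - either we
 * have enough effective replicas or the bucket is part of an erasure coded
 * stripe.
 */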
672 static int add_new_bucket(struct bch_fs *c,
673                            struct open_buckets *ptrs,
674                            struct bch_devs_mask *devs_may_alloc,
675                            unsigned nr_replicas,
676                            unsigned *nr_effective,
677                            bool *have_cache,
678                            unsigned flags,
679                            struct open_bucket *ob)
680 {
681         unsigned durability =
682                 bch_dev_bkey_exists(c, ob->dev)->mi.durability;
683
684         BUG_ON(*nr_effective >= nr_replicas);
685
686         __clear_bit(ob->dev, devs_may_alloc->d);
687         *nr_effective   += durability;
688         *have_cache     |= !durability;
689
690         ob_push(c, ptrs, ob);
691
692         if (*nr_effective >= nr_replicas)
693                 return 1;
694         if (ob->ec)
695                 return 1;
696         return 0;
697 }
698
699 int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
700                       struct open_buckets *ptrs,
701                       struct dev_stripe_state *stripe,
702                       struct bch_devs_mask *devs_may_alloc,
703                       unsigned nr_replicas,
704                       unsigned *nr_effective,
705                       bool *have_cache,
706                       unsigned flags,
707                       enum bch_data_type data_type,
708                       enum bch_watermark watermark,
709                       struct closure *cl)
710 {
711         struct bch_fs *c = trans->c;
712         struct dev_alloc_list devs_sorted =
713                 bch2_dev_alloc_list(c, stripe, devs_may_alloc);
714         unsigned dev;
715         struct bch_dev *ca;
716         int ret = -BCH_ERR_insufficient_devices;
717         unsigned i;
718
719         BUG_ON(*nr_effective >= nr_replicas);
720
721         for (i = 0; i < devs_sorted.nr; i++) {
722                 struct bch_dev_usage usage;
723                 struct open_bucket *ob;
724
725                 dev = devs_sorted.devs[i];
726
727                 rcu_read_lock();
728                 ca = rcu_dereference(c->devs[dev]);
729                 if (ca)
730                         percpu_ref_get(&ca->ref);
731                 rcu_read_unlock();
732
733                 if (!ca)
734                         continue;
735
736                 if (!ca->mi.durability && *have_cache) {
737                         percpu_ref_put(&ca->ref);
738                         continue;
739                 }
740
741                 ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
742                 if (!IS_ERR(ob))
743                         bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
744                 percpu_ref_put(&ca->ref);
745
746                 if (IS_ERR(ob)) {
747                         ret = PTR_ERR(ob);
748                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
749                                 break;
750                         continue;
751                 }
752
753                 ob->data_type = data_type;
754
755                 if (add_new_bucket(c, ptrs, devs_may_alloc,
756                                    nr_replicas, nr_effective,
757                                    have_cache, flags, ob)) {
758                         ret = 0;
759                         break;
760                 }
761         }
762
763         return ret;
764 }
765
766 /* Allocate from stripes: */
767
768 /*
769  * if we can't allocate a new stripe because there are already too many
770  * partially filled stripes, force allocating from an existing stripe even when
771  * it's to a device we don't want:
772  */
773
774 static int bucket_alloc_from_stripe(struct btree_trans *trans,
775                          struct open_buckets *ptrs,
776                          struct write_point *wp,
777                          struct bch_devs_mask *devs_may_alloc,
778                          u16 target,
779                          unsigned nr_replicas,
780                          unsigned *nr_effective,
781                          bool *have_cache,
782                          enum bch_watermark watermark,
783                          unsigned flags,
784                          struct closure *cl)
785 {
786         struct bch_fs *c = trans->c;
787         struct dev_alloc_list devs_sorted;
788         struct ec_stripe_head *h;
789         struct open_bucket *ob;
790         unsigned i, ec_idx;
791         int ret = 0;
792
793         if (nr_replicas < 2)
794                 return 0;
795
796         if (ec_open_bucket(c, ptrs))
797                 return 0;
798
799         h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
800         if (IS_ERR(h))
801                 return PTR_ERR(h);
802         if (!h)
803                 return 0;
804
805         devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
806
807         for (i = 0; i < devs_sorted.nr; i++)
808                 for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
809                         if (!h->s->blocks[ec_idx])
810                                 continue;
811
812                         ob = c->open_buckets + h->s->blocks[ec_idx];
813                         if (ob->dev == devs_sorted.devs[i] &&
814                             !test_and_set_bit(ec_idx, h->s->blocks_allocated))
815                                 goto got_bucket;
816                 }
817         goto out_put_head;
818 got_bucket:
819         ob->ec_idx      = ec_idx;
820         ob->ec          = h->s;
821         ec_stripe_new_get(h->s, STRIPE_REF_io);
822
823         ret = add_new_bucket(c, ptrs, devs_may_alloc,
824                              nr_replicas, nr_effective,
825                              have_cache, flags, ob);
826 out_put_head:
827         bch2_ec_stripe_head_put(c, h);
828         return ret;
829 }
830
831 /* Sector allocator */
832
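/*
 * A write point owns a set of open buckets, typically one per device it's
 * currently writing to: bch2_alloc_sectors_start_trans() finds and locks a
 * write point and refills its open buckets, the caller then appends pointers
 * for up to wp->sectors_free sectors to its key, and bch2_alloc_sectors_done()
 * drops the write point lock again.
 */
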
833 static bool want_bucket(struct bch_fs *c,
834                         struct write_point *wp,
835                         struct bch_devs_mask *devs_may_alloc,
836                         bool *have_cache, bool ec,
837                         struct open_bucket *ob)
838 {
839         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
840
841         if (!test_bit(ob->dev, devs_may_alloc->d))
842                 return false;
843
844         if (ob->data_type != wp->data_type)
845                 return false;
846
847         if (!ca->mi.durability &&
848             (wp->data_type == BCH_DATA_btree || ec || *have_cache))
849                 return false;
850
851         if (ec != (ob->ec != NULL))
852                 return false;
853
854         return true;
855 }
856
857 static int bucket_alloc_set_writepoint(struct bch_fs *c,
858                                        struct open_buckets *ptrs,
859                                        struct write_point *wp,
860                                        struct bch_devs_mask *devs_may_alloc,
861                                        unsigned nr_replicas,
862                                        unsigned *nr_effective,
863                                        bool *have_cache,
864                                        bool ec, unsigned flags)
865 {
866         struct open_buckets ptrs_skip = { .nr = 0 };
867         struct open_bucket *ob;
868         unsigned i;
869         int ret = 0;
870
871         open_bucket_for_each(c, &wp->ptrs, ob, i) {
872                 if (!ret && want_bucket(c, wp, devs_may_alloc,
873                                         have_cache, ec, ob))
874                         ret = add_new_bucket(c, ptrs, devs_may_alloc,
875                                        nr_replicas, nr_effective,
876                                        have_cache, flags, ob);
877                 else
878                         ob_push(c, &ptrs_skip, ob);
879         }
880         wp->ptrs = ptrs_skip;
881
882         return ret;
883 }
884
885 static int bucket_alloc_set_partial(struct bch_fs *c,
886                                     struct open_buckets *ptrs,
887                                     struct write_point *wp,
888                                     struct bch_devs_mask *devs_may_alloc,
889                                     unsigned nr_replicas,
890                                     unsigned *nr_effective,
891                                     bool *have_cache, bool ec,
892                                     enum bch_watermark watermark,
893                                     unsigned flags)
894 {
895         int i, ret = 0;
896
897         if (!c->open_buckets_partial_nr)
898                 return 0;
899
900         spin_lock(&c->freelist_lock);
901
902         if (!c->open_buckets_partial_nr)
903                 goto unlock;
904
905         for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
906                 struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
907
908                 if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
909                         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
910                         struct bch_dev_usage usage;
911                         u64 avail;
912
913                         bch2_dev_usage_read_fast(ca, &usage);
914                         avail = dev_buckets_free(ca, usage, watermark);
915                         if (!avail)
916                                 continue;
917
918                         array_remove_item(c->open_buckets_partial,
919                                           c->open_buckets_partial_nr,
920                                           i);
921                         ob->on_partial_list = false;
922
923                         ret = add_new_bucket(c, ptrs, devs_may_alloc,
924                                              nr_replicas, nr_effective,
925                                              have_cache, flags, ob);
926                         if (ret)
927                                 break;
928                 }
929         }
930 unlock:
931         spin_unlock(&c->freelist_lock);
932         return ret;
933 }
934
935 static int __open_bucket_add_buckets(struct btree_trans *trans,
936                         struct open_buckets *ptrs,
937                         struct write_point *wp,
938                         struct bch_devs_list *devs_have,
939                         u16 target,
940                         bool erasure_code,
941                         unsigned nr_replicas,
942                         unsigned *nr_effective,
943                         bool *have_cache,
944                         enum bch_watermark watermark,
945                         unsigned flags,
946                         struct closure *_cl)
947 {
948         struct bch_fs *c = trans->c;
949         struct bch_devs_mask devs;
950         struct open_bucket *ob;
951         struct closure *cl = NULL;
952         unsigned i;
953         int ret;
954
955         devs = target_rw_devs(c, wp->data_type, target);
956
957         /* Don't allocate from devices we already have pointers to: */
958         darray_for_each(*devs_have, i)
959                 __clear_bit(*i, devs.d);
960
961         open_bucket_for_each(c, ptrs, ob, i)
962                 __clear_bit(ob->dev, devs.d);
963
964         if (erasure_code && ec_open_bucket(c, ptrs))
965                 return 0;
966
967         ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
968                                  nr_replicas, nr_effective,
969                                  have_cache, erasure_code, flags);
970         if (ret)
971                 return ret;
972
973         ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
974                                  nr_replicas, nr_effective,
975                                  have_cache, erasure_code, watermark, flags);
976         if (ret)
977                 return ret;
978
979         if (erasure_code) {
980                 ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
981                                          target,
982                                          nr_replicas, nr_effective,
983                                          have_cache,
984                                          watermark, flags, _cl);
985         } else {
986 retry_blocking:
987                 /*
988                  * Try nonblocking first, so that if one device is full we'll try from
989                  * other devices:
990                  */
991                 ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
992                                         nr_replicas, nr_effective, have_cache,
993                                         flags, wp->data_type, watermark, cl);
994                 if (ret &&
995                     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
996                     !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
997                     !cl && _cl) {
998                         cl = _cl;
999                         goto retry_blocking;
1000                 }
1001         }
1002
1003         return ret;
1004 }
1005
1006 static int open_bucket_add_buckets(struct btree_trans *trans,
1007                         struct open_buckets *ptrs,
1008                         struct write_point *wp,
1009                         struct bch_devs_list *devs_have,
1010                         u16 target,
1011                         unsigned erasure_code,
1012                         unsigned nr_replicas,
1013                         unsigned *nr_effective,
1014                         bool *have_cache,
1015                         enum bch_watermark watermark,
1016                         unsigned flags,
1017                         struct closure *cl)
1018 {
1019         int ret;
1020
1021         if (erasure_code) {
1022                 ret = __open_bucket_add_buckets(trans, ptrs, wp,
1023                                 devs_have, target, erasure_code,
1024                                 nr_replicas, nr_effective, have_cache,
1025                                 watermark, flags, cl);
1026                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1027                     bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
1028                     bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
1029                     bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1030                         return ret;
1031                 if (*nr_effective >= nr_replicas)
1032                         return 0;
1033         }
1034
1035         ret = __open_bucket_add_buckets(trans, ptrs, wp,
1036                         devs_have, target, false,
1037                         nr_replicas, nr_effective, have_cache,
1038                         watermark, flags, cl);
1039         return ret < 0 ? ret : 0;
1040 }
1041
1042 /**
1043  * should_drop_bucket - check if this open_bucket should go away
1044  * @ob:         open_bucket to predicate on
1045  * @c:          filesystem handle
1046  * @ca:         if set, we're killing buckets for a particular device
1047  * @ec:         if true, we're shutting down erasure coding and killing all ec
1048  *              open_buckets; if neither @ca nor @ec is set, every
1049  *              open_bucket is dropped
1050  * Returns: true if we should kill this open_bucket
1051  *
1052  * We're killing open_buckets because we're shutting down a device, erasure
1053  * coding, or the entire filesystem - check if this open_bucket matches:
1054  */
1055 static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
1056                                struct bch_dev *ca, bool ec)
1057 {
1058         if (ec) {
1059                 return ob->ec != NULL;
1060         } else if (ca) {
1061                 bool drop = ob->dev == ca->dev_idx;
1062                 struct open_bucket *ob2;
1063                 unsigned i;
1064
1065                 if (!drop && ob->ec) {
1066                         unsigned nr_blocks;
1067
1068                         mutex_lock(&ob->ec->lock);
1069                         nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
1070
1071                         for (i = 0; i < nr_blocks; i++) {
1072                                 if (!ob->ec->blocks[i])
1073                                         continue;
1074
1075                                 ob2 = c->open_buckets + ob->ec->blocks[i];
1076                                 drop |= ob2->dev == ca->dev_idx;
1077                         }
1078                         mutex_unlock(&ob->ec->lock);
1079                 }
1080
1081                 return drop;
1082         } else {
1083                 return true;
1084         }
1085 }
1086
1087 static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
1088                                  bool ec, struct write_point *wp)
1089 {
1090         struct open_buckets ptrs = { .nr = 0 };
1091         struct open_bucket *ob;
1092         unsigned i;
1093
1094         mutex_lock(&wp->lock);
1095         open_bucket_for_each(c, &wp->ptrs, ob, i)
1096                 if (should_drop_bucket(ob, c, ca, ec))
1097                         bch2_open_bucket_put(c, ob);
1098                 else
1099                         ob_push(c, &ptrs, ob);
1100         wp->ptrs = ptrs;
1101         mutex_unlock(&wp->lock);
1102 }
1103
1104 void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
1105                             bool ec)
1106 {
1107         unsigned i;
1108
1109         /* Close write points that point to this device: */
1110         for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1111                 bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
1112
1113         bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
1114         bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
1115         bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
1116
1117         mutex_lock(&c->btree_reserve_cache_lock);
1118         while (c->btree_reserve_cache_nr) {
1119                 struct btree_alloc *a =
1120                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1121
1122                 bch2_open_buckets_put(c, &a->ob);
1123         }
1124         mutex_unlock(&c->btree_reserve_cache_lock);
1125
1126         spin_lock(&c->freelist_lock);
1127         i = 0;
1128         while (i < c->open_buckets_partial_nr) {
1129                 struct open_bucket *ob =
1130                         c->open_buckets + c->open_buckets_partial[i];
1131
1132                 if (should_drop_bucket(ob, c, ca, ec)) {
1133                         --c->open_buckets_partial_nr;
1134                         swap(c->open_buckets_partial[i],
1135                              c->open_buckets_partial[c->open_buckets_partial_nr]);
1136                         ob->on_partial_list = false;
1137                         spin_unlock(&c->freelist_lock);
1138                         bch2_open_bucket_put(c, ob);
1139                         spin_lock(&c->freelist_lock);
1140                 } else {
1141                         i++;
1142                 }
1143         }
1144         spin_unlock(&c->freelist_lock);
1145
1146         bch2_ec_stop_dev(c, ca);
1147 }
1148
1149 static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
1150                                                  unsigned long write_point)
1151 {
1152         unsigned hash =
1153                 hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
1154
1155         return &c->write_points_hash[hash];
1156 }
1157
1158 static struct write_point *__writepoint_find(struct hlist_head *head,
1159                                              unsigned long write_point)
1160 {
1161         struct write_point *wp;
1162
1163         rcu_read_lock();
1164         hlist_for_each_entry_rcu(wp, head, node)
1165                 if (wp->write_point == write_point)
1166                         goto out;
1167         wp = NULL;
1168 out:
1169         rcu_read_unlock();
1170         return wp;
1171 }
1172
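/*
 * The heuristic below treats each write point as potentially stranding up to
 * bucket_size_max sectors of partially written space: try_increase_writepoints()
 * refuses to add a write point once that estimate exceeds 1/32 of free space,
 * and try_decrease_writepoints() starts reclaiming them past 1/8.
 */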
1173 static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
1174 {
1175         u64 stranded    = c->write_points_nr * c->bucket_size_max;
1176         u64 free        = bch2_fs_usage_read_short(c).free;
1177
1178         return stranded * factor > free;
1179 }
1180
1181 static bool try_increase_writepoints(struct bch_fs *c)
1182 {
1183         struct write_point *wp;
1184
1185         if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
1186             too_many_writepoints(c, 32))
1187                 return false;
1188
1189         wp = c->write_points + c->write_points_nr++;
1190         hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
1191         return true;
1192 }
1193
1194 static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
1195 {
1196         struct bch_fs *c = trans->c;
1197         struct write_point *wp;
1198         struct open_bucket *ob;
1199         unsigned i;
1200
1201         mutex_lock(&c->write_points_hash_lock);
1202         if (c->write_points_nr < old_nr) {
1203                 mutex_unlock(&c->write_points_hash_lock);
1204                 return true;
1205         }
1206
1207         if (c->write_points_nr == 1 ||
1208             !too_many_writepoints(c, 8)) {
1209                 mutex_unlock(&c->write_points_hash_lock);
1210                 return false;
1211         }
1212
1213         wp = c->write_points + --c->write_points_nr;
1214
1215         hlist_del_rcu(&wp->node);
1216         mutex_unlock(&c->write_points_hash_lock);
1217
1218         bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1219         open_bucket_for_each(c, &wp->ptrs, ob, i)
1220                 open_bucket_free_unused(c, ob);
1221         wp->ptrs.nr = 0;
1222         mutex_unlock(&wp->lock);
1223         return true;
1224 }
1225
1226 static struct write_point *writepoint_find(struct btree_trans *trans,
1227                                            unsigned long write_point)
1228 {
1229         struct bch_fs *c = trans->c;
1230         struct write_point *wp, *oldest;
1231         struct hlist_head *head;
1232
1233         if (!(write_point & 1UL)) {
1234                 wp = (struct write_point *) write_point;
1235                 bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1236                 return wp;
1237         }
1238
1239         head = writepoint_hash(c, write_point);
1240 restart_find:
1241         wp = __writepoint_find(head, write_point);
1242         if (wp) {
1243 lock_wp:
1244                 bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1245                 if (wp->write_point == write_point)
1246                         goto out;
1247                 mutex_unlock(&wp->lock);
1248                 goto restart_find;
1249         }
1250 restart_find_oldest:
1251         oldest = NULL;
1252         for (wp = c->write_points;
1253              wp < c->write_points + c->write_points_nr; wp++)
1254                 if (!oldest || time_before64(wp->last_used, oldest->last_used))
1255                         oldest = wp;
1256
1257         bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
1258         bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
1259         if (oldest >= c->write_points + c->write_points_nr ||
1260             try_increase_writepoints(c)) {
1261                 mutex_unlock(&c->write_points_hash_lock);
1262                 mutex_unlock(&oldest->lock);
1263                 goto restart_find_oldest;
1264         }
1265
1266         wp = __writepoint_find(head, write_point);
1267         if (wp && wp != oldest) {
1268                 mutex_unlock(&c->write_points_hash_lock);
1269                 mutex_unlock(&oldest->lock);
1270                 goto lock_wp;
1271         }
1272
1273         wp = oldest;
1274         hlist_del_rcu(&wp->node);
1275         wp->write_point = write_point;
1276         hlist_add_head_rcu(&wp->node, head);
1277         mutex_unlock(&c->write_points_hash_lock);
1278 out:
1279         wp->last_used = local_clock();
1280         return wp;
1281 }
1282
1283 static noinline void
1284 deallocate_extra_replicas(struct bch_fs *c,
1285                           struct open_buckets *ptrs,
1286                           struct open_buckets *ptrs_no_use,
1287                           unsigned extra_replicas)
1288 {
1289         struct open_buckets ptrs2 = { 0 };
1290         struct open_bucket *ob;
1291         unsigned i;
1292
1293         open_bucket_for_each(c, ptrs, ob, i) {
1294                 unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
1295
1296                 if (d && d <= extra_replicas) {
1297                         extra_replicas -= d;
1298                         ob_push(c, ptrs_no_use, ob);
1299                 } else {
1300                         ob_push(c, &ptrs2, ob);
1301                 }
1302         }
1303
1304         *ptrs = ptrs2;
1305 }
1306
1307 /*
1308  * Get us an open_bucket we can allocate from, return with it locked:
1309  */
1310 int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
1311                              unsigned target,
1312                              unsigned erasure_code,
1313                              struct write_point_specifier write_point,
1314                              struct bch_devs_list *devs_have,
1315                              unsigned nr_replicas,
1316                              unsigned nr_replicas_required,
1317                              enum bch_watermark watermark,
1318                              unsigned flags,
1319                              struct closure *cl,
1320                              struct write_point **wp_ret)
1321 {
1322         struct bch_fs *c = trans->c;
1323         struct write_point *wp;
1324         struct open_bucket *ob;
1325         struct open_buckets ptrs;
1326         unsigned nr_effective, write_points_nr;
1327         bool have_cache;
1328         int ret;
1329         int i;
1330
1331         if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
1332                 erasure_code = false;
1333
1334         BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
1335
1336         BUG_ON(!nr_replicas || !nr_replicas_required);
1337 retry:
1338         ptrs.nr         = 0;
1339         nr_effective    = 0;
1340         write_points_nr = c->write_points_nr;
1341         have_cache      = false;
1342
1343         *wp_ret = wp = writepoint_find(trans, write_point.v);
1344
1345         /* metadata may not allocate on cache devices: */
1346         if (wp->data_type != BCH_DATA_user)
1347                 have_cache = true;
1348
1349         if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
1350                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1351                                               target, erasure_code,
1352                                               nr_replicas, &nr_effective,
1353                                               &have_cache, watermark,
1354                                               flags, NULL);
1355                 if (!ret ||
1356                     bch2_err_matches(ret, BCH_ERR_transaction_restart))
1357                         goto alloc_done;
1358
1359                 /* Don't retry from all devices if we're out of open buckets: */
1360                 if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
1361                         int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1362                                               target, erasure_code,
1363                                               nr_replicas, &nr_effective,
1364                                               &have_cache, watermark,
1365                                               flags, cl);
1366                         if (!ret2 ||
1367                             bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
1368                             bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
1369                                 ret = ret2;
1370                                 goto alloc_done;
1371                         }
1372                 }
1373
1374                 /*
1375                  * Only try to allocate cache (durability = 0 devices) from the
1376                  * specified target:
1377                  */
1378                 have_cache = true;
1379
1380                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1381                                               0, erasure_code,
1382                                               nr_replicas, &nr_effective,
1383                                               &have_cache, watermark,
1384                                               flags, cl);
1385         } else {
1386                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1387                                               target, erasure_code,
1388                                               nr_replicas, &nr_effective,
1389                                               &have_cache, watermark,
1390                                               flags, cl);
1391         }
1392 alloc_done:
1393         BUG_ON(!ret && nr_effective < nr_replicas);
1394
1395         if (erasure_code && !ec_open_bucket(c, &ptrs))
1396                 pr_debug("failed to get ec bucket: ret %i", ret);
1397
1398         if (ret == -BCH_ERR_insufficient_devices &&
1399             nr_effective >= nr_replicas_required)
1400                 ret = 0;
1401
1402         if (ret)
1403                 goto err;
1404
1405         if (nr_effective > nr_replicas)
1406                 deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
1407
1408         /* Free buckets we didn't use: */
1409         open_bucket_for_each(c, &wp->ptrs, ob, i)
1410                 open_bucket_free_unused(c, ob);
1411
1412         wp->ptrs = ptrs;
1413
1414         wp->sectors_free = UINT_MAX;
1415
1416         open_bucket_for_each(c, &wp->ptrs, ob, i)
1417                 wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
1418
1419         BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
1420
1421         return 0;
1422 err:
1423         open_bucket_for_each(c, &wp->ptrs, ob, i)
1424                 if (ptrs.nr < ARRAY_SIZE(ptrs.v))
1425                         ob_push(c, &ptrs, ob);
1426                 else
1427                         open_bucket_free_unused(c, ob);
1428         wp->ptrs = ptrs;
1429
1430         mutex_unlock(&wp->lock);
1431
1432         if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
1433             try_decrease_writepoints(trans, write_points_nr))
1434                 goto retry;
1435
1436         if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
1437             bch2_err_matches(ret, BCH_ERR_freelist_empty))
1438                 return cl
1439                         ? -BCH_ERR_bucket_alloc_blocked
1440                         : -BCH_ERR_ENOSPC_bucket_alloc;
1441
1442         return ret;
1443 }
1444
1445 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
1446 {
1447         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1448
1449         return (struct bch_extent_ptr) {
1450                 .type   = 1 << BCH_EXTENT_ENTRY_ptr,
1451                 .gen    = ob->gen,
1452                 .dev    = ob->dev,
1453                 .offset = bucket_to_sector(ca, ob->bucket) +
1454                         ca->mi.bucket_size -
1455                         ob->sectors_free,
1456         };
1457 }
1458
1459 /*
1460  * Append pointers to the space we just allocated to @k, and mark @sectors of
1461  * that space as allocated out of the write point's open buckets
1462  */
1463 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
1464                                     struct bkey_i *k, unsigned sectors,
1465                                     bool cached)
1466 {
1467         bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
1468 }
1469
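/*
 * Finished writing to @wp: drop the write point lock taken by
 * bch2_alloc_sectors_start_trans() and release the open buckets we've used up
 */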
1470 void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
1471 {
1472         bch2_alloc_sectors_done_inlined(c, wp);
1473 }
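
/*
 * Putting the sector allocator together - an illustrative sketch only, with
 * the target, write point, devs_have, key and requested size (request) left
 * as placeholder names, and the actual data write and btree update elided:
 *
 *	struct write_point *wp;
 *	int ret = bch2_alloc_sectors_start_trans(trans, target, false,
 *					write_point, devs_have,
 *					nr_replicas, nr_replicas,
 *					BCH_WATERMARK_normal, 0, cl, &wp);
 *	if (!ret) {
 *		unsigned sectors = min(request, wp->sectors_free);
 *
 *		bch2_alloc_sectors_append_ptrs(c, wp, k, sectors, false);
 *		bch2_alloc_sectors_done(c, wp);
 *	}
 */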
1474
1475 static inline void writepoint_init(struct write_point *wp,
1476                                    enum bch_data_type type)
1477 {
1478         mutex_init(&wp->lock);
1479         wp->data_type = type;
1480
1481         INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
1482         INIT_LIST_HEAD(&wp->writes);
1483         spin_lock_init(&wp->writes_lock);
1484 }
1485
1486 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
1487 {
1488         struct open_bucket *ob;
1489         struct write_point *wp;
1490
1491         mutex_init(&c->write_points_hash_lock);
1492         c->write_points_nr = ARRAY_SIZE(c->write_points);
1493
1494         /* open bucket 0 is a sentinel NULL: */
1495         spin_lock_init(&c->open_buckets[0].lock);
1496
1497         for (ob = c->open_buckets + 1;
1498              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
1499                 spin_lock_init(&ob->lock);
1500                 c->open_buckets_nr_free++;
1501
1502                 ob->freelist = c->open_buckets_freelist;
1503                 c->open_buckets_freelist = ob - c->open_buckets;
1504         }
1505
1506         writepoint_init(&c->btree_write_point,          BCH_DATA_btree);
1507         writepoint_init(&c->rebalance_write_point,      BCH_DATA_user);
1508         writepoint_init(&c->copygc_write_point,         BCH_DATA_user);
1509
1510         for (wp = c->write_points;
1511              wp < c->write_points + c->write_points_nr; wp++) {
1512                 writepoint_init(wp, BCH_DATA_user);
1513
1514                 wp->last_used   = local_clock();
1515                 wp->write_point = (unsigned long) wp;
1516                 hlist_add_head_rcu(&wp->node,
1517                                    writepoint_hash(c, wp->write_point));
1518         }
1519 }
1520
1521 static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
1522 {
1523         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1524         unsigned data_type = ob->data_type;
1525         barrier(); /* READ_ONCE() doesn't work on bitfields */
1526
1527         prt_printf(out, "%zu ref %u ",
1528                    ob - c->open_buckets,
1529                    atomic_read(&ob->pin));
1530         bch2_prt_data_type(out, data_type);
1531         prt_printf(out, " %u:%llu gen %u allocated %u/%u",
1532                    ob->dev, ob->bucket, ob->gen,
1533                    ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
1534         if (ob->ec)
1535                 prt_printf(out, " ec idx %llu", ob->ec->idx);
1536         if (ob->on_partial_list)
1537                 prt_str(out, " partial");
1538         prt_newline(out);
1539 }
1540
1541 void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
1542 {
1543         struct open_bucket *ob;
1544
1545         out->atomic++;
1546
1547         for (ob = c->open_buckets;
1548              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1549              ob++) {
1550                 spin_lock(&ob->lock);
1551                 if (ob->valid && !ob->on_partial_list)
1552                         bch2_open_bucket_to_text(out, c, ob);
1553                 spin_unlock(&ob->lock);
1554         }
1555
1556         --out->atomic;
1557 }
1558
1559 void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
1560 {
1561         unsigned i;
1562
1563         out->atomic++;
1564         spin_lock(&c->freelist_lock);
1565
1566         for (i = 0; i < c->open_buckets_partial_nr; i++)
1567                 bch2_open_bucket_to_text(out, c,
1568                                 c->open_buckets + c->open_buckets_partial[i]);
1569
1570         spin_unlock(&c->freelist_lock);
1571         --out->atomic;
1572 }
1573
1574 static const char * const bch2_write_point_states[] = {
1575 #define x(n)    #n,
1576         WRITE_POINT_STATES()
1577 #undef x
1578         NULL
1579 };
1580
1581 static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
1582                                      struct write_point *wp)
1583 {
1584         struct open_bucket *ob;
1585         unsigned i;
1586
1587         prt_printf(out, "%lu: ", wp->write_point);
1588         prt_human_readable_u64(out, wp->sectors_allocated);
1589
1590         prt_printf(out, " last wrote: ");
1591         bch2_pr_time_units(out, sched_clock() - wp->last_used);
1592
1593         for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
1594                 prt_printf(out, " %s: ", bch2_write_point_states[i]);
1595                 bch2_pr_time_units(out, wp->time[i]);
1596         }
1597
1598         prt_newline(out);
1599
1600         printbuf_indent_add(out, 2);
1601         open_bucket_for_each(c, &wp->ptrs, ob, i)
1602                 bch2_open_bucket_to_text(out, c, ob);
1603         printbuf_indent_sub(out, 2);
1604 }
1605
1606 void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
1607 {
1608         struct write_point *wp;
1609
1610         prt_str(out, "Foreground write points\n");
1611         for (wp = c->write_points;
1612              wp < c->write_points + ARRAY_SIZE(c->write_points);
1613              wp++)
1614                 bch2_write_point_to_text(out, c, wp);
1615
1616         prt_str(out, "Copygc write point\n");
1617         bch2_write_point_to_text(out, c, &c->copygc_write_point);
1618
1619         prt_str(out, "Rebalance write point\n");
1620         bch2_write_point_to_text(out, c, &c->rebalance_write_point);
1621
1622         prt_str(out, "Btree write point\n");
1623         bch2_write_point_to_text(out, c, &c->btree_write_point);
1624 }