fs/bcachefs/alloc_foreground.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright 2012 Google, Inc.
4  *
5  * Foreground allocator code: allocate buckets from freelist, and allocate in
6  * sector granularity from writepoints.
7  *
8  * bch2_bucket_alloc() allocates a single bucket from a specific device.
9  *
10  * bch2_bucket_alloc_set() allocates one or more buckets from different devices
11  * in a given filesystem.
12  */
13
14 #include "bcachefs.h"
15 #include "alloc_background.h"
16 #include "alloc_foreground.h"
17 #include "backpointers.h"
18 #include "btree_iter.h"
19 #include "btree_update.h"
20 #include "btree_gc.h"
21 #include "buckets.h"
22 #include "buckets_waiting_for_journal.h"
23 #include "clock.h"
24 #include "debug.h"
25 #include "disk_groups.h"
26 #include "ec.h"
27 #include "error.h"
28 #include "io_write.h"
29 #include "journal.h"
30 #include "movinggc.h"
31 #include "nocow_locking.h"
32 #include "trace.h"
33
34 #include <linux/math64.h>
35 #include <linux/rculist.h>
36 #include <linux/rcupdate.h>
37
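/*
 * Take a mutex without blocking while holding btree locks: if trylock fails we
 * drop the transaction's btree locks first, then take the mutex; the
 * transaction is left unlocked (hence _norelock).
 */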
38 static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
39                                            struct mutex *lock)
40 {
41         if (!mutex_trylock(lock)) {
42                 bch2_trans_unlock(trans);
43                 mutex_lock(lock);
44         }
45 }
46
47 const char * const bch2_watermarks[] = {
48 #define x(t) #t,
49         BCH_WATERMARKS()
50 #undef x
51         NULL
52 };
53
54 /*
55  * Open buckets represent a bucket that's currently being allocated from.  They
56  * serve two purposes:
57  *
58  *  - They track buckets that have been partially allocated, allowing for
59  *    sub-bucket sized allocations - they're used by the sector allocator below
60  *
61  *  - They provide a reference to the buckets they own that mark and sweep GC
62  *    can find, until the new allocation has a pointer to it inserted into the
63  *    btree
64  *
65  * When allocating some space with the sector allocator, the allocation comes
66  * with a reference to an open bucket - the caller is required to put that
67  * reference _after_ doing the index update that makes its allocation reachable.
68  */
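/*
 * A rough sketch of that contract (not a verbatim caller - the real sequence
 * lives in the write path, io_write.c - and it assumes the helpers declared in
 * alloc_foreground.h such as bch2_open_bucket_get()):
 *
 *	bch2_alloc_sectors_start_trans(trans, ..., &wp);
 *	bch2_alloc_sectors_append_ptrs(c, wp, &k->k_i, sectors, false);
 *	bch2_open_bucket_get(c, wp, &obs);	take refs before releasing wp
 *	bch2_alloc_sectors_done(c, wp);
 *	...do the btree update that inserts k...
 *	bch2_open_buckets_put(c, &obs);		only after the index update
 */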
69
70 void bch2_reset_alloc_cursors(struct bch_fs *c)
71 {
72         rcu_read_lock();
73         for_each_member_device_rcu(c, ca, NULL)
74                 ca->alloc_cursor = 0;
75         rcu_read_unlock();
76 }
77
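/*
 * Open buckets live in a small hash table keyed by (device, bucket), chained
 * through ob->hash, so that bch2_bucket_is_open() can cheaply check whether a
 * given bucket is currently being allocated from.
 */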
78 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
79 {
80         open_bucket_idx_t idx = ob - c->open_buckets;
81         open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
82
83         ob->hash = *slot;
84         *slot = idx;
85 }
86
87 static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
88 {
89         open_bucket_idx_t idx = ob - c->open_buckets;
90         open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
91
92         while (*slot != idx) {
93                 BUG_ON(!*slot);
94                 slot = &c->open_buckets[*slot].hash;
95         }
96
97         *slot = ob->hash;
98         ob->hash = 0;
99 }
100
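/*
 * Free an open bucket: erasure coded buckets are handed back to their stripe;
 * otherwise mark the bucket invalid, remove it from the hash table, return it
 * to the open bucket freelist and wake anyone waiting for a free slot.
 */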
101 void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
102 {
103         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
104
105         if (ob->ec) {
106                 ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
107                 return;
108         }
109
110         percpu_down_read(&c->mark_lock);
111         spin_lock(&ob->lock);
112
113         ob->valid = false;
114         ob->data_type = 0;
115
116         spin_unlock(&ob->lock);
117         percpu_up_read(&c->mark_lock);
118
119         spin_lock(&c->freelist_lock);
120         bch2_open_bucket_hash_remove(c, ob);
121
122         ob->freelist = c->open_buckets_freelist;
123         c->open_buckets_freelist = ob - c->open_buckets;
124
125         c->open_buckets_nr_free++;
126         ca->nr_open_buckets--;
127         spin_unlock(&c->freelist_lock);
128
129         closure_wake_up(&c->open_buckets_wait);
130 }
131
132 void bch2_open_bucket_write_error(struct bch_fs *c,
133                                   struct open_buckets *obs,
134                                   unsigned dev)
135 {
136         struct open_bucket *ob;
137         unsigned i;
138
139         open_bucket_for_each(c, obs, ob, i)
140                 if (ob->dev == dev && ob->ec)
141                         bch2_ec_bucket_cancel(c, ob);
142 }
143
144 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
145 {
146         struct open_bucket *ob;
147
148         BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
149
150         ob = c->open_buckets + c->open_buckets_freelist;
151         c->open_buckets_freelist = ob->freelist;
152         atomic_set(&ob->pin, 1);
153         ob->data_type = 0;
154
155         c->open_buckets_nr_free--;
156         return ob;
157 }
158
159 static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
160 {
161         BUG_ON(c->open_buckets_partial_nr >=
162                ARRAY_SIZE(c->open_buckets_partial));
163
164         spin_lock(&c->freelist_lock);
165         ob->on_partial_list = true;
166         c->open_buckets_partial[c->open_buckets_partial_nr++] =
167                 ob - c->open_buckets;
168         spin_unlock(&c->freelist_lock);
169
170         closure_wake_up(&c->open_buckets_wait);
171         closure_wake_up(&c->freelist_wait);
172 }
173
174 /* _only_ for allocating the journal on a new device: */
175 long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
176 {
177         while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
178                 u64 b = ca->new_fs_bucket_idx++;
179
180                 if (!is_superblock_bucket(ca, b) &&
181                     (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
182                         return b;
183         }
184
185         return -1;
186 }
187
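/*
 * Number of open buckets held in reserve for each watermark: higher priority
 * allocations (reclaim, btree, copygc) may dip further into the open bucket
 * pool, while normal allocations must leave half of it free.
 */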
188 static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
189 {
190         switch (watermark) {
191         case BCH_WATERMARK_reclaim:
192                 return 0;
193         case BCH_WATERMARK_btree:
194         case BCH_WATERMARK_btree_copygc:
195                 return OPEN_BUCKETS_COUNT / 4;
196         case BCH_WATERMARK_copygc:
197                 return OPEN_BUCKETS_COUNT / 3;
198         default:
199                 return OPEN_BUCKETS_COUNT / 2;
200         }
201 }
202
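/*
 * Try to turn one candidate bucket into an open_bucket: the bucket is rejected
 * if it's marked nouse, already open, still waiting on a journal commit before
 * it may be reused, or nocow locked; otherwise an open_bucket is popped off the
 * freelist under c->freelist_lock (respecting the watermark's reserve) and
 * initialized from the alloc key.
 */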
203 static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
204                                               u64 bucket,
205                                               enum bch_watermark watermark,
206                                               const struct bch_alloc_v4 *a,
207                                               struct bucket_alloc_state *s,
208                                               struct closure *cl)
209 {
210         struct open_bucket *ob;
211
212         if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
213                 s->skipped_nouse++;
214                 return NULL;
215         }
216
217         if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
218                 s->skipped_open++;
219                 return NULL;
220         }
221
222         if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
223                         c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
224                 s->skipped_need_journal_commit++;
225                 return NULL;
226         }
227
228         if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
229                 s->skipped_nocow++;
230                 return NULL;
231         }
232
233         spin_lock(&c->freelist_lock);
234
235         if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
236                 if (cl)
237                         closure_wait(&c->open_buckets_wait, cl);
238
239                 track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
240                 spin_unlock(&c->freelist_lock);
241                 return ERR_PTR(-BCH_ERR_open_buckets_empty);
242         }
243
244         /* Recheck under lock: */
245         if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
246                 spin_unlock(&c->freelist_lock);
247                 s->skipped_open++;
248                 return NULL;
249         }
250
251         ob = bch2_open_bucket_alloc(c);
252
253         spin_lock(&ob->lock);
254
255         ob->valid       = true;
256         ob->sectors_free = ca->mi.bucket_size;
257         ob->dev         = ca->dev_idx;
258         ob->gen         = a->gen;
259         ob->bucket      = bucket;
260         spin_unlock(&ob->lock);
261
262         ca->nr_open_buckets++;
263         bch2_open_bucket_hash_add(c, ob);
264
265         track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
266         track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
267
268         spin_unlock(&c->freelist_lock);
269         return ob;
270 }
271
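/*
 * Allocate from an entry in the freespace btree: the low 56 bits of free_entry
 * encode the bucket, the top 8 bits its generation bits. We look up the
 * corresponding alloc key and cross-check that the bucket really is free and
 * the genbits match before handing it to __try_alloc_bucket(); a mismatch is
 * an inconsistency unless fsck hasn't run yet.
 */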
272 static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
273                                             enum bch_watermark watermark, u64 free_entry,
274                                             struct bucket_alloc_state *s,
275                                             struct bkey_s_c freespace_k,
276                                             struct closure *cl)
277 {
278         struct bch_fs *c = trans->c;
279         struct btree_iter iter = { NULL };
280         struct bkey_s_c k;
281         struct open_bucket *ob;
282         struct bch_alloc_v4 a_convert;
283         const struct bch_alloc_v4 *a;
284         u64 b = free_entry & ~(~0ULL << 56);
285         unsigned genbits = free_entry >> 56;
286         struct printbuf buf = PRINTBUF;
287         int ret;
288
289         if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
290                 prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
291                        "  freespace key ",
292                         ca->mi.first_bucket, ca->mi.nbuckets);
293                 bch2_bkey_val_to_text(&buf, c, freespace_k);
294                 bch2_trans_inconsistent(trans, "%s", buf.buf);
295                 ob = ERR_PTR(-EIO);
296                 goto err;
297         }
298
299         k = bch2_bkey_get_iter(trans, &iter,
300                                BTREE_ID_alloc, POS(ca->dev_idx, b),
301                                BTREE_ITER_CACHED);
302         ret = bkey_err(k);
303         if (ret) {
304                 ob = ERR_PTR(ret);
305                 goto err;
306         }
307
308         a = bch2_alloc_to_v4(k, &a_convert);
309
310         if (a->data_type != BCH_DATA_free) {
311                 if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
312                         ob = NULL;
313                         goto err;
314                 }
315
316                 prt_printf(&buf, "non free bucket in freespace btree\n"
317                        "  freespace key ");
318                 bch2_bkey_val_to_text(&buf, c, freespace_k);
319                 prt_printf(&buf, "\n  ");
320                 bch2_bkey_val_to_text(&buf, c, k);
321                 bch2_trans_inconsistent(trans, "%s", buf.buf);
322                 ob = ERR_PTR(-EIO);
323                 goto err;
324         }
325
326         if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
327             c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
328                 prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
329                        "  freespace key ",
330                        genbits, alloc_freespace_genbits(*a) >> 56);
331                 bch2_bkey_val_to_text(&buf, c, freespace_k);
332                 prt_printf(&buf, "\n  ");
333                 bch2_bkey_val_to_text(&buf, c, k);
334                 bch2_trans_inconsistent(trans, "%s", buf.buf);
335                 ob = ERR_PTR(-EIO);
336                 goto err;
337         }
338
339         if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
340                 struct bch_backpointer bp;
341                 struct bpos bp_pos = POS_MIN;
342
343                 ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
344                                                 &bp_pos, &bp,
345                                                 BTREE_ITER_NOPRESERVE);
346                 if (ret) {
347                         ob = ERR_PTR(ret);
348                         goto err;
349                 }
350
351                 if (!bkey_eq(bp_pos, POS_MAX)) {
352                         /*
353                          * Bucket may have data in it - we don't call
354                          * bch2_trans_inconsistent() because fsck hasn't
355                          * finished yet
356                          */
357                         ob = NULL;
358                         goto err;
359                 }
360         }
361
362         ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
363         if (!ob)
364                 set_btree_iter_dontneed(&iter);
365 err:
366         if (iter.path)
367                 set_btree_iter_dontneed(&iter);
368         bch2_trans_iter_exit(trans, &iter);
369         printbuf_exit(&buf);
370         return ob;
371 }
372
373 /*
374  * This path is for before the freespace btree is initialized:
375  *
376  * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
377  * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
378  */
379 static noinline struct open_bucket *
380 bch2_bucket_alloc_early(struct btree_trans *trans,
381                         struct bch_dev *ca,
382                         enum bch_watermark watermark,
383                         struct bucket_alloc_state *s,
384                         struct closure *cl)
385 {
386         struct btree_iter iter, citer;
387         struct bkey_s_c k, ck;
388         struct open_bucket *ob = NULL;
389         u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
390         u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
391         u64 alloc_cursor = alloc_start;
392         int ret;
393
394         /*
395          * Scan with an uncached iterator to avoid polluting the key cache. An
396          * uncached iter will return a cached key if one exists, but if not
397          * there is no other underlying protection for the associated key cache
398          * slot. To avoid racing bucket allocations, look up the cached key slot
399          * of any likely allocation candidate before attempting to proceed with
400          * the allocation. This provides proper exclusion on the associated
401          * bucket.
402          */
403 again:
404         for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
405                            BTREE_ITER_SLOTS, k, ret) {
406                 struct bch_alloc_v4 a_convert;
407                 const struct bch_alloc_v4 *a;
408
409                 if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
410                         break;
411
412                 if (ca->new_fs_bucket_idx &&
413                     is_superblock_bucket(ca, k.k->p.offset))
414                         continue;
415
416                 a = bch2_alloc_to_v4(k, &a_convert);
417                 if (a->data_type != BCH_DATA_free)
418                         continue;
419
420                 /* now check the cached key to serialize concurrent allocs of the bucket */
421                 ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
422                 ret = bkey_err(ck);
423                 if (ret)
424                         break;
425
426                 a = bch2_alloc_to_v4(ck, &a_convert);
427                 if (a->data_type != BCH_DATA_free)
428                         goto next;
429
430                 s->buckets_seen++;
431
432                 ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
433 next:
434                 set_btree_iter_dontneed(&citer);
435                 bch2_trans_iter_exit(trans, &citer);
436                 if (ob)
437                         break;
438         }
439         bch2_trans_iter_exit(trans, &iter);
440
441         alloc_cursor = iter.pos.offset;
442         ca->alloc_cursor = alloc_cursor;
443
444         if (!ob && ret)
445                 ob = ERR_PTR(ret);
446
447         if (!ob && alloc_start > first_bucket) {
448                 alloc_cursor = alloc_start = first_bucket;
449                 goto again;
450         }
451
452         return ob;
453 }
454
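/*
 * Normal allocation path once the freespace btree is initialized: walk
 * freespace extents starting from the device's allocation cursor, try each
 * bucket they cover, and wrap around to the first bucket if we reach the end
 * of the device without success.
 */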
455 static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
456                                                    struct bch_dev *ca,
457                                                    enum bch_watermark watermark,
458                                                    struct bucket_alloc_state *s,
459                                                    struct closure *cl)
460 {
461         struct btree_iter iter;
462         struct bkey_s_c k;
463         struct open_bucket *ob = NULL;
464         u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
465         u64 alloc_cursor = alloc_start;
466         int ret;
467
468         BUG_ON(ca->new_fs_bucket_idx);
469 again:
470         for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
471                                      POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
472                 if (k.k->p.inode != ca->dev_idx)
473                         break;
474
475                 for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
476                      alloc_cursor < k.k->p.offset;
477                      alloc_cursor++) {
478                         ret = btree_trans_too_many_iters(trans);
479                         if (ret) {
480                                 ob = ERR_PTR(ret);
481                                 break;
482                         }
483
484                         s->buckets_seen++;
485
486                         ob = try_alloc_bucket(trans, ca, watermark,
487                                               alloc_cursor, s, k, cl);
488                         if (ob) {
489                                 set_btree_iter_dontneed(&iter);
490                                 break;
491                         }
492                 }
493
494                 if (ob || ret)
495                         break;
496         }
497         bch2_trans_iter_exit(trans, &iter);
498
499         ca->alloc_cursor = alloc_cursor;
500
501         if (!ob && ret)
502                 ob = ERR_PTR(ret);
503
504         if (!ob && alloc_start > ca->mi.first_bucket) {
505                 alloc_cursor = alloc_start = ca->mi.first_bucket;
506                 goto again;
507         }
508
509         return ob;
510 }
511
512 /**
513  * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
514  * @trans:      transaction object
515  * @ca:         device to allocate from
516  * @watermark:  how important is this allocation?
517  * @cl:         if not NULL, closure to wait on if no buckets are available
518  * @usage:      also used to return the device's current usage
519  *
520  * Returns:     an open_bucket on success, or an ERR_PTR() on failure.
521  */
522 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
523                                       struct bch_dev *ca,
524                                       enum bch_watermark watermark,
525                                       struct closure *cl,
526                                       struct bch_dev_usage *usage)
527 {
528         struct bch_fs *c = trans->c;
529         struct open_bucket *ob = NULL;
530         bool freespace = READ_ONCE(ca->mi.freespace_initialized);
531         u64 avail;
532         struct bucket_alloc_state s = { 0 };
533         bool waiting = false;
534 again:
535         bch2_dev_usage_read_fast(ca, usage);
536         avail = dev_buckets_free(ca, *usage, watermark);
537
538         if (usage->d[BCH_DATA_need_discard].buckets > avail)
539                 bch2_do_discards(c);
540
541         if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
542                 bch2_do_gc_gens(c);
543
544         if (should_invalidate_buckets(ca, *usage))
545                 bch2_do_invalidates(c);
546
547         if (!avail) {
548                 if (cl && !waiting) {
549                         closure_wait(&c->freelist_wait, cl);
550                         waiting = true;
551                         goto again;
552                 }
553
554                 track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
555
556                 ob = ERR_PTR(-BCH_ERR_freelist_empty);
557                 goto err;
558         }
559
560         if (waiting)
561                 closure_wake_up(&c->freelist_wait);
562 alloc:
563         ob = likely(freespace)
564                 ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
565                 : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
566
567         if (s.skipped_need_journal_commit * 2 > avail)
568                 bch2_journal_flush_async(&c->journal, NULL);
569
570         if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
571                 freespace = false;
572                 goto alloc;
573         }
574 err:
575         if (!ob)
576                 ob = ERR_PTR(-BCH_ERR_no_buckets_found);
577
578         if (!IS_ERR(ob))
579                 trace_and_count(c, bucket_alloc, ca,
580                                 bch2_watermarks[watermark],
581                                 ob->bucket,
582                                 usage->d[BCH_DATA_free].buckets,
583                                 avail,
584                                 bch2_copygc_wait_amount(c),
585                                 c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
586                                 &s,
587                                 cl == NULL,
588                                 "");
589         else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
590                 trace_and_count(c, bucket_alloc_fail, ca,
591                                 bch2_watermarks[watermark],
592                                 0,
593                                 usage->d[BCH_DATA_free].buckets,
594                                 avail,
595                                 bch2_copygc_wait_amount(c),
596                                 c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
597                                 &s,
598                                 cl == NULL,
599                                 bch2_err_str(PTR_ERR(ob)));
600
601         return ob;
602 }
603
604 struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
605                                       enum bch_watermark watermark,
606                                       struct closure *cl)
607 {
608         struct bch_dev_usage usage;
609         struct open_bucket *ob;
610
611         bch2_trans_do(c, NULL, NULL, 0,
612                       PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
613                                                         cl, &usage)));
614         return ob;
615 }
616
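/*
 * Device striping: each device has a clock hand in stripe->next_alloc that
 * advances by an amount inversely proportional to its free space each time we
 * allocate from it; sorting devices by the smallest hand spreads allocations
 * across devices roughly in proportion to the space they have free.
 */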
617 static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
618                             unsigned l, unsigned r)
619 {
620         return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
621                 (stripe->next_alloc[l] < stripe->next_alloc[r]));
622 }
623
624 #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
625
626 struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
627                                           struct dev_stripe_state *stripe,
628                                           struct bch_devs_mask *devs)
629 {
630         struct dev_alloc_list ret = { .nr = 0 };
631         unsigned i;
632
633         for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
634                 ret.devs[ret.nr++] = i;
635
636         bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
637         return ret;
638 }
639
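/*
 * Advance a device's clock hand: free_space_inv is 2^48 / free_space, so, for
 * example, a device with twice the free buckets of another advances half as
 * fast and ends up being picked roughly twice as often. All hands are then
 * shifted down by a quarter of this hand's previous value (clamped at zero) so
 * the counters don't saturate at U64_MAX.
 */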
640 static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
641                                struct dev_stripe_state *stripe,
642                                struct bch_dev_usage *usage)
643 {
644         u64 *v = stripe->next_alloc + ca->dev_idx;
645         u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
646         u64 free_space_inv = free_space
647                 ? div64_u64(1ULL << 48, free_space)
648                 : 1ULL << 48;
649         u64 scale = *v / 4;
650
651         if (*v + free_space_inv >= *v)
652                 *v += free_space_inv;
653         else
654                 *v = U64_MAX;
655
656         for (v = stripe->next_alloc;
657              v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
658                 *v = *v < scale ? 0 : *v - scale;
659 }
660
661 void bch2_dev_stripe_increment(struct bch_dev *ca,
662                                struct dev_stripe_state *stripe)
663 {
664         struct bch_dev_usage usage;
665
666         bch2_dev_usage_read_fast(ca, &usage);
667         bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
668 }
669
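/*
 * Record a newly allocated bucket in ptrs: nr_effective counts replicas by
 * device durability (cache devices count for zero but set *have_cache).
 * Returns nonzero once we have enough effective replicas, or when an erasure
 * coded bucket was added, telling the caller to stop.
 */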
670 static int add_new_bucket(struct bch_fs *c,
671                            struct open_buckets *ptrs,
672                            struct bch_devs_mask *devs_may_alloc,
673                            unsigned nr_replicas,
674                            unsigned *nr_effective,
675                            bool *have_cache,
676                            unsigned flags,
677                            struct open_bucket *ob)
678 {
679         unsigned durability =
680                 bch_dev_bkey_exists(c, ob->dev)->mi.durability;
681
682         BUG_ON(*nr_effective >= nr_replicas);
683
684         __clear_bit(ob->dev, devs_may_alloc->d);
685         *nr_effective   += durability;
686         *have_cache     |= !durability;
687
688         ob_push(c, ptrs, ob);
689
690         if (*nr_effective >= nr_replicas)
691                 return 1;
692         if (ob->ec)
693                 return 1;
694         return 0;
695 }
696
697 int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
698                       struct open_buckets *ptrs,
699                       struct dev_stripe_state *stripe,
700                       struct bch_devs_mask *devs_may_alloc,
701                       unsigned nr_replicas,
702                       unsigned *nr_effective,
703                       bool *have_cache,
704                       unsigned flags,
705                       enum bch_data_type data_type,
706                       enum bch_watermark watermark,
707                       struct closure *cl)
708 {
709         struct bch_fs *c = trans->c;
710         struct dev_alloc_list devs_sorted =
711                 bch2_dev_alloc_list(c, stripe, devs_may_alloc);
712         unsigned dev;
713         struct bch_dev *ca;
714         int ret = -BCH_ERR_insufficient_devices;
715         unsigned i;
716
717         BUG_ON(*nr_effective >= nr_replicas);
718
719         for (i = 0; i < devs_sorted.nr; i++) {
720                 struct bch_dev_usage usage;
721                 struct open_bucket *ob;
722
723                 dev = devs_sorted.devs[i];
724
725                 rcu_read_lock();
726                 ca = rcu_dereference(c->devs[dev]);
727                 if (ca)
728                         percpu_ref_get(&ca->ref);
729                 rcu_read_unlock();
730
731                 if (!ca)
732                         continue;
733
734                 if (!ca->mi.durability && *have_cache) {
735                         percpu_ref_put(&ca->ref);
736                         continue;
737                 }
738
739                 ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
740                 if (!IS_ERR(ob))
741                         bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
742                 percpu_ref_put(&ca->ref);
743
744                 if (IS_ERR(ob)) {
745                         ret = PTR_ERR(ob);
746                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
747                                 break;
748                         continue;
749                 }
750
751                 ob->data_type = data_type;
752
753                 if (add_new_bucket(c, ptrs, devs_may_alloc,
754                                    nr_replicas, nr_effective,
755                                    have_cache, flags, ob)) {
756                         ret = 0;
757                         break;
758                 }
759         }
760
761         return ret;
762 }
763
764 /* Allocate from stripes: */
765
766 /*
767  * if we can't allocate a new stripe because there are already too many
768  * partially filled stripes, force allocating from an existing stripe even when
769  * it's to a device we don't want:
770  */
771
772 static int bucket_alloc_from_stripe(struct btree_trans *trans,
773                          struct open_buckets *ptrs,
774                          struct write_point *wp,
775                          struct bch_devs_mask *devs_may_alloc,
776                          u16 target,
777                          unsigned nr_replicas,
778                          unsigned *nr_effective,
779                          bool *have_cache,
780                          enum bch_watermark watermark,
781                          unsigned flags,
782                          struct closure *cl)
783 {
784         struct bch_fs *c = trans->c;
785         struct dev_alloc_list devs_sorted;
786         struct ec_stripe_head *h;
787         struct open_bucket *ob;
788         unsigned i, ec_idx;
789         int ret = 0;
790
791         if (nr_replicas < 2)
792                 return 0;
793
794         if (ec_open_bucket(c, ptrs))
795                 return 0;
796
797         h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
798         if (IS_ERR(h))
799                 return PTR_ERR(h);
800         if (!h)
801                 return 0;
802
803         devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
804
805         for (i = 0; i < devs_sorted.nr; i++)
806                 for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
807                         if (!h->s->blocks[ec_idx])
808                                 continue;
809
810                         ob = c->open_buckets + h->s->blocks[ec_idx];
811                         if (ob->dev == devs_sorted.devs[i] &&
812                             !test_and_set_bit(ec_idx, h->s->blocks_allocated))
813                                 goto got_bucket;
814                 }
815         goto out_put_head;
816 got_bucket:
817         ob->ec_idx      = ec_idx;
818         ob->ec          = h->s;
819         ec_stripe_new_get(h->s, STRIPE_REF_io);
820
821         ret = add_new_bucket(c, ptrs, devs_may_alloc,
822                              nr_replicas, nr_effective,
823                              have_cache, flags, ob);
824 out_put_head:
825         bch2_ec_stripe_head_put(c, h);
826         return ret;
827 }
828
829 /* Sector allocator */
830
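/*
 * Can this already-open bucket be reused for the given write point? It must be
 * on an allowed device, carry the write point's data type, match the erasure
 * coding requirement, and must not be a cache (durability 0) bucket when we
 * already have one or when allocating btree or ec data.
 */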
831 static bool want_bucket(struct bch_fs *c,
832                         struct write_point *wp,
833                         struct bch_devs_mask *devs_may_alloc,
834                         bool *have_cache, bool ec,
835                         struct open_bucket *ob)
836 {
837         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
838
839         if (!test_bit(ob->dev, devs_may_alloc->d))
840                 return false;
841
842         if (ob->data_type != wp->data_type)
843                 return false;
844
845         if (!ca->mi.durability &&
846             (wp->data_type == BCH_DATA_btree || ec || *have_cache))
847                 return false;
848
849         if (ec != (ob->ec != NULL))
850                 return false;
851
852         return true;
853 }
854
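/*
 * First preference: reuse buckets this write point already has open from a
 * previous allocation; anything we don't pick stays on wp->ptrs.
 */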
855 static int bucket_alloc_set_writepoint(struct bch_fs *c,
856                                        struct open_buckets *ptrs,
857                                        struct write_point *wp,
858                                        struct bch_devs_mask *devs_may_alloc,
859                                        unsigned nr_replicas,
860                                        unsigned *nr_effective,
861                                        bool *have_cache,
862                                        bool ec, unsigned flags)
863 {
864         struct open_buckets ptrs_skip = { .nr = 0 };
865         struct open_bucket *ob;
866         unsigned i;
867         int ret = 0;
868
869         open_bucket_for_each(c, &wp->ptrs, ob, i) {
870                 if (!ret && want_bucket(c, wp, devs_may_alloc,
871                                         have_cache, ec, ob))
872                         ret = add_new_bucket(c, ptrs, devs_may_alloc,
873                                        nr_replicas, nr_effective,
874                                        have_cache, flags, ob);
875                 else
876                         ob_push(c, &ptrs_skip, ob);
877         }
878         wp->ptrs = ptrs_skip;
879
880         return ret;
881 }
882
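/*
 * Second preference: buckets on the filesystem-wide partial list - buckets
 * that were allocated and then released with space still remaining. Buckets on
 * devices with nothing free at this watermark are skipped.
 */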
883 static int bucket_alloc_set_partial(struct bch_fs *c,
884                                     struct open_buckets *ptrs,
885                                     struct write_point *wp,
886                                     struct bch_devs_mask *devs_may_alloc,
887                                     unsigned nr_replicas,
888                                     unsigned *nr_effective,
889                                     bool *have_cache, bool ec,
890                                     enum bch_watermark watermark,
891                                     unsigned flags)
892 {
893         int i, ret = 0;
894
895         if (!c->open_buckets_partial_nr)
896                 return 0;
897
898         spin_lock(&c->freelist_lock);
899
900         if (!c->open_buckets_partial_nr)
901                 goto unlock;
902
903         for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
904                 struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
905
906                 if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
907                         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
908                         struct bch_dev_usage usage;
909                         u64 avail;
910
911                         bch2_dev_usage_read_fast(ca, &usage);
912                         avail = dev_buckets_free(ca, usage, watermark);
913                         if (!avail)
914                                 continue;
915
916                         array_remove_item(c->open_buckets_partial,
917                                           c->open_buckets_partial_nr,
918                                           i);
919                         ob->on_partial_list = false;
920
921                         ret = add_new_bucket(c, ptrs, devs_may_alloc,
922                                              nr_replicas, nr_effective,
923                                              have_cache, flags, ob);
924                         if (ret)
925                                 break;
926                 }
927         }
928 unlock:
929         spin_unlock(&c->freelist_lock);
930         return ret;
931 }
932
933 static int __open_bucket_add_buckets(struct btree_trans *trans,
934                         struct open_buckets *ptrs,
935                         struct write_point *wp,
936                         struct bch_devs_list *devs_have,
937                         u16 target,
938                         bool erasure_code,
939                         unsigned nr_replicas,
940                         unsigned *nr_effective,
941                         bool *have_cache,
942                         enum bch_watermark watermark,
943                         unsigned flags,
944                         struct closure *_cl)
945 {
946         struct bch_fs *c = trans->c;
947         struct bch_devs_mask devs;
948         struct open_bucket *ob;
949         struct closure *cl = NULL;
950         unsigned i;
951         int ret;
952
953         devs = target_rw_devs(c, wp->data_type, target);
954
955         /* Don't allocate from devices we already have pointers to: */
956         darray_for_each(*devs_have, i)
957                 __clear_bit(*i, devs.d);
958
959         open_bucket_for_each(c, ptrs, ob, i)
960                 __clear_bit(ob->dev, devs.d);
961
962         if (erasure_code && ec_open_bucket(c, ptrs))
963                 return 0;
964
965         ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
966                                  nr_replicas, nr_effective,
967                                  have_cache, erasure_code, flags);
968         if (ret)
969                 return ret;
970
971         ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
972                                  nr_replicas, nr_effective,
973                                  have_cache, erasure_code, watermark, flags);
974         if (ret)
975                 return ret;
976
977         if (erasure_code) {
978                 ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
979                                          target,
980                                          nr_replicas, nr_effective,
981                                          have_cache,
982                                          watermark, flags, _cl);
983         } else {
984 retry_blocking:
985                 /*
986                  * Try nonblocking first, so that if one device is full we'll try from
987                  * other devices:
988                  */
989                 ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
990                                         nr_replicas, nr_effective, have_cache,
991                                         flags, wp->data_type, watermark, cl);
992                 if (ret &&
993                     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
994                     !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
995                     !cl && _cl) {
996                         cl = _cl;
997                         goto retry_blocking;
998                 }
999         }
1000
1001         return ret;
1002 }
1003
1004 static int open_bucket_add_buckets(struct btree_trans *trans,
1005                         struct open_buckets *ptrs,
1006                         struct write_point *wp,
1007                         struct bch_devs_list *devs_have,
1008                         u16 target,
1009                         unsigned erasure_code,
1010                         unsigned nr_replicas,
1011                         unsigned *nr_effective,
1012                         bool *have_cache,
1013                         enum bch_watermark watermark,
1014                         unsigned flags,
1015                         struct closure *cl)
1016 {
1017         int ret;
1018
1019         if (erasure_code) {
1020                 ret = __open_bucket_add_buckets(trans, ptrs, wp,
1021                                 devs_have, target, erasure_code,
1022                                 nr_replicas, nr_effective, have_cache,
1023                                 watermark, flags, cl);
1024                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1025                     bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
1026                     bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
1027                     bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1028                         return ret;
1029                 if (*nr_effective >= nr_replicas)
1030                         return 0;
1031         }
1032
1033         ret = __open_bucket_add_buckets(trans, ptrs, wp,
1034                         devs_have, target, false,
1035                         nr_replicas, nr_effective, have_cache,
1036                         watermark, flags, cl);
1037         return ret < 0 ? ret : 0;
1038 }
1039
1040 /**
1041  * should_drop_bucket - check if this open_bucket should go away
1042  * @ob:         open_bucket to predicate on
1043  * @c:          filesystem handle
1044  * @ca:         if set, we're killing buckets for a particular device
1045  * @ec:         if true, we're shutting down erasure coding and killing all ec
1046  *              open_buckets; if neither @ca nor @ec is set, we're shutting down
1047  *              the whole filesystem and every open_bucket should be dropped
1048  * Returns: true if we should kill this open_bucket
1049  *
1050  * We're killing open_buckets because we're shutting down a device, erasure
1051  * coding, or the entire filesystem - check if this open_bucket matches:
1052  */
1053 static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
1054                                struct bch_dev *ca, bool ec)
1055 {
1056         if (ec) {
1057                 return ob->ec != NULL;
1058         } else if (ca) {
1059                 bool drop = ob->dev == ca->dev_idx;
1060                 struct open_bucket *ob2;
1061                 unsigned i;
1062
1063                 if (!drop && ob->ec) {
1064                         unsigned nr_blocks;
1065
1066                         mutex_lock(&ob->ec->lock);
1067                         nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
1068
1069                         for (i = 0; i < nr_blocks; i++) {
1070                                 if (!ob->ec->blocks[i])
1071                                         continue;
1072
1073                                 ob2 = c->open_buckets + ob->ec->blocks[i];
1074                                 drop |= ob2->dev == ca->dev_idx;
1075                         }
1076                         mutex_unlock(&ob->ec->lock);
1077                 }
1078
1079                 return drop;
1080         } else {
1081                 return true;
1082         }
1083 }
1084
1085 static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
1086                                  bool ec, struct write_point *wp)
1087 {
1088         struct open_buckets ptrs = { .nr = 0 };
1089         struct open_bucket *ob;
1090         unsigned i;
1091
1092         mutex_lock(&wp->lock);
1093         open_bucket_for_each(c, &wp->ptrs, ob, i)
1094                 if (should_drop_bucket(ob, c, ca, ec))
1095                         bch2_open_bucket_put(c, ob);
1096                 else
1097                         ob_push(c, &ptrs, ob);
1098         wp->ptrs = ptrs;
1099         mutex_unlock(&wp->lock);
1100 }
1101
1102 void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
1103                             bool ec)
1104 {
1105         unsigned i;
1106
1107         /* Next, close write points that point to this device... */
1108         for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1109                 bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
1110
1111         bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
1112         bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
1113         bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
1114
1115         mutex_lock(&c->btree_reserve_cache_lock);
1116         while (c->btree_reserve_cache_nr) {
1117                 struct btree_alloc *a =
1118                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1119
1120                 bch2_open_buckets_put(c, &a->ob);
1121         }
1122         mutex_unlock(&c->btree_reserve_cache_lock);
1123
1124         spin_lock(&c->freelist_lock);
1125         i = 0;
1126         while (i < c->open_buckets_partial_nr) {
1127                 struct open_bucket *ob =
1128                         c->open_buckets + c->open_buckets_partial[i];
1129
1130                 if (should_drop_bucket(ob, c, ca, ec)) {
1131                         --c->open_buckets_partial_nr;
1132                         swap(c->open_buckets_partial[i],
1133                              c->open_buckets_partial[c->open_buckets_partial_nr]);
1134                         ob->on_partial_list = false;
1135                         spin_unlock(&c->freelist_lock);
1136                         bch2_open_bucket_put(c, ob);
1137                         spin_lock(&c->freelist_lock);
1138                 } else {
1139                         i++;
1140                 }
1141         }
1142         spin_unlock(&c->freelist_lock);
1143
1144         bch2_ec_stop_dev(c, ca);
1145 }
1146
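/*
 * Write points are looked up by the caller supplied write_point token in a
 * small hash table; tokens with the low bit clear are instead direct pointers
 * to a struct write_point (see writepoint_find()).
 */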
1147 static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
1148                                                  unsigned long write_point)
1149 {
1150         unsigned hash =
1151                 hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
1152
1153         return &c->write_points_hash[hash];
1154 }
1155
1156 static struct write_point *__writepoint_find(struct hlist_head *head,
1157                                              unsigned long write_point)
1158 {
1159         struct write_point *wp;
1160
1161         rcu_read_lock();
1162         hlist_for_each_entry_rcu(wp, head, node)
1163                 if (wp->write_point == write_point)
1164                         goto out;
1165         wp = NULL;
1166 out:
1167         rcu_read_unlock();
1168         return wp;
1169 }
1170
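/*
 * Heuristic for growing and shrinking the write point pool: compare the worst
 * case space stranded in partially filled write point buckets
 * (write_points_nr * bucket_size_max) against free space, scaled by @factor.
 */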
1171 static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
1172 {
1173         u64 stranded    = c->write_points_nr * c->bucket_size_max;
1174         u64 free        = bch2_fs_usage_read_short(c).free;
1175
1176         return stranded * factor > free;
1177 }
1178
1179 static bool try_increase_writepoints(struct bch_fs *c)
1180 {
1181         struct write_point *wp;
1182
1183         if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
1184             too_many_writepoints(c, 32))
1185                 return false;
1186
1187         wp = c->write_points + c->write_points_nr++;
1188         hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
1189         return true;
1190 }
1191
1192 static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
1193 {
1194         struct bch_fs *c = trans->c;
1195         struct write_point *wp;
1196         struct open_bucket *ob;
1197         unsigned i;
1198
1199         mutex_lock(&c->write_points_hash_lock);
1200         if (c->write_points_nr < old_nr) {
1201                 mutex_unlock(&c->write_points_hash_lock);
1202                 return true;
1203         }
1204
1205         if (c->write_points_nr == 1 ||
1206             !too_many_writepoints(c, 8)) {
1207                 mutex_unlock(&c->write_points_hash_lock);
1208                 return false;
1209         }
1210
1211         wp = c->write_points + --c->write_points_nr;
1212
1213         hlist_del_rcu(&wp->node);
1214         mutex_unlock(&c->write_points_hash_lock);
1215
1216         bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1217         open_bucket_for_each(c, &wp->ptrs, ob, i)
1218                 open_bucket_free_unused(c, ob);
1219         wp->ptrs.nr = 0;
1220         mutex_unlock(&wp->lock);
1221         return true;
1222 }
1223
1224 static struct write_point *writepoint_find(struct btree_trans *trans,
1225                                            unsigned long write_point)
1226 {
1227         struct bch_fs *c = trans->c;
1228         struct write_point *wp, *oldest;
1229         struct hlist_head *head;
1230
1231         if (!(write_point & 1UL)) {
1232                 wp = (struct write_point *) write_point;
1233                 bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1234                 return wp;
1235         }
1236
1237         head = writepoint_hash(c, write_point);
1238 restart_find:
1239         wp = __writepoint_find(head, write_point);
1240         if (wp) {
1241 lock_wp:
1242                 bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1243                 if (wp->write_point == write_point)
1244                         goto out;
1245                 mutex_unlock(&wp->lock);
1246                 goto restart_find;
1247         }
1248 restart_find_oldest:
1249         oldest = NULL;
1250         for (wp = c->write_points;
1251              wp < c->write_points + c->write_points_nr; wp++)
1252                 if (!oldest || time_before64(wp->last_used, oldest->last_used))
1253                         oldest = wp;
1254
1255         bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
1256         bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
1257         if (oldest >= c->write_points + c->write_points_nr ||
1258             try_increase_writepoints(c)) {
1259                 mutex_unlock(&c->write_points_hash_lock);
1260                 mutex_unlock(&oldest->lock);
1261                 goto restart_find_oldest;
1262         }
1263
1264         wp = __writepoint_find(head, write_point);
1265         if (wp && wp != oldest) {
1266                 mutex_unlock(&c->write_points_hash_lock);
1267                 mutex_unlock(&oldest->lock);
1268                 goto lock_wp;
1269         }
1270
1271         wp = oldest;
1272         hlist_del_rcu(&wp->node);
1273         wp->write_point = write_point;
1274         hlist_add_head_rcu(&wp->node, head);
1275         mutex_unlock(&c->write_points_hash_lock);
1276 out:
1277         wp->last_used = local_clock();
1278         return wp;
1279 }
1280
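/*
 * We allocated more durability than asked for (e.g. a high durability device
 * covered several replicas at once): move just enough buckets out of ptrs, to
 * be released as unused by the caller, to shed the excess durability.
 */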
1281 static noinline void
1282 deallocate_extra_replicas(struct bch_fs *c,
1283                           struct open_buckets *ptrs,
1284                           struct open_buckets *ptrs_no_use,
1285                           unsigned extra_replicas)
1286 {
1287         struct open_buckets ptrs2 = { 0 };
1288         struct open_bucket *ob;
1289         unsigned i;
1290
1291         open_bucket_for_each(c, ptrs, ob, i) {
1292                 unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
1293
1294                 if (d && d <= extra_replicas) {
1295                         extra_replicas -= d;
1296                         ob_push(c, ptrs_no_use, ob);
1297                 } else {
1298                         ob_push(c, &ptrs2, ob);
1299                 }
1300         }
1301
1302         *ptrs = ptrs2;
1303 }
1304
1305 /*
1306  * Get us an open_bucket we can allocate from, return with it locked:
1307  */
1308 int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
1309                              unsigned target,
1310                              unsigned erasure_code,
1311                              struct write_point_specifier write_point,
1312                              struct bch_devs_list *devs_have,
1313                              unsigned nr_replicas,
1314                              unsigned nr_replicas_required,
1315                              enum bch_watermark watermark,
1316                              unsigned flags,
1317                              struct closure *cl,
1318                              struct write_point **wp_ret)
1319 {
1320         struct bch_fs *c = trans->c;
1321         struct write_point *wp;
1322         struct open_bucket *ob;
1323         struct open_buckets ptrs;
1324         unsigned nr_effective, write_points_nr;
1325         bool have_cache;
1326         int ret;
1327         int i;
1328
1329         if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
1330                 erasure_code = false;
1331
1332         BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
1333
1334         BUG_ON(!nr_replicas || !nr_replicas_required);
1335 retry:
1336         ptrs.nr         = 0;
1337         nr_effective    = 0;
1338         write_points_nr = c->write_points_nr;
1339         have_cache      = false;
1340
1341         *wp_ret = wp = writepoint_find(trans, write_point.v);
1342
1343         /* metadata may not allocate on cache devices: */
1344         if (wp->data_type != BCH_DATA_user)
1345                 have_cache = true;
1346
1347         if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
1348                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1349                                               target, erasure_code,
1350                                               nr_replicas, &nr_effective,
1351                                               &have_cache, watermark,
1352                                               flags, NULL);
1353                 if (!ret ||
1354                     bch2_err_matches(ret, BCH_ERR_transaction_restart))
1355                         goto alloc_done;
1356
1357                 /* Don't retry from all devices if we're out of open buckets: */
1358                 if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
1359                         int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1360                                               target, erasure_code,
1361                                               nr_replicas, &nr_effective,
1362                                               &have_cache, watermark,
1363                                               flags, cl);
1364                         if (!ret ||
1365                             bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1366                             bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1367                                 goto alloc_done;
1368                 }
1369
1370                 /*
1371                  * Only try to allocate cache (durability = 0 devices) from the
1372                  * specified target:
1373                  */
1374                 have_cache = true;
1375
1376                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1377                                               0, erasure_code,
1378                                               nr_replicas, &nr_effective,
1379                                               &have_cache, watermark,
1380                                               flags, cl);
1381         } else {
1382                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1383                                               target, erasure_code,
1384                                               nr_replicas, &nr_effective,
1385                                               &have_cache, watermark,
1386                                               flags, cl);
1387         }
1388 alloc_done:
1389         BUG_ON(!ret && nr_effective < nr_replicas);
1390
1391         if (erasure_code && !ec_open_bucket(c, &ptrs))
1392                 pr_debug("failed to get ec bucket: ret %i", ret);
1393
1394         if (ret == -BCH_ERR_insufficient_devices &&
1395             nr_effective >= nr_replicas_required)
1396                 ret = 0;
1397
1398         if (ret)
1399                 goto err;
1400
1401         if (nr_effective > nr_replicas)
1402                 deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
1403
1404         /* Free buckets we didn't use: */
1405         open_bucket_for_each(c, &wp->ptrs, ob, i)
1406                 open_bucket_free_unused(c, ob);
1407
1408         wp->ptrs = ptrs;
1409
1410         wp->sectors_free = UINT_MAX;
1411
1412         open_bucket_for_each(c, &wp->ptrs, ob, i)
1413                 wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
1414
1415         BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
1416
1417         return 0;
1418 err:
1419         open_bucket_for_each(c, &wp->ptrs, ob, i)
1420                 if (ptrs.nr < ARRAY_SIZE(ptrs.v))
1421                         ob_push(c, &ptrs, ob);
1422                 else
1423                         open_bucket_free_unused(c, ob);
1424         wp->ptrs = ptrs;
1425
1426         mutex_unlock(&wp->lock);
1427
1428         if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
1429             try_decrease_writepoints(trans, write_points_nr))
1430                 goto retry;
1431
1432         if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
1433             bch2_err_matches(ret, BCH_ERR_freelist_empty))
1434                 return cl
1435                         ? -BCH_ERR_bucket_alloc_blocked
1436                         : -BCH_ERR_ENOSPC_bucket_alloc;
1437
1438         return ret;
1439 }
1440
1441 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
1442 {
1443         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1444
1445         return (struct bch_extent_ptr) {
1446                 .type   = 1 << BCH_EXTENT_ENTRY_ptr,
1447                 .gen    = ob->gen,
1448                 .dev    = ob->dev,
1449                 .offset = bucket_to_sector(ca, ob->bucket) +
1450                         ca->mi.bucket_size -
1451                         ob->sectors_free,
1452         };
1453 }
1454
1455 /*
1456  * Append pointers to the space we just allocated to @k, and mark @sectors of
1457  * that space as allocated out of the write point's open buckets
1458  */
1459 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
1460                                     struct bkey_i *k, unsigned sectors,
1461                                     bool cached)
1462 {
1463         bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
1464 }
1465
1466 void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
1467 {
1468         bch2_alloc_sectors_done_inlined(c, wp);
1469 }
1470
1471 static inline void writepoint_init(struct write_point *wp,
1472                                    enum bch_data_type type)
1473 {
1474         mutex_init(&wp->lock);
1475         wp->data_type = type;
1476
1477         INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
1478         INIT_LIST_HEAD(&wp->writes);
1479         spin_lock_init(&wp->writes_lock);
1480 }
1481
1482 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
1483 {
1484         struct open_bucket *ob;
1485         struct write_point *wp;
1486
1487         mutex_init(&c->write_points_hash_lock);
1488         c->write_points_nr = ARRAY_SIZE(c->write_points);
1489
1490         /* open bucket 0 is a sentinel NULL: */
1491         spin_lock_init(&c->open_buckets[0].lock);
1492
1493         for (ob = c->open_buckets + 1;
1494              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
1495                 spin_lock_init(&ob->lock);
1496                 c->open_buckets_nr_free++;
1497
1498                 ob->freelist = c->open_buckets_freelist;
1499                 c->open_buckets_freelist = ob - c->open_buckets;
1500         }
1501
1502         writepoint_init(&c->btree_write_point,          BCH_DATA_btree);
1503         writepoint_init(&c->rebalance_write_point,      BCH_DATA_user);
1504         writepoint_init(&c->copygc_write_point,         BCH_DATA_user);
1505
1506         for (wp = c->write_points;
1507              wp < c->write_points + c->write_points_nr; wp++) {
1508                 writepoint_init(wp, BCH_DATA_user);
1509
1510                 wp->last_used   = local_clock();
1511                 wp->write_point = (unsigned long) wp;
1512                 hlist_add_head_rcu(&wp->node,
1513                                    writepoint_hash(c, wp->write_point));
1514         }
1515 }
1516
1517 static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
1518 {
1519         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1520         unsigned data_type = ob->data_type;
1521         barrier(); /* READ_ONCE() doesn't work on bitfields */
1522
1523         prt_printf(out, "%zu ref %u ",
1524                    ob - c->open_buckets,
1525                    atomic_read(&ob->pin));
1526         bch2_prt_data_type(out, data_type);
1527         prt_printf(out, " %u:%llu gen %u allocated %u/%u",
1528                    ob->dev, ob->bucket, ob->gen,
1529                    ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
1530         if (ob->ec)
1531                 prt_printf(out, " ec idx %llu", ob->ec->idx);
1532         if (ob->on_partial_list)
1533                 prt_str(out, " partial");
1534         prt_newline(out);
1535 }
1536
1537 void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
1538 {
1539         struct open_bucket *ob;
1540
1541         out->atomic++;
1542
1543         for (ob = c->open_buckets;
1544              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1545              ob++) {
1546                 spin_lock(&ob->lock);
1547                 if (ob->valid && !ob->on_partial_list)
1548                         bch2_open_bucket_to_text(out, c, ob);
1549                 spin_unlock(&ob->lock);
1550         }
1551
1552         --out->atomic;
1553 }
1554
1555 void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
1556 {
1557         unsigned i;
1558
1559         out->atomic++;
1560         spin_lock(&c->freelist_lock);
1561
1562         for (i = 0; i < c->open_buckets_partial_nr; i++)
1563                 bch2_open_bucket_to_text(out, c,
1564                                 c->open_buckets + c->open_buckets_partial[i]);
1565
1566         spin_unlock(&c->freelist_lock);
1567         --out->atomic;
1568 }
1569
1570 static const char * const bch2_write_point_states[] = {
1571 #define x(n)    #n,
1572         WRITE_POINT_STATES()
1573 #undef x
1574         NULL
1575 };
1576
1577 static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
1578                                      struct write_point *wp)
1579 {
1580         struct open_bucket *ob;
1581         unsigned i;
1582
1583         prt_printf(out, "%lu: ", wp->write_point);
1584         prt_human_readable_u64(out, wp->sectors_allocated);
1585
1586         prt_printf(out, " last wrote: ");
1587         bch2_pr_time_units(out, sched_clock() - wp->last_used);
1588
1589         for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
1590                 prt_printf(out, " %s: ", bch2_write_point_states[i]);
1591                 bch2_pr_time_units(out, wp->time[i]);
1592         }
1593
1594         prt_newline(out);
1595
1596         printbuf_indent_add(out, 2);
1597         open_bucket_for_each(c, &wp->ptrs, ob, i)
1598                 bch2_open_bucket_to_text(out, c, ob);
1599         printbuf_indent_sub(out, 2);
1600 }
1601
1602 void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
1603 {
1604         struct write_point *wp;
1605
1606         prt_str(out, "Foreground write points\n");
1607         for (wp = c->write_points;
1608              wp < c->write_points + ARRAY_SIZE(c->write_points);
1609              wp++)
1610                 bch2_write_point_to_text(out, c, wp);
1611
1612         prt_str(out, "Copygc write point\n");
1613         bch2_write_point_to_text(out, c, &c->copygc_write_point);
1614
1615         prt_str(out, "Rebalance write point\n");
1616         bch2_write_point_to_text(out, c, &c->rebalance_write_point);
1617
1618         prt_str(out, "Btree write point\n");
1619         bch2_write_point_to_text(out, c, &c->btree_write_point);
1620 }