bpf: Introduce bpf_mem_free_rcu() similar to kfree_rcu().
author Alexei Starovoitov <ast@kernel.org>
Thu, 6 Jul 2023 03:34:45 +0000 (20:34 -0700)
committer Daniel Borkmann <daniel@iogearbox.net>
Wed, 12 Jul 2023 21:45:23 +0000 (23:45 +0200)
Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
Unlike bpf_mem_[cache_]free(), which links objects into the per-cpu
free list for immediate reuse, the _rcu() flavor waits for a regular
RCU grace period and then moves the objects into the free_by_rcu_ttrace
list, where they wait for an RCU tasks trace grace period before being
freed into slab.

The life cycle of objects:
alloc: dequeue free_llist
free: enqueue free_llist
free_rcu: enqueue free_by_rcu -> waiting_for_gp
free_llist above high watermark -> free_by_rcu_ttrace
after RCU GP waiting_for_gp -> free_by_rcu_ttrace
free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab
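
For illustration only (not part of this patch), a minimal sketch of how
a kernel-internal user might pick the new _rcu() flavor over the
immediate-reuse one; the element size and function names example_*() are
made up, only the bpf_mem_alloc_init/bpf_mem_cache_alloc/
bpf_mem_cache_free_rcu/bpf_mem_alloc_destroy calls are the real API:

	/* Illustrative sketch; assumes the fixed-size (kmem_cache-style)
	 * mode, i.e. bpf_mem_alloc_init() called with size > 0 and !percpu.
	 */
	#include <linux/bpf_mem_alloc.h>

	static struct bpf_mem_alloc ma;

	static int example_init(void)
	{
		/* 64 bytes is an arbitrary element size for the example */
		return bpf_mem_alloc_init(&ma, 64, false);
	}

	static void example_use(void)
	{
		void *obj = bpf_mem_cache_alloc(&ma);

		if (!obj)
			return;
		/* ... publish obj to readers that may access it under RCU ... */

		/* bpf_mem_cache_free(&ma, obj) would put obj straight back on
		 * the per-cpu free_llist, so it could be reused immediately.
		 * The _rcu() flavor queues obj on free_by_rcu instead; it is
		 * handed back for reuse only after a regular RCU GP and
		 * reaches slab only after a further RCU tasks trace GP, so
		 * RCU readers do not see it recycled under them.
		 */
		bpf_mem_cache_free_rcu(&ma, obj);
	}

	static void example_exit(void)
	{
		bpf_mem_alloc_destroy(&ma);
	}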

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20230706033447.54696-13-alexei.starovoitov@gmail.com
include/linux/bpf_mem_alloc.h
kernel/bpf/memalloc.c

index 3929be5743f49556c3dcda132d43b769f4356144..d644bbb298af4aa3b2613cf51c426382e971f051 100644 (file)
@@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 /* kmalloc/kfree equivalent: */
 void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
 void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
 
 /* kmem_cache_alloc/free equivalent: */
 void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
 void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
 void bpf_mem_cache_raw_free(void *ptr);
 void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
 
index e5a87f6cf2ccd64b9b736610bdd540d413ac532b..17ef2e9b278a39c8a38b6dda4616125abd73bda4 100644 (file)
@@ -101,6 +101,15 @@ struct bpf_mem_cache {
        bool draining;
        struct bpf_mem_cache *tgt;
 
+       /* list of objects to be freed after RCU GP */
+       struct llist_head free_by_rcu;
+       struct llist_node *free_by_rcu_tail;
+       struct llist_head waiting_for_gp;
+       struct llist_node *waiting_for_gp_tail;
+       struct rcu_head rcu;
+       atomic_t call_rcu_in_progress;
+       struct llist_head free_llist_extra_rcu;
+
        /* list of objects to be freed after RCU tasks trace GP */
        struct llist_head free_by_rcu_ttrace;
        struct llist_head waiting_for_gp_ttrace;
@@ -346,6 +355,69 @@ static void free_bulk(struct bpf_mem_cache *c)
        do_call_rcu_ttrace(tgt);
 }
 
+static void __free_by_rcu(struct rcu_head *head)
+{
+       struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+       struct bpf_mem_cache *tgt = c->tgt;
+       struct llist_node *llnode;
+
+       llnode = llist_del_all(&c->waiting_for_gp);
+       if (!llnode)
+               goto out;
+
+       llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
+
+       /* Objects went through regular RCU GP. Send them to RCU tasks trace */
+       do_call_rcu_ttrace(tgt);
+out:
+       atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void check_free_by_rcu(struct bpf_mem_cache *c)
+{
+       struct llist_node *llnode, *t;
+       unsigned long flags;
+
+       /* drain free_llist_extra_rcu */
+       if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
+               inc_active(c, &flags);
+               llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
+                       if (__llist_add(llnode, &c->free_by_rcu))
+                               c->free_by_rcu_tail = llnode;
+               dec_active(c, flags);
+       }
+
+       if (llist_empty(&c->free_by_rcu))
+               return;
+
+       if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
+               /*
+                * Instead of kmalloc-ing new rcu_head and triggering 10k
+                * call_rcu() to hit rcutree.qhimark and force RCU to notice
+                * the overload just ask RCU to hurry up. There could be many
+                * objects in free_by_rcu list.
+                * This hint reduces memory consumption for an artificial
+                * benchmark from 2 Gbyte to 150 Mbyte.
+                */
+               rcu_request_urgent_qs_task(current);
+               return;
+       }
+
+       WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+
+       inc_active(c, &flags);
+       WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
+       c->waiting_for_gp_tail = c->free_by_rcu_tail;
+       dec_active(c, flags);
+
+       if (unlikely(READ_ONCE(c->draining))) {
+               free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+               atomic_set(&c->call_rcu_in_progress, 0);
+       } else {
+               call_rcu_hurry(&c->rcu, __free_by_rcu);
+       }
+}
+
 static void bpf_mem_refill(struct irq_work *work)
 {
        struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
@@ -360,6 +432,8 @@ static void bpf_mem_refill(struct irq_work *work)
                alloc_bulk(c, c->batch, NUMA_NO_NODE);
        else if (cnt > c->high_watermark)
                free_bulk(c);
+
+       check_free_by_rcu(c);
 }
 
 static void notrace irq_work_raise(struct bpf_mem_cache *c)
@@ -488,6 +562,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
        free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
        free_all(__llist_del_all(&c->free_llist), percpu);
        free_all(__llist_del_all(&c->free_llist_extra), percpu);
+       free_all(__llist_del_all(&c->free_by_rcu), percpu);
+       free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
+       free_all(llist_del_all(&c->waiting_for_gp), percpu);
 }
 
 static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
@@ -500,8 +577,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
 
 static void free_mem_alloc(struct bpf_mem_alloc *ma)
 {
-       /* waiting_for_gp_ttrace lists was drained, but __free_rcu might
-        * still execute. Wait for it now before we freeing percpu caches.
+       /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+        * might still execute. Wait for them.
         *
         * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
         * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
@@ -510,7 +587,8 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
         * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
         * using rcu_trace_implies_rcu_gp() as well.
         */
-       rcu_barrier_tasks_trace();
+       rcu_barrier(); /* wait for __free_by_rcu */
+       rcu_barrier_tasks_trace(); /* wait for __free_rcu */
        if (!rcu_trace_implies_rcu_gp())
                rcu_barrier();
        free_mem_alloc_no_barrier(ma);
@@ -563,6 +641,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
                        irq_work_sync(&c->refill_work);
                        drain_mem_cache(c);
                        rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+                       rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                }
                /* objcg is the same across cpus */
                if (c->objcg)
@@ -579,6 +658,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
                                irq_work_sync(&c->refill_work);
                                drain_mem_cache(c);
                                rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+                               rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                        }
                }
                if (c->objcg)
@@ -663,6 +743,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
                irq_work_raise(c);
 }
 
+static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
+{
+       struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+       unsigned long flags;
+
+       c->tgt = *(struct bpf_mem_cache **)llnode;
+
+       local_irq_save(flags);
+       if (local_inc_return(&c->active) == 1) {
+               if (__llist_add(llnode, &c->free_by_rcu))
+                       c->free_by_rcu_tail = llnode;
+       } else {
+               llist_add(llnode, &c->free_llist_extra_rcu);
+       }
+       local_dec(&c->active);
+       local_irq_restore(flags);
+
+       if (!atomic_read(&c->call_rcu_in_progress))
+               irq_work_raise(c);
+}
+
 /* Called from BPF program or from sys_bpf syscall.
  * In both cases migration is disabled.
  */
@@ -696,6 +797,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
        unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
 }
 
+void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+       int idx;
+
+       if (!ptr)
+               return;
+
+       idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+       if (idx < 0)
+               return;
+
+       unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
 void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
 {
        void *ret;
@@ -712,6 +827,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
        unit_free(this_cpu_ptr(ma->cache), ptr);
 }
 
+void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+       if (!ptr)
+               return;
+
+       unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
+}
+
 /* Directly does a kfree() without putting 'ptr' back to the free_llist
  * for reuse and without waiting for a rcu_tasks_trace gp.
  * The caller must first go through the rcu_tasks_trace gp for 'ptr'