cgroup: add support for eBPF programs
author Daniel Mack <daniel@zonque.org>
Wed, 23 Nov 2016 15:52:26 +0000 (16:52 +0100)
committer David S. Miller <davem@davemloft.net>
Fri, 25 Nov 2016 21:25:52 +0000 (16:25 -0500)
This patch adds two sets of eBPF program pointers to struct cgroup.
One set holds the programs that are directly pinned to a cgroup, and
the other holds the programs that are effective for it.

To illustrate the logic behind that, assume the following example
cgroup hierarchy.

  A - B - C
        \ D - E

If only B has a program attached, it will be effective for B, C, D
and E. If D then attaches a program itself, that will be effective for
both D and E, and the program in B will only affect B and C. Only one
program of a given type is effective for a cgroup.

Attaching and detaching programs will be done through the bpf(2)
syscall. For now, ingress and egress inet socket filtering are the
only supported use-cases.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/bpf-cgroup.h [new file with mode: 0644]
include/linux/cgroup-defs.h
init/Kconfig
kernel/bpf/Makefile
kernel/bpf/cgroup.c [new file with mode: 0644]
kernel/cgroup.c

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
new file mode 100644 (file)
index 0000000..ec80d0c
--- /dev/null
@@ -0,0 +1,79 @@
+#ifndef _BPF_CGROUP_H
+#define _BPF_CGROUP_H
+
+#include <linux/bpf.h>
+#include <linux/jump_label.h>
+#include <uapi/linux/bpf.h>
+
+struct sock;
+struct cgroup;
+struct sk_buff;
+
+#ifdef CONFIG_CGROUP_BPF
+
+extern struct static_key_false cgroup_bpf_enabled_key;
+#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+
+struct cgroup_bpf {
+       /*
+        * Store two sets of bpf_prog pointers, one for programs that are
+        * pinned directly to this cgroup, and one for those that are effective
+        * when this cgroup is accessed.
+        */
+       struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
+       struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE];
+};
+
+void cgroup_bpf_put(struct cgroup *cgrp);
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+
+void __cgroup_bpf_update(struct cgroup *cgrp,
+                        struct cgroup *parent,
+                        struct bpf_prog *prog,
+                        enum bpf_attach_type type);
+
+/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
+void cgroup_bpf_update(struct cgroup *cgrp,
+                      struct bpf_prog *prog,
+                      enum bpf_attach_type type);
+
+int __cgroup_bpf_run_filter(struct sock *sk,
+                           struct sk_buff *skb,
+                           enum bpf_attach_type type);
+
+/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb)                       \
+({                                                                     \
+       int __ret = 0;                                                  \
+       if (cgroup_bpf_enabled)                                         \
+               __ret = __cgroup_bpf_run_filter(sk, skb,                \
+                                               BPF_CGROUP_INET_INGRESS); \
+                                                                       \
+       __ret;                                                          \
+})
+
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb)                                \
+({                                                                     \
+       int __ret = 0;                                                  \
+       if (cgroup_bpf_enabled && sk && sk == skb->sk) {                \
+               typeof(sk) __sk = sk_to_full_sk(sk);                    \
+               if (sk_fullsock(__sk))                                  \
+                       __ret = __cgroup_bpf_run_filter(__sk, skb,      \
+                                               BPF_CGROUP_INET_EGRESS); \
+       }                                                               \
+       __ret;                                                          \
+})
+
+#else
+
+struct cgroup_bpf {};
+static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
+                                     struct cgroup *parent) {}
+
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+
+#endif /* CONFIG_CGROUP_BPF */
+
+#endif /* _BPF_CGROUP_H */
index 5b17de62c962cd73d625427c2230d66e08cbcb4b..861b4677fc5b41134f96da33710a79735b827fe0 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
+#include <linux/bpf-cgroup.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -300,6 +301,9 @@ struct cgroup {
        /* used to schedule release agent */
        struct work_struct release_agent_work;
 
+       /* used to store eBPF programs */
+       struct cgroup_bpf bpf;
+
        /* ids of the ancestors at each level including self */
        int ancestor_ids[];
 };
index 34407f15e6d34da57be238f69441f1dad9e60764..405120b5f13e843caeb70cf87ee70458bc2746dd 100644 (file)
@@ -1154,6 +1154,18 @@ config CGROUP_PERF
 
          Say N if unsure.
 
+config CGROUP_BPF
+       bool "Support for eBPF programs attached to cgroups"
+       depends on BPF_SYSCALL && SOCK_CGROUP_DATA
+       help
+         Allow attaching eBPF programs to a cgroup using the bpf(2)
+         syscall command BPF_PROG_ATTACH.
+
+         The context in which these programs are executed depends on the type
+         of attachment. For instance, programs that are attached using
+         BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
+         inet sockets.
+
 config CGROUP_DEBUG
        bool "Example controller"
        default n
index c4d89d6e2058481d149733d4767cf98bd99eeaa9..1276474ac3cd9ddf2a87f37312edc661d2ec335c 100644 (file)
@@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
+obj-$(CONFIG_CGROUP_BPF) += cgroup.o
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
new file mode 100644 (file)
index 0000000..a0ab43f
--- /dev/null
@@ -0,0 +1,167 @@
+/*
+ * Functions to manage eBPF programs attached to cgroups
+ *
+ * Copyright (c) 2016 Daniel Mack
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <net/sock.h>
+
+DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/**
+ * cgroup_bpf_put() - put references of all bpf programs
+ * @cgrp: the cgroup to modify
+ */
+void cgroup_bpf_put(struct cgroup *cgrp)
+{
+       unsigned int type;
+
+       for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
+               struct bpf_prog *prog = cgrp->bpf.prog[type];
+
+               if (prog) {
+                       bpf_prog_put(prog);
+                       static_branch_dec(&cgroup_bpf_enabled_key);
+               }
+       }
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ * @parent: the parent to inherit from
+ */
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+{
+       unsigned int type;
+
+       for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
+               struct bpf_prog *e;
+
+               e = rcu_dereference_protected(parent->bpf.effective[type],
+                                             lockdep_is_held(&cgroup_mutex));
+               rcu_assign_pointer(cgrp->bpf.effective[type], e);
+       }
+}
+
+/**
+ * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ *                         propagate the change to descendants
+ * @cgrp: The cgroup whose descendants to traverse
+ * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
+ * @prog: A new program to pin, or %NULL to release the pinned one
+ * @type: Type of pinning operation (ingress/egress)
+ *
+ * Each cgroup has a set of two pointers for bpf programs; one for eBPF
+ * programs it owns, and one which is effective for execution.
+ *
+ * If @prog is not %NULL, this function attaches a new program to the cgroup
+ * and releases the one that is currently attached, if any. @prog is then made
+ * the effective program of type @type in that cgroup.
+ *
+ * If @prog is %NULL, the currently attached program of type @type is released,
+ * and the effective program of the parent cgroup (if any) is inherited to
+ * @cgrp.
+ *
+ * Then, the descendants of @cgrp are walked and the effective program for
+ * each of them is set to the effective program of @cgrp unless the
+ * descendant has its own program attached, in which case the subbranch is
+ * skipped. This ensures that delegated subcgroups with their own programs
+ * are left untouched.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+void __cgroup_bpf_update(struct cgroup *cgrp,
+                        struct cgroup *parent,
+                        struct bpf_prog *prog,
+                        enum bpf_attach_type type)
+{
+       struct bpf_prog *old_prog, *effective;
+       struct cgroup_subsys_state *pos;
+
+       old_prog = xchg(cgrp->bpf.prog + type, prog);
+
+       effective = (!prog && parent) ?
+               rcu_dereference_protected(parent->bpf.effective[type],
+                                         lockdep_is_held(&cgroup_mutex)) :
+               prog;
+
+       css_for_each_descendant_pre(pos, &cgrp->self) {
+               struct cgroup *desc = container_of(pos, struct cgroup, self);
+
+               /* skip the subtree if the descendant has its own program */
+               if (desc->bpf.prog[type] && desc != cgrp)
+                       pos = css_rightmost_descendant(pos);
+               else
+                       rcu_assign_pointer(desc->bpf.effective[type],
+                                          effective);
+       }
+
+       if (prog)
+               static_branch_inc(&cgroup_bpf_enabled_key);
+
+       if (old_prog) {
+               bpf_prog_put(old_prog);
+               static_branch_dec(&cgroup_bpf_enabled_key);
+       }
+}
+
+/**
+ * __cgroup_bpf_run_filter() - Run a program for packet filtering
+ * @sk: The socket sending or receiving traffic
+ * @skb: The skb that is being sent or received
+ * @type: The type of program to be executed
+ *
+ * If no socket is passed, the socket is not a full socket, or it is not of
+ * family AF_INET or AF_INET6, this function does nothing and returns 0.
+ *
+ * The program type passed in via @type must be suitable for network
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if an attached program was found
+ * and it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter(struct sock *sk,
+                           struct sk_buff *skb,
+                           enum bpf_attach_type type)
+{
+       struct bpf_prog *prog;
+       struct cgroup *cgrp;
+       int ret = 0;
+
+       if (!sk || !sk_fullsock(sk))
+               return 0;
+
+       if (sk->sk_family != AF_INET &&
+           sk->sk_family != AF_INET6)
+               return 0;
+
+       cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+
+       rcu_read_lock();
+
+       prog = rcu_dereference(cgrp->bpf.effective[type]);
+       if (prog) {
+               unsigned int offset = skb->data - skb_network_header(skb);
+
+               __skb_push(skb, offset);
+               ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
+               __skb_pull(skb, offset);
+       }
+
+       rcu_read_unlock();
+
+       return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter);
index 85bc9beb046d9a6deda2e3564f4d5bd01d6fc27b..2ee9ec3051b20774b118a57e4609f30e87bf82be 100644 (file)
@@ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work)
                if (cgrp->kn)
                        RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
                                         NULL);
+
+               cgroup_bpf_put(cgrp);
        }
 
        mutex_unlock(&cgroup_mutex);
@@ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
        if (!cgroup_on_dfl(cgrp))
                cgrp->subtree_control = cgroup_control(cgrp);
 
+       if (parent)
+               cgroup_bpf_inherit(cgrp, parent);
+
        cgroup_propagate_control(cgrp);
 
        /* @cgrp doesn't have dir yet so the following will only create csses */
@@ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void)
 }
 subsys_initcall(cgroup_namespaces_init);
 
+#ifdef CONFIG_CGROUP_BPF
+void cgroup_bpf_update(struct cgroup *cgrp,
+                      struct bpf_prog *prog,
+                      enum bpf_attach_type type)
+{
+       struct cgroup *parent = cgroup_parent(cgrp);
+
+       mutex_lock(&cgroup_mutex);
+       __cgroup_bpf_update(cgrp, parent, prog, type);
+       mutex_unlock(&cgroup_mutex);
+}
+#endif /* CONFIG_CGROUP_BPF */
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *
 debug_css_alloc(struct cgroup_subsys_state *parent_css)