bpf: Add poke dependency tracking for prog array maps
author     Daniel Borkmann <daniel@iogearbox.net>
           Fri, 22 Nov 2019 20:07:58 +0000 (21:07 +0100)
committer  Alexei Starovoitov <ast@kernel.org>
           Mon, 25 Nov 2019 01:04:11 +0000 (17:04 -0800)
This work adds program tracking to prog array maps. This is needed so
that upon prog array updates/deletions we can fix up all programs that
make use of the tail call map. We add ops->map_poke_{un,}track()
helpers to maps for maintaining the list of programs, and
ops->map_poke_run() for triggering the actual update.
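
As a rough caller-side sketch (hypothetical usage; the program-side
wiring is done by a separate change and is not part of this patch),
track and untrack would pair up as follows:

  /* Hypothetical prog-side usage; assumes the verifier/JIT already
   * collected a poke descriptor referencing the tail call map.
   */
  struct bpf_map *map = prog->aux->poke_tab[i].tail_call.map;
  int ret;

  if (map->ops->map_poke_track) {
          ret = map->ops->map_poke_track(map, prog->aux);
          if (ret < 0)
                  return ret;
  }
  [...]
  if (map->ops->map_poke_untrack)
          map->ops->map_poke_untrack(map, prog->aux);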

bpf_array_aux is extended to contain the list head and a poke_mutex in
order to serialize program patching during updates/deletions.
bpf_free_used_maps() will untrack the program shortly before dropping
the reference to the map. For clearing out the prog array once all
urefs are dropped, we need to use schedule_work() to get a sleepable
context.
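
Condensed from the diff below (prog_array_map_clear() plus its
deferred worker), the pattern pairs a map reference with the scheduled
work so the map stays alive until the worker has run:

  /* Clearing needs a sleepable context since fd_array_map_delete_elem()
   * now takes poke_mutex, hence defer the actual clearing to a
   * workqueue; the worker calls bpf_fd_array_map_clear() and drops
   * the reference again via bpf_map_put().
   */
  static void prog_array_map_clear(struct bpf_map *map)
  {
          struct bpf_array_aux *aux = container_of(map, struct bpf_array,
                                                   map)->aux;
          bpf_map_inc(map);
          schedule_work(&aux->work);
  }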

prog_array_map_poke_run() is triggered during updates/deletions and
walks the maintained list of programs. For each program it checks in
the poke_tab whether map and key match and, if so, runs the actual
bpf_arch_text_poke() to patch in the nop or the new jmp location.
Depending on the type of update, one of
BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP} is used.
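
In short, the (old, new) pointer combination selects the poke type,
and the patch target is the program image address plus the poke
descriptor's adjusted offset:

  old == NULL && new != NULL  ->  BPF_MOD_NOP_TO_JUMP
  old != NULL && new == NULL  ->  BPF_MOD_JUMP_TO_NOP
  old != NULL && new != NULL  ->  BPF_MOD_JUMP_TO_JUMP

  /* Patch site update as issued from prog_array_map_poke_run(): */
  bpf_arch_text_poke(poke->ip, type,
                     old ? (u8 *)old->bpf_func + poke->adj_off : NULL,
                     new ? (u8 *)new->bpf_func + poke->adj_off : NULL);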

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/1fb364bb3c565b3e415d5ea348f036ff379e779d.1574452833.git.daniel@iogearbox.net
include/linux/bpf.h
kernel/bpf/arraymap.c
kernel/bpf/core.c
kernel/bpf/syscall.c

include/linux/bpf.h
index 312983bf7faaa184ec1fc8ff0675e8a0adc4e9ad..c2f07fd410c17b2ef8f1c269933831d8a2d638aa 100644 (file)
@@ -22,6 +22,7 @@ struct bpf_verifier_env;
 struct bpf_verifier_log;
 struct perf_event;
 struct bpf_prog;
+struct bpf_prog_aux;
 struct bpf_map;
 struct sock;
 struct seq_file;
@@ -64,6 +65,12 @@ struct bpf_map_ops {
                             const struct btf_type *key_type,
                             const struct btf_type *value_type);
 
+       /* Prog poke tracking helpers. */
+       int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
+       void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
+       void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
+                            struct bpf_prog *new);
+
        /* Direct value access helpers. */
        int (*map_direct_value_addr)(const struct bpf_map *map,
                                     u64 *imm, u32 off);
@@ -588,6 +595,11 @@ struct bpf_array_aux {
         */
        enum bpf_prog_type type;
        bool jited;
+       /* Programs with direct jumps into programs in this array. */
+       struct list_head poke_progs;
+       struct bpf_map *map;
+       struct mutex poke_mutex;
+       struct work_struct work;
 };
 
 struct bpf_array {
kernel/bpf/arraymap.c
index 57da950ee55b51d94208935f43cadfb7152e1750..58bdf5fd24cc8c3c9d39902203bbf778e131d3bd 100644 (file)
@@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
        if (IS_ERR(new_ptr))
                return PTR_ERR(new_ptr);
 
-       old_ptr = xchg(array->ptrs + index, new_ptr);
+       if (map->ops->map_poke_run) {
+               mutex_lock(&array->aux->poke_mutex);
+               old_ptr = xchg(array->ptrs + index, new_ptr);
+               map->ops->map_poke_run(map, index, old_ptr, new_ptr);
+               mutex_unlock(&array->aux->poke_mutex);
+       } else {
+               old_ptr = xchg(array->ptrs + index, new_ptr);
+       }
+
        if (old_ptr)
                map->ops->map_fd_put_ptr(old_ptr);
-
        return 0;
 }
 
@@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
        if (index >= array->map.max_entries)
                return -E2BIG;
 
-       old_ptr = xchg(array->ptrs + index, NULL);
+       if (map->ops->map_poke_run) {
+               mutex_lock(&array->aux->poke_mutex);
+               old_ptr = xchg(array->ptrs + index, NULL);
+               map->ops->map_poke_run(map, index, old_ptr, NULL);
+               mutex_unlock(&array->aux->poke_mutex);
+       } else {
+               old_ptr = xchg(array->ptrs + index, NULL);
+       }
+
        if (old_ptr) {
                map->ops->map_fd_put_ptr(old_ptr);
                return 0;
@@ -671,6 +686,152 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
        rcu_read_unlock();
 }
 
+struct prog_poke_elem {
+       struct list_head list;
+       struct bpf_prog_aux *aux;
+};
+
+static int prog_array_map_poke_track(struct bpf_map *map,
+                                    struct bpf_prog_aux *prog_aux)
+{
+       struct prog_poke_elem *elem;
+       struct bpf_array_aux *aux;
+       int ret = 0;
+
+       aux = container_of(map, struct bpf_array, map)->aux;
+       mutex_lock(&aux->poke_mutex);
+       list_for_each_entry(elem, &aux->poke_progs, list) {
+               if (elem->aux == prog_aux)
+                       goto out;
+       }
+
+       elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+       if (!elem) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       INIT_LIST_HEAD(&elem->list);
+       /* We must track the program's aux info at this point in time
+        * since the program pointer itself may not be stable yet; see
+        * also the comment in prog_array_map_poke_run().
+        */
+       elem->aux = prog_aux;
+
+       list_add_tail(&elem->list, &aux->poke_progs);
+out:
+       mutex_unlock(&aux->poke_mutex);
+       return ret;
+}
+
+static void prog_array_map_poke_untrack(struct bpf_map *map,
+                                       struct bpf_prog_aux *prog_aux)
+{
+       struct prog_poke_elem *elem, *tmp;
+       struct bpf_array_aux *aux;
+
+       aux = container_of(map, struct bpf_array, map)->aux;
+       mutex_lock(&aux->poke_mutex);
+       list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+               if (elem->aux == prog_aux) {
+                       list_del_init(&elem->list);
+                       kfree(elem);
+                       break;
+               }
+       }
+       mutex_unlock(&aux->poke_mutex);
+}
+
+static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
+                                   struct bpf_prog *old,
+                                   struct bpf_prog *new)
+{
+       enum bpf_text_poke_type type;
+       struct prog_poke_elem *elem;
+       struct bpf_array_aux *aux;
+
+       if (!old && new)
+               type = BPF_MOD_NOP_TO_JUMP;
+       else if (old && !new)
+               type = BPF_MOD_JUMP_TO_NOP;
+       else if (old && new)
+               type = BPF_MOD_JUMP_TO_JUMP;
+       else
+               return;
+
+       aux = container_of(map, struct bpf_array, map)->aux;
+       WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
+
+       list_for_each_entry(elem, &aux->poke_progs, list) {
+               struct bpf_jit_poke_descriptor *poke;
+               int i, ret;
+
+               for (i = 0; i < elem->aux->size_poke_tab; i++) {
+                       poke = &elem->aux->poke_tab[i];
+
+                       /* A few things to be aware of:
+                        *
+                        * 1) We can only ever access aux in this context, but
+                        *    not aux->prog since it might not be stable yet and
+                        *    there could be danger of use after free otherwise.
+                        * 2) Initially when we start tracking aux, the program
+                        *    is not JITed yet and also does not have a kallsyms
+                        *    entry. We skip these as poke->ip_stable is not
+                        *    active yet. The JIT will do the final fixup before
+                        *    setting it stable. The various poke->ip_stable are
+                        *    successively activated, so tail call updates can
+                        *    arrive from here while the JIT is still finishing
+                        *    its final fixup for non-activated poke entries.
+                        * 3) On program teardown, the program's kallsyms entry gets
+                        *    removed out of an RCU callback, but we can only untrack
+                        *    from a sleepable context, therefore bpf_arch_text_poke()
+                        *    might not see that this is in the BPF text section and
+                        *    bail out with -EINVAL. As these are unreachable since the
+                        *    RCU grace period has already passed, we simply skip them.
+                        * 4) Programs reaching a refcount of zero while patching
+                        *    is in progress are also okay since we're protected
+                        *    under poke_mutex and untrack the programs before the
+                        *    JIT buffer is freed. When we're still in the middle
+                        *    of patching and the program's kallsyms entry suddenly
+                        *    gets evicted, we just skip the rest, which is fine
+                        *    due to point 3).
+                        * 5) Any other error happening below from bpf_arch_text_poke()
+                        *    is an unexpected bug.
+                        */
+                       if (!READ_ONCE(poke->ip_stable))
+                               continue;
+                       if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+                               continue;
+                       if (poke->tail_call.map != map ||
+                           poke->tail_call.key != key)
+                               continue;
+
+                       ret = bpf_arch_text_poke(poke->ip, type,
+                                                old ? (u8 *)old->bpf_func +
+                                                poke->adj_off : NULL,
+                                                new ? (u8 *)new->bpf_func +
+                                                poke->adj_off : NULL);
+                       BUG_ON(ret < 0 && ret != -EINVAL);
+               }
+       }
+}
+
+static void prog_array_map_clear_deferred(struct work_struct *work)
+{
+       struct bpf_map *map = container_of(work, struct bpf_array_aux,
+                                          work)->map;
+       bpf_fd_array_map_clear(map);
+       bpf_map_put(map);
+}
+
+static void prog_array_map_clear(struct bpf_map *map)
+{
+       struct bpf_array_aux *aux = container_of(map, struct bpf_array,
+                                                map)->aux;
+       bpf_map_inc(map);
+       schedule_work(&aux->work);
+}
+
 static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 {
        struct bpf_array_aux *aux;
@@ -680,6 +841,10 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
        if (!aux)
                return ERR_PTR(-ENOMEM);
 
+       INIT_WORK(&aux->work, prog_array_map_clear_deferred);
+       INIT_LIST_HEAD(&aux->poke_progs);
+       mutex_init(&aux->poke_mutex);
+
        map = array_map_alloc(attr);
        if (IS_ERR(map)) {
                kfree(aux);
@@ -687,14 +852,21 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
        }
 
        container_of(map, struct bpf_array, map)->aux = aux;
+       aux->map = map;
+
        return map;
 }
 
 static void prog_array_map_free(struct bpf_map *map)
 {
+       struct prog_poke_elem *elem, *tmp;
        struct bpf_array_aux *aux;
 
        aux = container_of(map, struct bpf_array, map)->aux;
+       list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+               list_del_init(&elem->list);
+               kfree(elem);
+       }
        kfree(aux);
        fd_array_map_free(map);
 }
@@ -703,13 +875,16 @@ const struct bpf_map_ops prog_array_map_ops = {
        .map_alloc_check = fd_array_map_alloc_check,
        .map_alloc = prog_array_map_alloc,
        .map_free = prog_array_map_free,
+       .map_poke_track = prog_array_map_poke_track,
+       .map_poke_untrack = prog_array_map_poke_untrack,
+       .map_poke_run = prog_array_map_poke_run,
        .map_get_next_key = array_map_get_next_key,
        .map_lookup_elem = fd_array_map_lookup_elem,
        .map_delete_elem = fd_array_map_delete_elem,
        .map_fd_get_ptr = prog_fd_array_get_ptr,
        .map_fd_put_ptr = prog_fd_array_put_ptr,
        .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
-       .map_release_uref = bpf_fd_array_map_clear,
+       .map_release_uref = prog_array_map_clear,
        .map_seq_show_elem = prog_array_map_seq_show_elem,
 };
 
kernel/bpf/core.c
index 608b7085e0c9296836c271f3f77dd9c0601c4740..49e32acad7d80e310376c137453fffd97ac234c6 100644 (file)
@@ -2050,11 +2050,16 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
 
 static void bpf_free_used_maps(struct bpf_prog_aux *aux)
 {
+       struct bpf_map *map;
        int i;
 
        bpf_free_cgroup_storage(aux);
-       for (i = 0; i < aux->used_map_cnt; i++)
-               bpf_map_put(aux->used_maps[i]);
+       for (i = 0; i < aux->used_map_cnt; i++) {
+               map = aux->used_maps[i];
+               if (map->ops->map_poke_untrack)
+                       map->ops->map_poke_untrack(map, aux);
+               bpf_map_put(map);
+       }
        kfree(aux->used_maps);
 }
 
kernel/bpf/syscall.c
index b904d56ec6860828898894ab5bab47b5d21fac61..e3461ec5957060aef7db6660e6795674c25901c6 100644 (file)
 #include <linux/nospec.h>
 #include <uapi/linux/btf.h>
 
-#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
-                          (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
-                          (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
-                          (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
+                         (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
+                         (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
-#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
+                       IS_FD_HASH(map))
 
 #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
 
@@ -877,7 +878,7 @@ static int map_lookup_elem(union bpf_attr *attr)
                err = bpf_percpu_cgroup_storage_copy(map, key, value);
        } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
                err = bpf_stackmap_copy(map, key, value);
-       } else if (IS_FD_ARRAY(map)) {
+       } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
                err = bpf_fd_array_map_lookup_elem(map, key, value);
        } else if (IS_FD_HASH(map)) {
                err = bpf_fd_htab_map_lookup_elem(map, key, value);
@@ -1004,6 +1005,10 @@ static int map_update_elem(union bpf_attr *attr)
                   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
                err = map->ops->map_update_elem(map, key, value, attr->flags);
                goto out;
+       } else if (IS_FD_PROG_ARRAY(map)) {
+               err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+                                                  attr->flags);
+               goto out;
        }
 
        /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
@@ -1086,6 +1091,9 @@ static int map_delete_elem(union bpf_attr *attr)
        if (bpf_map_is_dev_bound(map)) {
                err = bpf_map_offload_delete_elem(map, key);
                goto out;
+       } else if (IS_FD_PROG_ARRAY(map)) {
+               err = map->ops->map_delete_elem(map, key);
+               goto out;
        }
 
        preempt_disable();