bpf: Add poke dependency tracking for prog array maps
author     Daniel Borkmann <daniel@iogearbox.net>
           Fri, 22 Nov 2019 20:07:58 +0000 (21:07 +0100)
committer  Alexei Starovoitov <ast@kernel.org>
           Mon, 25 Nov 2019 01:04:11 +0000 (17:04 -0800)
This work adds program tracking to prog array maps. This is needed so
that upon prog array updates/deletions we can fix up all programs that
make use of the tail call map. We add ops->map_poke_{un,}track()
helpers to maps for maintaining the list of programs, and
ops->map_poke_run() for triggering the actual update.
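
As a rough caller-side sketch (hypothetical usage; the program-side
wiring is done by a separate change and is not part of this patch),
track and untrack would pair up as follows:

  /* Hypothetical prog-side usage; assumes the verifier/JIT already
   * collected a poke descriptor referencing the tail call map.
   */
  struct bpf_map *map = prog->aux->poke_tab[i].tail_call.map;
  int ret;

  if (map->ops->map_poke_track) {
          ret = map->ops->map_poke_track(map, prog->aux);
          if (ret < 0)
                  return ret;
  }
  [...]
  if (map->ops->map_poke_untrack)
          map->ops->map_poke_untrack(map, prog->aux);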

bpf_array_aux is extended to contain the list head and a poke_mutex in
order to serialize program patching during updates/deletions.
bpf_free_used_maps() will untrack the program shortly before dropping
the reference to the map. For clearing out the prog array once all
urefs are dropped, we need to use schedule_work() to get a sleepable
context.
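
Condensed from the diff below (prog_array_map_clear() plus its
deferred worker), the pattern pairs a map reference with the scheduled
work so the map stays alive until the worker has run:

  /* Clearing needs a sleepable context since fd_array_map_delete_elem()
   * now takes poke_mutex, hence defer the actual clearing to a
   * workqueue; the worker calls bpf_fd_array_map_clear() and drops
   * the reference again via bpf_map_put().
   */
  static void prog_array_map_clear(struct bpf_map *map)
  {
          struct bpf_array_aux *aux = container_of(map, struct bpf_array,
                                                   map)->aux;
          bpf_map_inc(map);
          schedule_work(&aux->work);
  }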

prog_array_map_poke_run() is triggered during updates/deletions and
walks the maintained list of programs. For each program it checks in
the poke_tab whether map and key match and, if so, runs the actual
bpf_arch_text_poke() to patch in the nop or the new jmp location.
Depending on the type of update, one of
BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP} is used.
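
In short, the (old, new) pointer combination selects the poke type,
and the patch target is the program image address plus the poke
descriptor's adjusted offset:

  old == NULL && new != NULL  ->  BPF_MOD_NOP_TO_JUMP
  old != NULL && new == NULL  ->  BPF_MOD_JUMP_TO_NOP
  old != NULL && new != NULL  ->  BPF_MOD_JUMP_TO_JUMP

  /* Patch site update as issued from prog_array_map_poke_run(): */
  bpf_arch_text_poke(poke->ip, type,
                     old ? (u8 *)old->bpf_func + poke->adj_off : NULL,
                     new ? (u8 *)new->bpf_func + poke->adj_off : NULL);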

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/1fb364bb3c565b3e415d5ea348f036ff379e779d.1574452833.git.daniel@iogearbox.net
include/linux/bpf.h
kernel/bpf/arraymap.c
kernel/bpf/core.c
kernel/bpf/syscall.c

include/linux/bpf.h
index 312983bf7faaa184ec1fc8ff0675e8a0adc4e9ad..c2f07fd410c17b2ef8f1c269933831d8a2d638aa 100644 (file)
@@ -22,6 +22,7 @@ struct bpf_verifier_env;
 struct bpf_verifier_log;
 struct perf_event;
 struct bpf_prog;
+struct bpf_prog_aux;
 struct bpf_map;
 struct sock;
 struct seq_file;
@@ -64,6 +65,12 @@ struct bpf_map_ops {
                             const struct btf_type *key_type,
                             const struct btf_type *value_type);
 
+       /* Prog poke tracking helpers. */
+       int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
+       void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
+       void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
+                            struct bpf_prog *new);
+
        /* Direct value access helpers. */
        int (*map_direct_value_addr)(const struct bpf_map *map,
                                     u64 *imm, u32 off);
@@ -588,6 +595,11 @@ struct bpf_array_aux {
         */
        enum bpf_prog_type type;
        bool jited;
+       /* Programs with direct jumps into programs in this array. */
+       struct list_head poke_progs;
+       struct bpf_map *map;
+       struct mutex poke_mutex;
+       struct work_struct work;
 };
 
 struct bpf_array {
kernel/bpf/arraymap.c
index 57da950ee55b51d94208935f43cadfb7152e1750..58bdf5fd24cc8c3c9d39902203bbf778e131d3bd 100644 (file)
@@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
        if (IS_ERR(new_ptr))
                return PTR_ERR(new_ptr);
 
-       old_ptr = xchg(array->ptrs + index, new_ptr);
+       if (map->ops->map_poke_run) {
+               mutex_lock(&array->aux->poke_mutex);
+               old_ptr = xchg(array->ptrs + index, new_ptr);
+               map->ops->map_poke_run(map, index, old_ptr, new_ptr);
+               mutex_unlock(&array->aux->poke_mutex);
+       } else {
+               old_ptr = xchg(array->ptrs + index, new_ptr);
+       }
+
        if (old_ptr)
                map->ops->map_fd_put_ptr(old_ptr);
-
        return 0;
 }
 
@@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
        if (index >= array->map.max_entries)
                return -E2BIG;
 
-       old_ptr = xchg(array->ptrs + index, NULL);
+       if (map->ops->map_poke_run) {
+               mutex_lock(&array->aux->poke_mutex);
+               old_ptr = xchg(array->ptrs + index, NULL);
+               map->ops->map_poke_run(map, index, old_ptr, NULL);
+               mutex_unlock(&array->aux->poke_mutex);
+       } else {
+               old_ptr = xchg(array->ptrs + index, NULL);
+       }
+
        if (old_ptr) {
                map->ops->map_fd_put_ptr(old_ptr);
                return 0;
@@ -671,6 +686,152 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
        rcu_read_unlock();
 }
 
+struct prog_poke_elem {
+       struct list_head list;
+       struct bpf_prog_aux *aux;
+};
+
+static int prog_array_map_poke_track(struct bpf_map *map,
+                                    struct bpf_prog_aux *prog_aux)
+{
+       struct prog_poke_elem *elem;
+       struct bpf_array_aux *aux;
+       int ret = 0;
+
+       aux = container_of(map, struct bpf_array, map)->aux;
+       mutex_lock(&aux->poke_mutex);
+       list_for_each_entry(elem, &aux->poke_progs, list) {
+               if (elem->aux == prog_aux)
+                       goto out;
+       }
+
+       elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+       if (!elem) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       INIT_LIST_HEAD(&elem->list);
+       /* We must track the program's aux info at this point in time
+        * since the program pointer itself may not be stable yet; see
+        * also the comment in prog_array_map_poke_run().
+        */
+       elem->aux = prog_aux;
+
+       list_add_tail(&elem->list, &aux->poke_progs);
+out:
+       mutex_unlock(&aux->poke_mutex);
+       return ret;
+}
+
+static void prog_array_map_poke_untrack(struct bpf_map *map,
+                                       struct bpf_prog_aux *prog_aux)
+{
+       struct prog_poke_elem *elem, *tmp;
+       struct bpf_array_aux *aux;
+
+       aux = container_of(map, struct bpf_array, map)->aux;
+       mutex_lock(&aux->poke_mutex);
+       list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+               if (elem->aux == prog_aux) {
+                       list_del_init(&elem->list);
+                       kfree(elem);
+                       break;
+               }
+       }
+       mutex_unlock(&aux->poke_mutex);
+}
+
+static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
+                                   struct bpf_prog *old,
+                                   struct bpf_prog *new)
+{
+       enum bpf_text_poke_type type;
+       struct prog_poke_elem *elem;
+       struct bpf_array_aux *aux;
+
+       if (!old && new)
+               type = BPF_MOD_NOP_TO_JUMP;
+       else if (old && !new)
+               type = BPF_MOD_JUMP_TO_NOP;
+       else if (old && new)
+               type = BPF_MOD_JUMP_TO_JUMP;
+       else
+               return;
+
+       aux = container_of(map, struct bpf_array, map)->aux;
+       WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
+
+       list_for_each_entry(elem, &aux->poke_progs, list) {
+               struct bpf_jit_poke_descriptor *poke;
+               int i, ret;
+
+               for (i = 0; i < elem->aux->size_poke_tab; i++) {
+                       poke = &elem->aux->poke_tab[i];
+
+                       /* A few things to be aware of:
+                        *
+                        * 1) We can only ever access aux in this context, but
+                        *    not aux->prog since it might not be stable yet and
+                        *    there could be danger of use after free otherwise.
+                        * 2) Initially when we start tracking aux, the program
+                        *    is not JITed yet and also does not have a kallsyms
+                        *    entry. We skip these as poke->ip_stable is not
+                        *    active yet. The JIT will do the final fixup before
+                        *    setting it stable. The various poke->ip_stable are
+                        *    successively activated, so tail call updates can
+                        *    arrive from here while the JIT is still finishing
+                        *    its final fixup for non-activated poke entries.
+                        * 3) On program teardown, the program's kallsyms entry gets
+                        *    removed out of an RCU callback, but we can only untrack
+                        *    from a sleepable context, therefore bpf_arch_text_poke()
+                        *    might not see that this is in the BPF text section and
+                        *    bail out with -EINVAL. As these are unreachable since the
+                        *    RCU grace period has already passed, we simply skip them.
+                        * 4) Programs reaching a refcount of zero while patching
+                        *    is in progress are also okay since we're protected
+                        *    under poke_mutex and untrack the programs before the
+                        *    JIT buffer is freed. When we're still in the middle
+                        *    of patching and the program's kallsyms entry suddenly
+                        *    gets evicted, we just skip the rest, which is fine
+                        *    due to point 3).
+                        * 5) Any other error happening below from bpf_arch_text_poke()
+                        *    is an unexpected bug.
+                        */
+                       if (!READ_ONCE(poke->ip_stable))
+                               continue;
+                       if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+                               continue;
+                       if (poke->tail_call.map != map ||
+                           poke->tail_call.key != key)
+                               continue;
+
+                       ret = bpf_arch_text_poke(poke->ip, type,
+                                                old ? (u8 *)old->bpf_func +
+                                                poke->adj_off : NULL,
+                                                new ? (u8 *)new->bpf_func +
+                                                poke->adj_off : NULL);
+                       BUG_ON(ret < 0 && ret != -EINVAL);
+               }
+       }
+}
+
+static void prog_array_map_clear_deferred(struct work_struct *work)
+{
+       struct bpf_map *map = container_of(work, struct bpf_array_aux,
+                                          work)->map;
+       bpf_fd_array_map_clear(map);
+       bpf_map_put(map);
+}
+
+static void prog_array_map_clear(struct bpf_map *map)
+{
+       struct bpf_array_aux *aux = container_of(map, struct bpf_array,
+                                                map)->aux;
+       bpf_map_inc(map);
+       schedule_work(&aux->work);
+}
+
 static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 {
        struct bpf_array_aux *aux;
@@ -680,6 +841,10 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
        if (!aux)
                return ERR_PTR(-ENOMEM);
 
+       INIT_WORK(&aux->work, prog_array_map_clear_deferred);
+       INIT_LIST_HEAD(&aux->poke_progs);
+       mutex_init(&aux->poke_mutex);
+
        map = array_map_alloc(attr);
        if (IS_ERR(map)) {
                kfree(aux);
@@ -687,14 +852,21 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
        }
 
        container_of(map, struct bpf_array, map)->aux = aux;
+       aux->map = map;
+
        return map;
 }
 
 static void prog_array_map_free(struct bpf_map *map)
 {
+       struct prog_poke_elem *elem, *tmp;
        struct bpf_array_aux *aux;
 
        aux = container_of(map, struct bpf_array, map)->aux;
+       list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+               list_del_init(&elem->list);
+               kfree(elem);
+       }
        kfree(aux);
        fd_array_map_free(map);
 }
@@ -703,13 +875,16 @@ const struct bpf_map_ops prog_array_map_ops = {
        .map_alloc_check = fd_array_map_alloc_check,
        .map_alloc = prog_array_map_alloc,
        .map_free = prog_array_map_free,
+       .map_poke_track = prog_array_map_poke_track,
+       .map_poke_untrack = prog_array_map_poke_untrack,
+       .map_poke_run = prog_array_map_poke_run,
        .map_get_next_key = array_map_get_next_key,
        .map_lookup_elem = fd_array_map_lookup_elem,
        .map_delete_elem = fd_array_map_delete_elem,
        .map_fd_get_ptr = prog_fd_array_get_ptr,
        .map_fd_put_ptr = prog_fd_array_put_ptr,
        .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
-       .map_release_uref = bpf_fd_array_map_clear,
+       .map_release_uref = prog_array_map_clear,
        .map_seq_show_elem = prog_array_map_seq_show_elem,
 };
 
kernel/bpf/core.c
index 608b7085e0c9296836c271f3f77dd9c0601c4740..49e32acad7d80e310376c137453fffd97ac234c6 100644 (file)
@@ -2050,11 +2050,16 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
 
 static void bpf_free_used_maps(struct bpf_prog_aux *aux)
 {
+       struct bpf_map *map;
        int i;
 
        bpf_free_cgroup_storage(aux);
-       for (i = 0; i < aux->used_map_cnt; i++)
-               bpf_map_put(aux->used_maps[i]);
+       for (i = 0; i < aux->used_map_cnt; i++) {
+               map = aux->used_maps[i];
+               if (map->ops->map_poke_untrack)
+                       map->ops->map_poke_untrack(map, aux);
+               bpf_map_put(map);
+       }
        kfree(aux->used_maps);
 }
 
kernel/bpf/syscall.c
index b904d56ec6860828898894ab5bab47b5d21fac61..e3461ec5957060aef7db6660e6795674c25901c6 100644 (file)
 #include <linux/nospec.h>
 #include <uapi/linux/btf.h>
 
-#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
-                          (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
-                          (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
-                          (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
+                         (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
+                         (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
-#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
+                       IS_FD_HASH(map))
 
 #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
 
@@ -877,7 +878,7 @@ static int map_lookup_elem(union bpf_attr *attr)
                err = bpf_percpu_cgroup_storage_copy(map, key, value);
        } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
                err = bpf_stackmap_copy(map, key, value);
-       } else if (IS_FD_ARRAY(map)) {
+       } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
                err = bpf_fd_array_map_lookup_elem(map, key, value);
        } else if (IS_FD_HASH(map)) {
                err = bpf_fd_htab_map_lookup_elem(map, key, value);
@@ -1004,6 +1005,10 @@ static int map_update_elem(union bpf_attr *attr)
                   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
                err = map->ops->map_update_elem(map, key, value, attr->flags);
                goto out;
+       } else if (IS_FD_PROG_ARRAY(map)) {
+               err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+                                                  attr->flags);
+               goto out;
        }
 
        /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
@@ -1086,6 +1091,9 @@ static int map_delete_elem(union bpf_attr *attr)
        if (bpf_map_is_dev_bound(map)) {
                err = bpf_map_offload_delete_elem(map, key);
                goto out;
+       } else if (IS_FD_PROG_ARRAY(map)) {
+               err = map->ops->map_delete_elem(map, key);
+               goto out;
        }
 
        preempt_disable();