Merge tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees...
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 15f47fc11d13f62603563c30749b11b752fce205..952dc1c902295ffe65ce24541dac98ff4dccf8d9 100644 (file)
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,6 +143,38 @@ struct notification {
        struct list_head notifications;
 };
 
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * struct action_cache - per-filter cache of seccomp actions per
+ * arch/syscall pair
+ *
+ * @allow_native: A bitmap where each bit represents whether the
+ *               filter will always allow the syscall, for the
+ *               native architecture.
+ * @allow_compat: A bitmap where each bit represents whether the
+ *               filter will always allow the syscall, for the
+ *               compat architecture.
+ */
+struct action_cache {
+       DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
+#ifdef SECCOMP_ARCH_COMPAT
+       DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
+#endif
+};
+#else
+struct action_cache { };
+
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+                                            const struct seccomp_data *sd)
+{
+       return false;
+}
+
+static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
 /**
  * struct seccomp_filter - container for seccomp BPF programs
  *
@@ -159,6 +191,7 @@ struct notification {
  *        this filter after reaching 0. The @users count is always smaller
  *        or equal to @refs. Hence, reaching 0 for @users does not mean
  *        the filter can be freed.
+ * @cache: cache of arch/syscall mappings to actions
  * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
  * @prev: points to a previously installed, or inherited, filter
  * @prog: the BPF program to evaluate
@@ -180,6 +213,7 @@ struct seccomp_filter {
        refcount_t refs;
        refcount_t users;
        bool log;
+       struct action_cache cache;
        struct seccomp_filter *prev;
        struct bpf_prog *prog;
        struct notification *notif;
@@ -298,6 +332,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
        return 0;
 }
 
+#ifdef SECCOMP_ARCH_NATIVE
+static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
+                                                   size_t bitmap_size,
+                                                   int syscall_nr)
+{
+       if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
+               return false;
+       syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
+
+       return test_bit(syscall_nr, bitmap);
+}
+
+/**
+ * seccomp_cache_check_allow - lookup seccomp cache
+ * @sfilter: The seccomp filter
+ * @sd: The seccomp data to lookup the cache with
+ *
+ * Returns true if the seccomp_data is cached and allowed.
+ */
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+                                            const struct seccomp_data *sd)
+{
+       int syscall_nr = sd->nr;
+       const struct action_cache *cache = &sfilter->cache;
+
+#ifndef SECCOMP_ARCH_COMPAT
+       /* A native-only architecture doesn't need to check sd->arch. */
+       return seccomp_cache_check_allow_bitmap(cache->allow_native,
+                                               SECCOMP_ARCH_NATIVE_NR,
+                                               syscall_nr);
+#else
+       if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
+               return seccomp_cache_check_allow_bitmap(cache->allow_native,
+                                                       SECCOMP_ARCH_NATIVE_NR,
+                                                       syscall_nr);
+       if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
+               return seccomp_cache_check_allow_bitmap(cache->allow_compat,
+                                                       SECCOMP_ARCH_COMPAT_NR,
+                                                       syscall_nr);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+       WARN_ON_ONCE(true);
+       return false;
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
 /**
  * seccomp_run_filters - evaluates all seccomp filters against @sd
  * @sd: optional seccomp data to be passed to filters
@@ -320,6 +400,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
        if (WARN_ON(f == NULL))
                return SECCOMP_RET_KILL_PROCESS;
 
+       if (seccomp_cache_check_allow(f, sd))
+               return SECCOMP_RET_ALLOW;
+
        /*
         * All filters in the list are evaluated and the lowest BPF return
         * value always takes priority (ignoring the DATA).
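The two added lines above are the entire fast path: when the bit for this arch/syscall pair is set, the filter chain is never walked. For readers unfamiliar with the kernel bitmap helpers, below is a minimal userspace sketch of the bounds-checked lookup that seccomp_cache_check_allow_bitmap() performs. It is illustrative only, not kernel code; the array_index_nospec() speculation clamp is left out since it has no plain-C equivalent, and NATIVE_NR is an arbitrary stand-in for SECCOMP_ARCH_NATIVE_NR.

/* Userspace sketch only; the kernel uses DECLARE_BITMAP()/test_bit(). */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)
#define NATIVE_NR	512	/* arbitrary stand-in for SECCOMP_ARCH_NATIVE_NR */

static unsigned long allow_native[(NATIVE_NR + BITS_PER_LONG - 1) / BITS_PER_LONG];

static bool cache_check_allow(int nr)
{
	if (nr < 0 || nr >= NATIVE_NR)
		return false;	/* out of range: never claim a cached allow */
	return allow_native[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG));
}

int main(void)
{
	/* Pretend the prepare step proved syscall 1 is always allowed. */
	allow_native[1 / BITS_PER_LONG] |= 1UL << (1 % BITS_PER_LONG);

	printf("nr 1: %s\n", cache_check_allow(1) ? "ALLOW (skip filters)" : "run filters");
	printf("nr 2: %s\n", cache_check_allow(2) ? "ALLOW (skip filters)" : "run filters");
	return 0;
}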
@@ -470,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk)
 {
        struct seccomp_filter *orig = tsk->seccomp.filter;
 
+       /* We are effectively holding the siglock by not having any sighand. */
+       WARN_ON(tsk->sighand != NULL);
+
        /* Detach task from its filter tree. */
        tsk->seccomp.filter = NULL;
        __seccomp_filter_release(orig);
@@ -544,7 +630,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 {
        struct seccomp_filter *sfilter;
        int ret;
-       const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+       const bool save_orig =
+#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
+               true;
+#else
+               false;
+#endif
 
        if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
                return ERR_PTR(-EINVAL);
@@ -609,6 +700,148 @@ out:
        return filter;
 }
 
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * seccomp_is_const_allow - check if filter is constant allow with given data
+ * @fprog: The BPF program
+ * @sd: The seccomp data to check against; only the syscall number and arch
+ *      number are considered constant.
+ */
+static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
+                                  struct seccomp_data *sd)
+{
+       unsigned int reg_value = 0;
+       unsigned int pc;
+       bool op_res;
+
+       if (WARN_ON_ONCE(!fprog))
+               return false;
+
+       for (pc = 0; pc < fprog->len; pc++) {
+               struct sock_filter *insn = &fprog->filter[pc];
+               u16 code = insn->code;
+               u32 k = insn->k;
+
+               switch (code) {
+               case BPF_LD | BPF_W | BPF_ABS:
+                       switch (k) {
+                       case offsetof(struct seccomp_data, nr):
+                               reg_value = sd->nr;
+                               break;
+                       case offsetof(struct seccomp_data, arch):
+                               reg_value = sd->arch;
+                               break;
+                       default:
+                               /* can't optimize (non-constant value load) */
+                               return false;
+                       }
+                       break;
+               case BPF_RET | BPF_K:
+                       /* reached return with constant values only, check allow */
+                       return k == SECCOMP_RET_ALLOW;
+               case BPF_JMP | BPF_JA:
+                       pc += insn->k;
+                       break;
+               case BPF_JMP | BPF_JEQ | BPF_K:
+               case BPF_JMP | BPF_JGE | BPF_K:
+               case BPF_JMP | BPF_JGT | BPF_K:
+               case BPF_JMP | BPF_JSET | BPF_K:
+                       switch (BPF_OP(code)) {
+                       case BPF_JEQ:
+                               op_res = reg_value == k;
+                               break;
+                       case BPF_JGE:
+                               op_res = reg_value >= k;
+                               break;
+                       case BPF_JGT:
+                               op_res = reg_value > k;
+                               break;
+                       case BPF_JSET:
+                               op_res = !!(reg_value & k);
+                               break;
+                       default:
+                               /* can't optimize (unknown jump) */
+                               return false;
+                       }
+
+                       pc += op_res ? insn->jt : insn->jf;
+                       break;
+               case BPF_ALU | BPF_AND | BPF_K:
+                       reg_value &= k;
+                       break;
+               default:
+                       /* can't optimize (unknown insn) */
+                       return false;
+               }
+       }
+
+       /* ran off the end of the filter?! */
+       WARN_ON(1);
+       return false;
+}
+
+static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
+                                        void *bitmap, const void *bitmap_prev,
+                                        size_t bitmap_size, int arch)
+{
+       struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+       struct seccomp_data sd;
+       int nr;
+
+       if (bitmap_prev) {
+               /* The new filter must be as restrictive as the last. */
+               bitmap_copy(bitmap, bitmap_prev, bitmap_size);
+       } else {
+               /* Before any filters, all syscalls are always allowed. */
+               bitmap_fill(bitmap, bitmap_size);
+       }
+
+       for (nr = 0; nr < bitmap_size; nr++) {
+               /* No bitmap change: not a cacheable action. */
+               if (!test_bit(nr, bitmap))
+                       continue;
+
+               sd.nr = nr;
+               sd.arch = arch;
+
+               /* No bitmap change: continue to always allow. */
+               if (seccomp_is_const_allow(fprog, &sd))
+                       continue;
+
+               /*
+                * Not a cacheable action: always run filters.
+                * atomic clear_bit() not needed, filter not visible yet.
+                */
+               __clear_bit(nr, bitmap);
+       }
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cacheable syscalls
+ * @sfilter: The seccomp filter
+ */
+static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+       struct action_cache *cache = &sfilter->cache;
+       const struct action_cache *cache_prev =
+               sfilter->prev ? &sfilter->prev->cache : NULL;
+
+       seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
+                                    cache_prev ? cache_prev->allow_native : NULL,
+                                    SECCOMP_ARCH_NATIVE_NR,
+                                    SECCOMP_ARCH_NATIVE);
+
+#ifdef SECCOMP_ARCH_COMPAT
+       seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
+                                    cache_prev ? cache_prev->allow_compat : NULL,
+                                    SECCOMP_ARCH_COMPAT_NR,
+                                    SECCOMP_ARCH_COMPAT);
+#endif /* SECCOMP_ARCH_COMPAT */
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
 /**
  * seccomp_attach_filter: validate and attach filter
  * @flags:  flags to change filter behavior
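As an aside, here is an illustrative userspace sketch (not part of the patch) of the kind of classic-BPF allowlist that seccomp_is_const_allow() can fold: every load reads only seccomp_data->arch or ->nr and every jump compares against a constant, so each return is decided by the (arch, nr) pair alone and the SECCOMP_RET_ALLOW outcomes become cacheable. The x86-64 arch constant and the two allowed syscalls are arbitrary choices for the example.

/* Illustrative userspace sketch, not from this patch. */
#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	static const char msg[] = "still alive under the filter\n";
	struct sock_filter insns[] = {
		/* Kill outright if this is not the expected native arch. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, arch)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
		/* Allow a fixed set of syscalls by number; everything else gets EPERM. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 2, 0),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit_group, 1, 0),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | 1 /* EPERM */),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
	    prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
		perror("seccomp");
		return 1;
	}

	/* write(2) and exit_group(2) are constant-allow; anything else fails. */
	write(1, msg, sizeof(msg) - 1);
	return 0;
}

Filters that inspect seccomp_data->args hit the "non-constant value load" bail-out in the emulator and are simply never cached, which keeps the optimization transparent to argument-filtering sandboxes.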
@@ -658,6 +891,7 @@ static long seccomp_attach_filter(unsigned int flags,
         * task reference.
         */
        filter->prev = current->seccomp.filter;
+       seccomp_cache_prepare(filter);
        current->seccomp.filter = filter;
        atomic_inc(&current->seccomp.filter_count);
 
@@ -1967,7 +2201,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
        return true;
 }
 
-static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
                               size_t *lenp, loff_t *ppos)
 {
        char names[sizeof(seccomp_actions_avail)];
@@ -1985,7 +2219,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
        return proc_dostring(&table, 0, buffer, lenp, ppos);
 }
 
-static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
                                size_t *lenp, loff_t *ppos, u32 *actions_logged)
 {
        char names[sizeof(seccomp_actions_avail)];
@@ -2103,3 +2337,59 @@ static int __init seccomp_sysctl_init(void)
 device_initcall(seccomp_sysctl_init)
 
 #endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
+static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
+                                       const void *bitmap, size_t bitmap_size)
+{
+       int nr;
+
+       for (nr = 0; nr < bitmap_size; nr++) {
+               bool cached = test_bit(nr, bitmap);
+               char *status = cached ? "ALLOW" : "FILTER";
+
+               seq_printf(m, "%s %d %s\n", name, nr, status);
+       }
+}
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+                          struct pid *pid, struct task_struct *task)
+{
+       struct seccomp_filter *f;
+       unsigned long flags;
+
+       /*
+        * We don't want some sandboxed process to know what their seccomp
+        * filters consist of.
+        */
+       if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+               return -EACCES;
+
+       if (!lock_task_sighand(task, &flags))
+               return -ESRCH;
+
+       f = READ_ONCE(task->seccomp.filter);
+       if (!f) {
+               unlock_task_sighand(task, &flags);
+               return 0;
+       }
+
+       /* prevent filter from being freed while we are printing it */
+       __get_seccomp_filter(f);
+       unlock_task_sighand(task, &flags);
+
+       proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
+                                   f->cache.allow_native,
+                                   SECCOMP_ARCH_NATIVE_NR);
+
+#ifdef SECCOMP_ARCH_COMPAT
+       proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
+                                   f->cache.allow_compat,
+                                   SECCOMP_ARCH_COMPAT_NR);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+       __put_seccomp_filter(f);
+       return 0;
+}
+#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
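
Finally, an illustrative reader for the debug interface (not part of the patch). The proc plumbing for proc_pid_seccomp_cache() lives outside this file; the entry is assumed here to be /proc/<pid>/seccomp_cache, gated by CONFIG_SECCOMP_CACHE_DEBUG and the CAP_SYS_ADMIN check above. Each output line is "<arch-name> <syscall-nr> <ALLOW|FILTER>", matching the seq_printf() format in proc_pid_seccomp_cache_arch().

/* Illustrative reader; assumes the /proc/<pid>/seccomp_cache entry exists. */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[64], line[128];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/seccomp_cache",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))	/* e.g. "x86_64 0 ALLOW" */
		fputs(line, stdout);
	fclose(f);
	return 0;
}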