Merge tag 'pidfd-updates-v5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 11 Jul 2019 05:17:21 +0000 (22:17 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 11 Jul 2019 05:17:21 +0000 (22:17 -0700)
Pull pidfd updates from Christian Brauner:
 "This adds two main features.

   - First, it adds polling support for pidfds. This allows process
     managers to know when a (non-parent) process dies in a race-free
     way.

     The notification mechanism used follows the same logic that is
     currently used when the parent of a task is notified of a child's
     death. With this patchset it is possible to put pidfds in an
     {e}poll loop and get reliable notifications for process (i.e.
     thread-group) exit.

   - The second feature complements the first one by making it possible
     to retrieve pollable pidfds for processes that were not created
     using CLONE_PIDFD.

     A lot of processes get created with traditional PID-based calls
     such as fork() or clone() (without CLONE_PIDFD). For these
     processes a caller can currently not create a pollable pidfd. This
     is a problem for Android's low memory killer (LMK) and service
     managers such as systemd.

  Both patchsets are accompanied by selftests.

  It's perhaps worth noting that the work done so far and the work done
  in this branch for pidfd_open() and polling support do already see
  some adoption:

   - Android is in the process of backporting this work to all their LTS
     kernels [1]

   - Service managers make use of pidfd_send_signal but will need to
     wait until we enable waiting on pidfds for full adoption.

   - And projects I maintain make use of both pidfd_send_signal and
     CLONE_PIDFD [2] and will use polling support and pidfd_open() too"

[1] https://android-review.googlesource.com/q/topic:%22pidfd+polling+support+4.9+backport%22
    https://android-review.googlesource.com/q/topic:%22pidfd+polling+support+4.14+backport%22
    https://android-review.googlesource.com/q/topic:%22pidfd+polling+support+4.19+backport%22

[2] https://github.com/lxc/lxc/blob/aab6e3eb73c343231cdde775db938994fc6f2803/src/lxc/start.c#L1753

* tag 'pidfd-updates-v5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  tests: add pidfd_open() tests
  arch: wire-up pidfd_open()
  pid: add pidfd_open()
  pidfd: add polling selftests
  pidfd: add polling support

29 files changed:
arch/alpha/kernel/syscalls/syscall.tbl
arch/arm/tools/syscall.tbl
arch/arm64/include/asm/unistd.h
arch/arm64/include/asm/unistd32.h
arch/ia64/kernel/syscalls/syscall.tbl
arch/m68k/kernel/syscalls/syscall.tbl
arch/microblaze/kernel/syscalls/syscall.tbl
arch/mips/kernel/syscalls/syscall_n32.tbl
arch/mips/kernel/syscalls/syscall_n64.tbl
arch/mips/kernel/syscalls/syscall_o32.tbl
arch/parisc/kernel/syscalls/syscall.tbl
arch/powerpc/kernel/syscalls/syscall.tbl
arch/s390/kernel/syscalls/syscall.tbl
arch/sh/kernel/syscalls/syscall.tbl
arch/sparc/kernel/syscalls/syscall.tbl
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/entry/syscalls/syscall_64.tbl
arch/xtensa/kernel/syscalls/syscall.tbl
include/linux/pid.h
include/linux/syscalls.h
include/uapi/asm-generic/unistd.h
kernel/fork.c
kernel/pid.c
kernel/signal.c
tools/testing/selftests/pidfd/.gitignore
tools/testing/selftests/pidfd/Makefile
tools/testing/selftests/pidfd/pidfd.h [new file with mode: 0644]
tools/testing/selftests/pidfd/pidfd_open_test.c [new file with mode: 0644]
tools/testing/selftests/pidfd/pidfd_test.c

index 9e7704e44f6ddd1ddcbe9e21afcd5ea5cfe9a097..1db9bbcfb84e1a3077ad7a7077dff671e48b46a2 100644 (file)
 541    common  fsconfig                        sys_fsconfig
 542    common  fsmount                         sys_fsmount
 543    common  fspick                          sys_fspick
+544    common  pidfd_open                      sys_pidfd_open
index aaf479a9e92d17ab08294d7e96442d77dd37fcdd..81e6e1817c45070f600f7f5aa10636938a002f28 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 2a23614198f135bc1c4b77766dc50b05cae9af74..ede7b88d4f15ac08c62be79f9e381dd760cc35e8 100644 (file)
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls                (__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END            (__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls           434
+#define __NR_compat_syscalls           435
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
index aa995920bd34926f97eb52e04a097eaf282697c5..52415923e08f87f491e3af1e6a4ce55238844c0e 100644 (file)
@@ -875,6 +875,8 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_pidfd_open 434
+__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
 
 /*
  * Please add new compat syscalls above this comment and update
index e01df3f2f80d3abfa74a207f435396f13caddeaf..ecc44926737b7189c97156e427b380a71ebcb634 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 7e3d0734b2f377f7cc375d0e21668e4c9198f3db..9a3eb2558568b9ab4560dc77b233652b3cd6ee69 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 26339e417695fb7e99560fe507a3cfb9a6c082e4..ad706f83c7553ca21261b423e84b865a76435780 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 0e2dd68ade5784004abeebb552bfcf18dfed44f6..97035e19ad035b54991d4f895218dad31d3fd764 100644 (file)
 431    n32     fsconfig                        sys_fsconfig
 432    n32     fsmount                         sys_fsmount
 433    n32     fspick                          sys_fspick
+434    n32     pidfd_open                      sys_pidfd_open
index 5eebfa0d155c598354619f326951b15edad4cd54..d7292722d3b091a79ceee0910bb1f02a054501eb 100644 (file)
 431    n64     fsconfig                        sys_fsconfig
 432    n64     fsmount                         sys_fsmount
 433    n64     fspick                          sys_fspick
+434    n64     pidfd_open                      sys_pidfd_open
index 3cc1374e02d079a672be86affdd066cc54c0e326..dba084c92f1491279b21064822c618d3f459b9d3 100644 (file)
 431    o32     fsconfig                        sys_fsconfig
 432    o32     fsmount                         sys_fsmount
 433    o32     fspick                          sys_fspick
+434    o32     pidfd_open                      sys_pidfd_open
index c9e377d59232fd3b8b882c086440131042073e1f..5022b9e179c2067c635eff7825761c10d4028bb2 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 103655d84b4b556891029bfe01d9df8beb89c443..f2c3bda2d39f11466b74766ea508621271a9085c 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index e822b2964a833b07e89fafe241920a53674bf948..6ebacfeaf853b326f99aa27c2a09d965d637f486 100644 (file)
 431  common    fsconfig                sys_fsconfig                    sys_fsconfig
 432  common    fsmount                 sys_fsmount                     sys_fsmount
 433  common    fspick                  sys_fspick                      sys_fspick
+434  common    pidfd_open              sys_pidfd_open                  sys_pidfd_open
index 016a727d435784d8386a9a8b0d007f17cafe4322..834c9c7d79faebfde9913664d9fb1ce108f1929c 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index e047480b160557c63ed976e63bb66152b9f50632..c58e71f211298c960577c759a0746c912475163e 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index ad968b7bac72ba4b516dcaaaf49ed9f6c3a69f0a..43e4429a5272c531deaf98b697e7f3b75da59156 100644 (file)
 431    i386    fsconfig                sys_fsconfig                    __ia32_sys_fsconfig
 432    i386    fsmount                 sys_fsmount                     __ia32_sys_fsmount
 433    i386    fspick                  sys_fspick                      __ia32_sys_fspick
+434    i386    pidfd_open              sys_pidfd_open                  __ia32_sys_pidfd_open
index b4e6f9e6204aa874f03337adc47f9bba0297f707..1bee0a77fdd37d52d44c32ee305d4d0a3acf2c38 100644 (file)
 431    common  fsconfig                __x64_sys_fsconfig
 432    common  fsmount                 __x64_sys_fsmount
 433    common  fspick                  __x64_sys_fspick
+434    common  pidfd_open              __x64_sys_pidfd_open
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
index 5fa0ee1c8e00f4dc3f64d79e913a33e75767c50c..782b81945cccdf5535e5b446dbf129c62836e530 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 3c8ef5a199ca8d59c777e7e0e40a5c61ddfee94a..1484db6ca8d17a960b8dc18e0ae3bb732ea0d216 100644 (file)
@@ -3,6 +3,7 @@
 #define _LINUX_PID_H
 
 #include <linux/rculist.h>
+#include <linux/wait.h>
 
 enum pid_type
 {
@@ -60,6 +61,8 @@ struct pid
        unsigned int level;
        /* lists of tasks that use this pid */
        struct hlist_head tasks[PIDTYPE_MAX];
+       /* wait queue for pidfd notifications */
+       wait_queue_head_t wait_pidfd;
        struct rcu_head rcu;
        struct upid numbers[1];
 };
index bc4bbbb9ed9a257cdd90c04df7e6ab0340eadcb2..699aed6674a021d71144b63f1e1af7aaf4750808 100644 (file)
@@ -927,6 +927,7 @@ asmlinkage long sys_clock_adjtime32(clockid_t which_clock,
                                struct old_timex32 __user *tx);
 asmlinkage long sys_syncfs(int fd);
 asmlinkage long sys_setns(int fd, int nstype);
+asmlinkage long sys_pidfd_open(pid_t pid, unsigned int flags);
 asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
                             unsigned int vlen, unsigned flags);
 asmlinkage long sys_process_vm_readv(pid_t pid,
index a87904daf1034449980afb86b952a538f84b9da9..e5684a4512c0a4901b7bea21bfd2ef53072acfda 100644 (file)
@@ -844,9 +844,11 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_pidfd_open 434
+__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
 
 #undef __NR_syscalls
-#define __NR_syscalls 434
+#define __NR_syscalls 435
 
 /*
  * 32 bit systems traditionally used different
index 847dd147b06899ad7efb51be5df850b6db631856..187c02ce534c87a955cf93d1d01f9fba57c723e2 100644 (file)
@@ -1711,8 +1711,34 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 }
 #endif
 
+/*
+ * Poll support for process exit notification.
+ */
+static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+       struct task_struct *task;
+       struct pid *pid = file->private_data;
+       int poll_flags = 0;
+
+       poll_wait(file, &pid->wait_pidfd, pts);
+
+       rcu_read_lock();
+       task = pid_task(pid, PIDTYPE_PID);
+       /*
+        * Inform pollers only when the whole thread group exits.
+        * If the thread group leader exits before all other threads in the
+        * group, then poll(2) should block, similar to the wait(2) family.
+        */
+       if (!task || (task->exit_state && thread_group_empty(task)))
+               poll_flags = POLLIN | POLLRDNORM;
+       rcu_read_unlock();
+
+       return poll_flags;
+}
+
 const struct file_operations pidfd_fops = {
        .release = pidfd_release,
+       .poll = pidfd_poll,
 #ifdef CONFIG_PROC_FS
        .show_fdinfo = pidfd_show_fdinfo,
 #endif
index e5cad0c7d5ddea08c054419b1f1525bada065bbf..16263b5265604dd7931d4431920d51b633429050 100644 (file)
@@ -38,6 +38,8 @@
 #include <linux/syscalls.h>
 #include <linux/proc_ns.h>
 #include <linux/proc_fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 
@@ -214,6 +216,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);
 
+       init_waitqueue_head(&pid->wait_pidfd);
+
        upid = pid->numbers + ns->level;
        spin_lock_irq(&pidmap_lock);
        if (!(ns->pid_allocated & PIDNS_ADDING))
@@ -451,6 +455,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
        return idr_get_next(&ns->idr, &nr);
 }
 
+/**
+ * pidfd_create() - Create a new pid file descriptor.
+ *
+ * @pid:  struct pid that the pidfd will reference
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set.
+ *
+ * Note, that this function can only be called after the fd table has
+ * been unshared to avoid leaking the pidfd to the new process.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ *         On error, a negative errno number will be returned.
+ */
+static int pidfd_create(struct pid *pid)
+{
+       int fd;
+
+       fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
+                             O_RDWR | O_CLOEXEC);
+       if (fd < 0)
+               put_pid(pid);
+
+       return fd;
+}
+
+/**
+ * pidfd_open() - Open new pid file descriptor.
+ *
+ * @pid:   pid for which to retrieve a pidfd
+ * @flags: flags to pass
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set for
+ * the process identified by @pid. Currently, the process identified by
+ * @pid must be a thread-group leader. This restriction currently exists
+ * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
+ * be used with CLONE_THREAD) and pidfd polling (only supports thread group
+ * leaders).
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ *         On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
+{
+       int fd, ret;
+       struct pid *p;
+
+       if (flags)
+               return -EINVAL;
+
+       if (pid <= 0)
+               return -EINVAL;
+
+       p = find_get_pid(pid);
+       if (!p)
+               return -ESRCH;
+
+       ret = 0;
+       rcu_read_lock();
+       if (!pid_task(p, PIDTYPE_TGID))
+               ret = -EINVAL;
+       rcu_read_unlock();
+
+       fd = ret ?: pidfd_create(p);
+       put_pid(p);
+       return fd;
+}
+
 void __init pid_idr_init(void)
 {
        /* Verify no one has done anything silly: */
index 91cb8ca41954ad4559e95c627f3189dba651ac53..dabe100d209191cb6b15a6f80910e92b28553f11 100644 (file)
@@ -1881,6 +1881,14 @@ ret:
        return ret;
 }
 
+static void do_notify_pidfd(struct task_struct *task)
+{
+       struct pid *pid;
+
+       pid = task_pid(task);
+       wake_up_all(&pid->wait_pidfd);
+}
+
 /*
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1904,6 +1912,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
        BUG_ON(!tsk->ptrace &&
               (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 
+       /* Wake up all pidfd waiters */
+       do_notify_pidfd(tsk);
+
        if (sig != SIGCHLD) {
                /*
                 * This is only possible if parent == real_parent.
index 822a1e63d045edee9e58ada40c213c14ce123add..16d84d117bc04c3f2ebf962c7cb684d84f691e6f 100644 (file)
@@ -1 +1,2 @@
+pidfd_open_test
 pidfd_test
index 443fedbd6231597d1b1c8253ea06a79544997fc0..720b2d884b3c52560d86078c704b1696a89c5a9d 100644 (file)
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -g -I../../../../usr/include/
+CFLAGS += -g -I../../../../usr/include/ -lpthread
 
-TEST_GEN_PROGS := pidfd_test
+TEST_GEN_PROGS := pidfd_test pidfd_open_test
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
new file mode 100644 (file)
index 0000000..8452e91
--- /dev/null
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __PIDFD_H
+#define __PIDFD_H
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/mount.h>
+
+#include "../kselftest.h"
+
+/*
+ * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
+ * That means, when it wraps around any pid < 300 will be skipped.
+ * So we need to use a pid > 300 in order to test recycling.
+ */
+#define PID_RECYCLE 1000
+
+/*
+ * Define a few custom error codes for the child process to clearly indicate
+ * what is happening. This way we can tell the difference between a system
+ * error, a test error, etc.
+ */
+#define PIDFD_PASS 0
+#define PIDFD_FAIL 1
+#define PIDFD_ERROR 2
+#define PIDFD_SKIP 3
+#define PIDFD_XFAIL 4
+
+int wait_for_pid(pid_t pid)
+{
+       int status, ret;
+
+again:
+       ret = waitpid(pid, &status, 0);
+       if (ret == -1) {
+               if (errno == EINTR)
+                       goto again;
+
+               return -1;
+       }
+
+       if (!WIFEXITED(status))
+               return -1;
+
+       return WEXITSTATUS(status);
+}
+
+
+#endif /* __PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c
new file mode 100644 (file)
index 0000000..0377133
--- /dev/null
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
+{
+       return syscall(__NR_pidfd_open, pid, flags);
+}
+
+static int safe_int(const char *numstr, int *converted)
+{
+       char *err = NULL;
+       long sli;
+
+       errno = 0;
+       sli = strtol(numstr, &err, 0);
+       if (errno == ERANGE && (sli == LONG_MAX || sli == LONG_MIN))
+               return -ERANGE;
+
+       if (errno != 0 && sli == 0)
+               return -EINVAL;
+
+       if (err == numstr || *err != '\0')
+               return -EINVAL;
+
+       if (sli > INT_MAX || sli < INT_MIN)
+               return -ERANGE;
+
+       *converted = (int)sli;
+       return 0;
+}
+
+static int char_left_gc(const char *buffer, size_t len)
+{
+       size_t i;
+
+       for (i = 0; i < len; i++) {
+               if (buffer[i] == ' ' ||
+                   buffer[i] == '\t')
+                       continue;
+
+               return i;
+       }
+
+       return 0;
+}
+
+static int char_right_gc(const char *buffer, size_t len)
+{
+       int i;
+
+       for (i = len - 1; i >= 0; i--) {
+               if (buffer[i] == ' '  ||
+                   buffer[i] == '\t' ||
+                   buffer[i] == '\n' ||
+                   buffer[i] == '\0')
+                       continue;
+
+               return i + 1;
+       }
+
+       return 0;
+}
+
+static char *trim_whitespace_in_place(char *buffer)
+{
+       buffer += char_left_gc(buffer, strlen(buffer));
+       buffer[char_right_gc(buffer, strlen(buffer))] = '\0';
+       return buffer;
+}
+
+static pid_t get_pid_from_fdinfo_file(int pidfd, const char *key, size_t keylen)
+{
+       int ret;
+       char path[512];
+       FILE *f;
+       size_t n = 0;
+       pid_t result = -1;
+       char *line = NULL;
+
+       snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd);
+
+       f = fopen(path, "re");
+       if (!f)
+               return -1;
+
+       while (getline(&line, &n, f) != -1) {
+               char *numstr;
+
+               if (strncmp(line, key, keylen))
+                       continue;
+
+               numstr = trim_whitespace_in_place(line + 4);
+               ret = safe_int(numstr, &result);
+               if (ret < 0)
+                       goto out;
+
+               break;
+       }
+
+out:
+       free(line);
+       fclose(f);
+       return result;
+}
+
+int main(int argc, char **argv)
+{
+       int pidfd = -1, ret = 1;
+       pid_t pid;
+
+       ksft_set_plan(3);
+
+       pidfd = sys_pidfd_open(-1, 0);
+       if (pidfd >= 0) {
+               ksft_print_msg(
+                       "%s - succeeded to open pidfd for invalid pid -1\n",
+                       strerror(errno));
+               goto on_error;
+       }
+       ksft_test_result_pass("do not allow invalid pid test: passed\n");
+
+       pidfd = sys_pidfd_open(getpid(), 1);
+       if (pidfd >= 0) {
+               ksft_print_msg(
+                       "%s - succeeded to open pidfd with invalid flag value specified\n",
+                       strerror(errno));
+               goto on_error;
+       }
+       ksft_test_result_pass("do not allow invalid flag test: passed\n");
+
+       pidfd = sys_pidfd_open(getpid(), 0);
+       if (pidfd < 0) {
+               ksft_print_msg("%s - failed to open pidfd\n", strerror(errno));
+               goto on_error;
+       }
+       ksft_test_result_pass("open a new pidfd test: passed\n");
+
+       pid = get_pid_from_fdinfo_file(pidfd, "Pid:", sizeof("Pid:") - 1);
+       ksft_print_msg("pidfd %d refers to process with pid %d\n", pidfd, pid);
+
+       ret = 0;
+
+on_error:
+       if (pidfd >= 0)
+               close(pidfd);
+
+       return !ret ? ksft_exit_pass() : ksft_exit_fail();
+}
index 104c75a33882c9538af0e0c61ecd92a81f4fdba6..7eaa8a3de26277212aa8350bb90e9098c9118741 100644 (file)
@@ -4,22 +4,49 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <linux/types.h>
+#include <pthread.h>
 #include <sched.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <syscall.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/wait.h>
+#include <time.h>
 #include <unistd.h>
 
+#include "pidfd.h"
 #include "../kselftest.h"
 
 #ifndef __NR_pidfd_send_signal
 #define __NR_pidfd_send_signal -1
 #endif
 
+#define str(s) _str(s)
+#define _str(s) #s
+#define CHILD_THREAD_MIN_WAIT 3 /* seconds */
+
+#define MAX_EVENTS 5
+
+#ifndef CLONE_PIDFD
+#define CLONE_PIDFD 0x00001000
+#endif
+
+static pid_t pidfd_clone(int flags, int *pidfd, int (*fn)(void *))
+{
+       size_t stack_size = 1024;
+       char *stack[1024] = { 0 };
+
+#ifdef __ia64__
+       return __clone2(fn, stack, stack_size, flags | SIGCHLD, NULL, pidfd);
+#else
+       return clone(fn, stack + stack_size, flags | SIGCHLD, NULL, pidfd);
+#endif
+}
+
 static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
                                        unsigned int flags)
 {
@@ -66,28 +93,6 @@ static int test_pidfd_send_signal_simple_success(void)
        return 0;
 }
 
-static int wait_for_pid(pid_t pid)
-{
-       int status, ret;
-
-again:
-       ret = waitpid(pid, &status, 0);
-       if (ret == -1) {
-               if (errno == EINTR)
-                       goto again;
-
-               return -1;
-       }
-
-       if (ret != pid)
-               goto again;
-
-       if (!WIFEXITED(status))
-               return -1;
-
-       return WEXITSTATUS(status);
-}
-
 static int test_pidfd_send_signal_exited_fail(void)
 {
        int pidfd, ret, saved_errno;
@@ -132,13 +137,6 @@ static int test_pidfd_send_signal_exited_fail(void)
        return 0;
 }
 
-/*
- * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
- * That means, when it wraps around any pid < 300 will be skipped.
- * So we need to use a pid > 300 in order to test recycling.
- */
-#define PID_RECYCLE 1000
-
 /*
  * Maximum number of cycles we allow. This is equivalent to PID_MAX_DEFAULT.
  * If users set a higher limit or we have cycled PIDFD_MAX_DEFAULT number of
@@ -147,17 +145,6 @@ static int test_pidfd_send_signal_exited_fail(void)
  */
 #define PIDFD_MAX_DEFAULT 0x8000
 
-/*
- * Define a few custom error codes for the child process to clearly indicate
- * what is happening. This way we can tell the difference between a system
- * error, a test error, etc.
- */
-#define PIDFD_PASS 0
-#define PIDFD_FAIL 1
-#define PIDFD_ERROR 2
-#define PIDFD_SKIP 3
-#define PIDFD_XFAIL 4
-
 static int test_pidfd_send_signal_recycled_pid_fail(void)
 {
        int i, ret;
@@ -372,11 +359,192 @@ static int test_pidfd_send_signal_syscall_support(void)
        return 0;
 }
 
+static void *test_pidfd_poll_exec_thread(void *priv)
+{
+       ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n",
+                       getpid(), syscall(SYS_gettid));
+       ksft_print_msg("Child Thread: doing exec of sleep\n");
+
+       execl("/bin/sleep", "sleep", str(CHILD_THREAD_MIN_WAIT), (char *)NULL);
+
+       ksft_print_msg("Child Thread: DONE. pid %d tid %d\n",
+                       getpid(), syscall(SYS_gettid));
+       return NULL;
+}
+
+static void poll_pidfd(const char *test_name, int pidfd)
+{
+       int c;
+       int epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+       struct epoll_event event, events[MAX_EVENTS];
+
+       if (epoll_fd == -1)
+               ksft_exit_fail_msg("%s test: Failed to create epoll file descriptor "
+                                  "(errno %d)\n",
+                                  test_name, errno);
+
+       event.events = EPOLLIN;
+       event.data.fd = pidfd;
+
+       if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pidfd, &event)) {
+               ksft_exit_fail_msg("%s test: Failed to add epoll file descriptor "
+                                  "(errno %d)\n",
+                                  test_name, errno);
+       }
+
+       c = epoll_wait(epoll_fd, events, MAX_EVENTS, 5000);
+       if (c != 1 || !(events[0].events & EPOLLIN))
+               ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x) ",
+                                  "(errno %d)\n",
+                                  test_name, c, events[0].events, errno);
+
+       close(epoll_fd);
+       return;
+
+}
+
+static int child_poll_exec_test(void *args)
+{
+       pthread_t t1;
+
+       ksft_print_msg("Child (pidfd): starting. pid %d tid %d\n", getpid(),
+                       syscall(SYS_gettid));
+       pthread_create(&t1, NULL, test_pidfd_poll_exec_thread, NULL);
+       /*
+        * Exec in the non-leader thread will destroy the leader immediately.
+        * If the wait in the parent returns too soon, the test fails.
+        */
+       while (1)
+               sleep(1);
+}
+
+static void test_pidfd_poll_exec(int use_waitpid)
+{
+       int pid, pidfd = 0;
+       int status, ret;
+       pthread_t t1;
+       time_t prog_start = time(NULL);
+       const char *test_name = "pidfd_poll check for premature notification on child thread exec";
+
+       ksft_print_msg("Parent: pid: %d\n", getpid());
+       pid = pidfd_clone(CLONE_PIDFD, &pidfd, child_poll_exec_test);
+       if (pid < 0)
+               ksft_exit_fail_msg("%s test: pidfd_clone failed (ret %d, errno %d)\n",
+                                  test_name, pid, errno);
+
+       ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);
+
+       if (use_waitpid) {
+               ret = waitpid(pid, &status, 0);
+               if (ret == -1)
+                       ksft_print_msg("Parent: error\n");
+
+               if (ret == pid)
+                       ksft_print_msg("Parent: Child process waited for.\n");
+       } else {
+               poll_pidfd(test_name, pidfd);
+       }
+
+       time_t prog_time = time(NULL) - prog_start;
+
+       ksft_print_msg("Time waited for child: %lu\n", prog_time);
+
+       close(pidfd);
+
+       if (prog_time < CHILD_THREAD_MIN_WAIT || prog_time > CHILD_THREAD_MIN_WAIT + 2)
+               ksft_exit_fail_msg("%s test: Failed\n", test_name);
+       else
+               ksft_test_result_pass("%s test: Passed\n", test_name);
+}
+
+static void *test_pidfd_poll_leader_exit_thread(void *priv)
+{
+       ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n",
+                       getpid(), syscall(SYS_gettid));
+       sleep(CHILD_THREAD_MIN_WAIT);
+       ksft_print_msg("Child Thread: DONE. pid %d tid %d\n", getpid(), syscall(SYS_gettid));
+       return NULL;
+}
+
+static time_t *child_exit_secs;
+static int child_poll_leader_exit_test(void *args)
+{
+       pthread_t t1, t2;
+
+       ksft_print_msg("Child: starting. pid %d tid %d\n", getpid(), syscall(SYS_gettid));
+       pthread_create(&t1, NULL, test_pidfd_poll_leader_exit_thread, NULL);
+       pthread_create(&t2, NULL, test_pidfd_poll_leader_exit_thread, NULL);
+
+       /*
+        * glibc exit calls exit_group syscall, so explicitly call exit only
+        * so that only the group leader exits, leaving the threads alone.
+        */
+       *child_exit_secs = time(NULL);
+       syscall(SYS_exit, 0);
+}
+
+static void test_pidfd_poll_leader_exit(int use_waitpid)
+{
+       int pid, pidfd = 0;
+       int status, ret;
+       time_t prog_start = time(NULL);
+       const char *test_name = "pidfd_poll check for premature notification on non-empty"
+                               "group leader exit";
+
+       child_exit_secs = mmap(NULL, sizeof *child_exit_secs, PROT_READ | PROT_WRITE,
+                       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+
+       if (child_exit_secs == MAP_FAILED)
+               ksft_exit_fail_msg("%s test: mmap failed (errno %d)\n",
+                                  test_name, errno);
+
+       ksft_print_msg("Parent: pid: %d\n", getpid());
+       pid = pidfd_clone(CLONE_PIDFD, &pidfd, child_poll_leader_exit_test);
+       if (pid < 0)
+               ksft_exit_fail_msg("%s test: pidfd_clone failed (ret %d, errno %d)\n",
+                                  test_name, pid, errno);
+
+       ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);
+
+       if (use_waitpid) {
+               ret = waitpid(pid, &status, 0);
+               if (ret == -1)
+                       ksft_print_msg("Parent: error\n");
+       } else {
+               /*
+                * This sleep tests the case where the child has exited and is in
+                * EXIT_ZOMBIE, but its thread group is non-empty: the poll must
+                * not return prematurely while there are still active threads.
+                */
+               sleep(1);
+               poll_pidfd(test_name, pidfd);
+       }
+
+       if (ret == pid)
+               ksft_print_msg("Parent: Child process waited for.\n");
+
+       time_t since_child_exit = time(NULL) - *child_exit_secs;
+
+       ksft_print_msg("Time since child exit: %lu\n", since_child_exit);
+
+       close(pidfd);
+
+       if (since_child_exit < CHILD_THREAD_MIN_WAIT ||
+                       since_child_exit > CHILD_THREAD_MIN_WAIT + 2)
+               ksft_exit_fail_msg("%s test: Failed\n", test_name);
+       else
+               ksft_test_result_pass("%s test: Passed\n", test_name);
+}
+
 int main(int argc, char **argv)
 {
        ksft_print_header();
        ksft_set_plan(4);
 
+       test_pidfd_poll_exec(0);
+       test_pidfd_poll_exec(1);
+       test_pidfd_poll_leader_exit(0);
+       test_pidfd_poll_leader_exit(1);
        test_pidfd_send_signal_syscall_support();
        test_pidfd_send_signal_simple_success();
        test_pidfd_send_signal_exited_fail();