Merge tag 'threads-v5.6' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner...
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 30 Jan 2020 03:38:34 +0000 (19:38 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 30 Jan 2020 03:38:34 +0000 (19:38 -0800)
Pull thread management updates from Christian Brauner:
 "Sargun Dhillon over the last cycle has worked on the pidfd_getfd()
  syscall.

  This syscall allows for the retrieval of file descriptors of a process
  based on its pidfd. A task needs to have ptrace_may_access()
  permissions with PTRACE_MODE_ATTACH_REALCREDS (suggested by Oleg and
  Andy) on the target.

  One of the main use-cases is in combination with seccomp's user
  notification feature. As a reminder, seccomp's user notification
  feature was made available in v5.0. It allows a task to retrieve a
  file descriptor for its seccomp filter. The file descriptor is usually
  handed off to a more privileged supervising process. The supervisor can
  then listen for syscall events caught by the seccomp filter of the
  supervisee and perform actions in lieu of the supervisee, usually
  emulating syscalls. pidfd_getfd() is needed to expand its uses.

  There are currently two major users that wait on pidfd_getfd() and one
  future user:

   - Netflix, Sargun said, is working on a service mesh where users
     should be able to connect to a dns-based VIP. When a user connects
     to e.g. 1.2.3.4:80 that runs e.g. service "foo" they will be
     redirected to an envoy process. This service mesh uses seccomp user
     notifications and pidfd to intercept all connect calls and instead
     of connecting them to 1.2.3.4:80 connects them to e.g.
     127.0.0.1:8080.

   - LXD uses the seccomp notifier heavily to intercept and emulate
     mknod() and mount() syscalls for unprivileged containers/processes.
     With pidfd_getfd() more use-cases e.g. bridging socket connections
     will be possible.

   - The patchset has also seen some interest from the browser corner.
     Right now, Firefox is using a SECCOMP_RET_TRAP sandbox managed by a
     broker process. In the future glibc will start blocking all signals
     during dlopen() rendering this type of sandbox impossible. Hence,
     in the future Firefox will switch to a seccomp-user-notification
     based sandbox which also makes use of file descriptor retrieval.
     The thread for this can be found at
     https://sourceware.org/ml/libc-alpha/2019-12/msg00079.html

  With pidfd_getfd() it is e.g. possible to bridge socket connections
  for the supervisee (binding to a privileged port) and to take actions
  on file descriptors on behalf of the supervisee in general.

  Sargun's first version was using an ioctl on pidfds but various people
  pushed for it to be a proper syscall which he duly implemented as
  well over various review cycles. Selftests are of course included.
  I've also added instructions on how to deal with merge conflicts below.

  There's also a small fix coming from the kernel mentee project to
  correctly annotate struct sighand_struct with __rcu to fix various
  sparse warnings. We've received a few more such fixes and even though
  they are mostly trivial I've decided to postpone them until after -rc1
  since they came in rather late and I don't want to risk introducing
  build warnings.

  Finally, there's a new prctl() command PR_{G,S}ET_IO_FLUSHER which is
  needed to avoid allocation recursions triggerable by storage drivers
  that have userspace parts that run in the IO path (e.g. dm-multipath,
  iscsi, etc). These allocation recursions deadlock the device.

  The new prctl() allows such privileged userspace components to avoid
  allocation recursions by setting the PF_MEMALLOC_NOIO and
  PF_LESS_THROTTLE flags. The patch carries the necessary acks from the
  relevant maintainers and is routed here as part of prctl()
  thread-management."

* tag 'threads-v5.6' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  prctl: PR_{G,S}ET_IO_FLUSHER to support controlling memory reclaim
  sched.h: Annotate sighand_struct with __rcu
  test: Add test for pidfd getfd
  arch: wire up pidfd_getfd syscall
  pid: Implement pidfd_getfd syscall
  vfs, fdtable: Add fget_task helper

22 files changed:
1  2 
arch/alpha/kernel/syscalls/syscall.tbl
arch/arm/tools/syscall.tbl
arch/arm64/include/asm/unistd.h
arch/arm64/include/asm/unistd32.h
arch/ia64/kernel/syscalls/syscall.tbl
arch/m68k/kernel/syscalls/syscall.tbl
arch/microblaze/kernel/syscalls/syscall.tbl
arch/mips/kernel/syscalls/syscall_n32.tbl
arch/mips/kernel/syscalls/syscall_n64.tbl
arch/mips/kernel/syscalls/syscall_o32.tbl
arch/parisc/kernel/syscalls/syscall.tbl
arch/powerpc/kernel/syscalls/syscall.tbl
arch/s390/kernel/syscalls/syscall.tbl
arch/sh/kernel/syscalls/syscall.tbl
arch/sparc/kernel/syscalls/syscall.tbl
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/entry/syscalls/syscall_64.tbl
arch/xtensa/kernel/syscalls/syscall.tbl
fs/file.c
include/linux/sched.h
include/linux/syscalls.h
include/uapi/asm-generic/unistd.h

index 4d7f2ffa957c14c5422b12da725614d866d9970f,82301080f5e7d53845983e933aa39bfe9bff6964..36d42da7466aae5ccd8a8a70d0e70014ae63c2df
  543   common  fspick                          sys_fspick
  544   common  pidfd_open                      sys_pidfd_open
  # 545 reserved for clone3
 +547   common  openat2                         sys_openat2
+ 548   common  pidfd_getfd                     sys_pidfd_getfd
index 4ba54bc7e19a6795d417582ae5e3ed33e3c17e28,ba045e2f3a60e77f567884328d365fd68f68501c..4d1cf74a2caac645c4b217134a066a12bcbe7056
  433   common  fspick                          sys_fspick
  434   common  pidfd_open                      sys_pidfd_open
  435   common  clone3                          sys_clone3
 +437   common  openat2                         sys_openat2
+ 438   common  pidfd_getfd                     sys_pidfd_getfd
Simple merge
index 57f6f592d460c990a232cfa982bc440b732c912d,a8da97a2de4100b6d07a7ea882eddfa78fd4cfaa..c1c61635f89c374c80cab2410e87fbe261209540
@@@ -879,8 -879,8 +879,10 @@@ __SYSCALL(__NR_fspick, sys_fspick
  __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
  #define __NR_clone3 435
  __SYSCALL(__NR_clone3, sys_clone3)
 +#define __NR_openat2 437
 +__SYSCALL(__NR_openat2, sys_openat2)
+ #define __NR_pidfd_getfd 438
+ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
  
  /*
   * Please add new compat syscalls above this comment and update
index 8d36f2e2dc892272eeac9caa963a7a72c7e6b3c5,2b11adfc860c8ebd94e1f3e846c4634f574e47ef..042911e670b80179a74e0b55f8f8a644ccb49f1c
  433   common  fspick                          sys_fspick
  434   common  pidfd_open                      sys_pidfd_open
  # 435 reserved for clone3
 +437   common  openat2                         sys_openat2
+ 438   common  pidfd_getfd                     sys_pidfd_getfd
index b911e0f50a71fcd51665b5fdc882a951ff95a1e1,44e879e9845986f9fdad3017d3288859d0e2da43..f4f49fcb76d0fe3e7107cc9e68a70aabf34ccb27
  432   common  fsmount                         sys_fsmount
  433   common  fspick                          sys_fspick
  434   common  pidfd_open                      sys_pidfd_open
 -# 435 reserved for clone3
 +435   common  clone3                          __sys_clone3
 +437   common  openat2                         sys_openat2
+ 438   common  pidfd_getfd                     sys_pidfd_getfd
index c04385e60833d6226be5a3663fdf3d5a35545532,7afa00125cc456968c5d08b7eb50603b4c069e2a..4c67b11f9c9ef83f70fb3b7cb891dbe81e271564
  433   common  fspick                          sys_fspick
  434   common  pidfd_open                      sys_pidfd_open
  435   common  clone3                          sys_clone3
 +437   common  openat2                         sys_openat2
+ 438   common  pidfd_getfd                     sys_pidfd_getfd
index 68c9ec06851f8942fe0e848cbc531a2346fdc188,856d5ba3446106bc4a4d3f088f99ca000a709ad7..1f9e8ad636cce34a128d143e604b6847caae0438
  433   n32     fspick                          sys_fspick
  434   n32     pidfd_open                      sys_pidfd_open
  435   n32     clone3                          __sys_clone3
 +437   n32     openat2                         sys_openat2
+ 438   n32     pidfd_getfd                     sys_pidfd_getfd
index 42a72d01005087db40910b2b359b6fdedce5867e,2db6075352f39a926f6019a4ed57827bf1ef5593..c0b9d802dbf6dd82c9b4be2a799871a5df8ef3db
  433   n64     fspick                          sys_fspick
  434   n64     pidfd_open                      sys_pidfd_open
  435   n64     clone3                          __sys_clone3
 +437   n64     openat2                         sys_openat2
+ 438   n64     pidfd_getfd                     sys_pidfd_getfd
index f114c4aed0ed901fe3d4ac97f0a43d0040534268,e9f9d4a9b10528e6cb957b85bd26288f22079a12..ac586774c980537ed69d85ab4850adb79cd2c9e8
  433   o32     fspick                          sys_fspick
  434   o32     pidfd_open                      sys_pidfd_open
  435   o32     clone3                          __sys_clone3
 +437   o32     openat2                         sys_openat2
+ 438   o32     pidfd_getfd                     sys_pidfd_getfd
index b550ae9a7fea9d55260a5ff1722d5df6abf09ef8,c58c7eb144ca08290f00db5be83f34fafc8278d0..52a15f5cd1308d222a821a231244d0471ab7be07
  433   common  fspick                          sys_fspick
  434   common  pidfd_open                      sys_pidfd_open
  435   common  clone3                          sys_clone3_wrapper
 +437   common  openat2                         sys_openat2
+ 438   common  pidfd_getfd                     sys_pidfd_getfd
index a8b5ecb5b602c2d44c92c31d9b74c5746f5d2f98,707609bfe3ea37cf37f5455eb4d9f6fec073e9cb..35b61bfc1b1ae928158dee422a150c19c8d30e4e
  433   common  fspick                          sys_fspick
  434   common  pidfd_open                      sys_pidfd_open
  435   nospu   clone3                          ppc_clone3
 +437   common  openat2                         sys_openat2
+ 438   common  pidfd_getfd                     sys_pidfd_getfd
index 16b571c06161d534841a0dedb09034e4abbe8585,185cd624face9646dfeaf3047c899f60184d2737..bd7bd3581a0fcd4f830d774c8fb98b53fa0066db
  433  common   fspick                  sys_fspick                      sys_fspick
  434  common   pidfd_open              sys_pidfd_open                  sys_pidfd_open
  435  common   clone3                  sys_clone3                      sys_clone3
 +437  common   openat2                 sys_openat2                     sys_openat2
+ 438  common   pidfd_getfd             sys_pidfd_getfd                 sys_pidfd_getfd
index a7185cc1862657f1d1bfe045ce9f7ef19ba7c671,88f90895aad8960f41a45eff751d95d0054b2d1d..c7a30fcd135f89cc4882ed67e8c6bf5a4ea54347
  433   common  fspick                          sys_fspick
  434   common  pidfd_open                      sys_pidfd_open
  # 435 reserved for clone3
 +437   common  openat2                         sys_openat2
+ 438   common  pidfd_getfd                     sys_pidfd_getfd
index b11c195520221947919ab63c66c990d0d695f5b4,218df6a2326e9988fa2f73193d18bda08f2423ba..f13615ecdecce2f69adbd13576b03a6aa4d8fbef
  433   common  fspick                          sys_fspick
  434   common  pidfd_open                      sys_pidfd_open
  # 435 reserved for clone3
 +437   common  openat2                 sys_openat2
+ 438   common  pidfd_getfd                     sys_pidfd_getfd
index d22a8b5c3fab97d6e29b3d78d362432e2edcf828,9c3101b65e0f6e04ae2cc8e3b8f319f4e117531a..c17cb77eb15093ea0c6e34f293a7214fcd75da61
  433   i386    fspick                  sys_fspick                      __ia32_sys_fspick
  434   i386    pidfd_open              sys_pidfd_open                  __ia32_sys_pidfd_open
  435   i386    clone3                  sys_clone3                      __ia32_sys_clone3
 +437   i386    openat2                 sys_openat2                     __ia32_sys_openat2
+ 438   i386    pidfd_getfd             sys_pidfd_getfd                 __ia32_sys_pidfd_getfd
index 9035647ef236d5355a223162698eba0b49b50c53,cef85db75a62bf8caf468bd6a3aa0a14ae934a79..44d510bc9b7877a18c082ceb168f01e94db0417b
  433   common  fspick                  __x64_sys_fspick
  434   common  pidfd_open              __x64_sys_pidfd_open
  435   common  clone3                  __x64_sys_clone3/ptregs
 +437   common  openat2                 __x64_sys_openat2
+ 438   common  pidfd_getfd             __x64_sys_pidfd_getfd
  
  #
  # x32-specific system call numbers start at 512 to avoid cache impact
index f0a68013c0384c53c66284c320b7fc78728575c5,ae15183def12c962cdad96572078aa20d4fc7d9c..85a9ab1bc04dab5cae943dbe168aedfe452fa6de
  433   common  fspick                          sys_fspick
  434   common  pidfd_open                      sys_pidfd_open
  435   common  clone3                          sys_clone3
 +437   common  openat2                         sys_openat2
+ 438   common  pidfd_getfd                     sys_pidfd_getfd
diff --cc fs/file.c
Simple merge
Simple merge
Simple merge
index d4122c0914720931f125e552ce58083683133998,d36ec3d645bd16941bb3472d1ab5875e2686e389..3a3201e4618ef8c7445895b26f6eebbaea1574f9
@@@ -850,12 -850,11 +850,14 @@@ __SYSCALL(__NR_pidfd_open, sys_pidfd_op
  #define __NR_clone3 435
  __SYSCALL(__NR_clone3, sys_clone3)
  #endif
 +
 +#define __NR_openat2 437
 +__SYSCALL(__NR_openat2, sys_openat2)
+ #define __NR_pidfd_getfd 438
+ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
  
  #undef __NR_syscalls
- #define __NR_syscalls 438
+ #define __NR_syscalls 439
  
  /*
   * 32 bit systems traditionally used different