whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit a9dce6679d736cb3d612af39bab9f31f8db66f9b
parent f67e3fb4891287b8248ebb3320f794b9f5e782d4
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sat, 16 Mar 2019 13:47:14 -0700

Merge tag 'pidfd-v5.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux

Pull pidfd system call from Christian Brauner:
 "This introduces the ability to use file descriptors from /proc/<pid>/
  as stable handles on struct pid. Even if a pid is recycled the handle
  will not change. For a start these fds can be used to send signals to
  the processes they refer to.

  With the ability to use /proc/<pid> fds as stable handles on struct
  pid we can fix a long-standing issue where after a process has exited
  its pid can be reused by another process. If a caller sends a signal
  to a reused pid it will end up signaling the wrong process.

  With this patchset we enable a variety of use cases. One obvious
  example is that we can now safely delegate an important part of
  process management - sending signals - to processes other than the
  parent of a given process by sending file descriptors around via scm
  rights and not fearing that the given process will have been recycled
  in the meantime. It also allows for easy testing whether a given
  process is still alive or not by sending signal 0 to a pidfd which is
  quite handy.

  There has been some interest in this feature e.g. from systems
  management (systemd, glibc) and container managers. I have requested
  and gotten comments from glibc to make sure that this syscall is
  suitable for their needs as well. In the future I expect it to take on
  most other pid-based signal syscalls. But such features are left for
  the future once they are needed.

  This has been sitting in linux-next for quite a while and has not
  caused any issues. It comes with selftests which verify basic
  functionality and also test that a recycled pid cannot be signaled via
  a pidfd.

  Jon has written about a prior version of this patchset. It should
  cover the basic functionality since not a lot has changed since then:

      https://lwn.net/Articles/773459/

  The commit message for the syscall itself is extensively documenting
  the syscall, including it's functionality and extensibility"

* tag 'pidfd-v5.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  selftests: add tests for pidfd_send_signal()
  signal: add pidfd_send_signal() syscall

Diffstat:
March/x86/entry/syscalls/syscall_32.tbl | 1+
March/x86/entry/syscalls/syscall_64.tbl | 1+
Mfs/proc/base.c | 9+++++++++
Minclude/linux/proc_fs.h | 6++++++
Minclude/linux/syscalls.h | 3+++
Minclude/uapi/asm-generic/unistd.h | 2++
Mkernel/signal.c | 133+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mkernel/sys_ni.c | 1+
Mtools/testing/selftests/Makefile | 1+
Atools/testing/selftests/pidfd/Makefile | 6++++++
Atools/testing/selftests/pidfd/pidfd_test.c | 381+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
11 files changed, 538 insertions(+), 6 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl @@ -429,6 +429,7 @@ 421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait_time64 422 i386 futex_time64 sys_futex __ia32_sys_futex 423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval __ia32_sys_sched_rr_get_interval +424 i386 pidfd_send_signal sys_pidfd_send_signal __ia32_sys_pidfd_send_signal 425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup 426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter 427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 334 common rseq __x64_sys_rseq # don't use numbers 387 through 423, add new calls after the last # 'common' entry +424 common pidfd_send_signal __x64_sys_pidfd_send_signal 425 common io_uring_setup __x64_sys_io_uring_setup 426 common io_uring_enter __x64_sys_io_uring_enter 427 common io_uring_register __x64_sys_io_uring_register diff --git a/fs/proc/base.c b/fs/proc/base.c @@ -3074,6 +3074,15 @@ static const struct file_operations proc_tgid_base_operations = { .llseek = generic_file_llseek, }; +struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + if (!d_is_dir(file->f_path.dentry) || + (file->f_op != &proc_tgid_base_operations)) + return ERR_PTR(-EBADF); + + return proc_pid(file_inode(file)); +} + static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h @@ -73,6 +73,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo int (*show)(struct seq_file *, void *), proc_write_t write, void *data); +extern struct pid *tgid_pidfd_to_pid(const struct file *file); #else /* CONFIG_PROC_FS */ @@ -114,6 +115,11 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;}) #define proc_create_net_single(name, mode, parent, show, data) ({NULL;}) +static inline struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + return ERR_PTR(-EBADF); +} + #endif /* CONFIG_PROC_FS */ struct net; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h @@ -985,6 +985,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); +asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, + siginfo_t __user *info, + unsigned int flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h @@ -824,6 +824,8 @@ __SYSCALL(__NR_futex_time64, sys_futex) __SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval) #endif +#define __NR_pidfd_send_signal 424 +__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal) #define __NR_io_uring_setup 425 __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) #define __NR_io_uring_enter 426 diff --git a/kernel/signal.c b/kernel/signal.c @@ -19,7 +19,9 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/file.h> #include <linux/fs.h> +#include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/coredump.h> @@ -3487,6 +3489,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, #endif #endif +static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info) +{ + clear_siginfo(info); + info->si_signo = sig; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_tgid_vnr(current); + info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); +} + /** * sys_kill - send a signal to a process * @pid: the PID of the process @@ -3496,16 +3508,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; - clear_siginfo(&info); - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = task_tgid_vnr(current); - info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + prepare_kill_siginfo(sig, &info); return kill_something_info(sig, &info, pid); } +#ifdef CONFIG_PROC_FS +/* + * Verify that the signaler and signalee either are in the same pid namespace + * or that the signaler's pid namespace is an ancestor of the signalee's pid + * namespace. + */ +static bool access_pidfd_pidns(struct pid *pid) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *p = ns_of_pid(pid); + + for (;;) { + if (!p) + return false; + if (p == active) + break; + p = p->parent; + } + + return true; +} + +static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) +{ +#ifdef CONFIG_COMPAT + /* + * Avoid hooking up compat syscalls and instead handle necessary + * conversions here. Note, this is a stop-gap measure and should not be + * considered a generic solution. + */ + if (in_compat_syscall()) + return copy_siginfo_from_user32( + kinfo, (struct compat_siginfo __user *)info); +#endif + return copy_siginfo_from_user(kinfo, info); +} + +/** + * sys_pidfd_send_signal - send a signal to a process through a task file + * descriptor + * @pidfd: the file descriptor of the process + * @sig: signal to be sent + * @info: the signal info + * @flags: future flags to be passed + * + * The syscall currently only signals via PIDTYPE_PID which covers + * kill(<positive-pid>, <signal>. It does not signal threads or process + * groups. + * In order to extend the syscall to threads and process groups the @flags + * argument should be used. In essence, the @flags argument will determine + * what is signaled and not the file descriptor itself. Put in other words, + * grouping is a property of the flags argument not a property of the file + * descriptor. + * + * Return: 0 on success, negative errno on failure + */ +SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, + siginfo_t __user *, info, unsigned int, flags) +{ + int ret; + struct fd f; + struct pid *pid; + kernel_siginfo_t kinfo; + + /* Enforce flags be set to 0 until we add an extension. */ + if (flags) + return -EINVAL; + + f = fdget_raw(pidfd); + if (!f.file) + return -EBADF; + + /* Is this a pidfd? */ + pid = tgid_pidfd_to_pid(f.file); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto err; + } + + ret = -EINVAL; + if (!access_pidfd_pidns(pid)) + goto err; + + if (info) { + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + goto err; + + ret = -EINVAL; + if (unlikely(sig != kinfo.si_signo)) + goto err; + + if ((task_pid(current) != pid) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) { + /* Only allow sending arbitrary signals to yourself. */ + ret = -EPERM; + if (kinfo.si_code != SI_USER) + goto err; + + /* Turn this into a regular kill signal. */ + prepare_kill_siginfo(sig, &kinfo); + } + } else { + prepare_kill_siginfo(sig, &kinfo); + } + + ret = kill_pid_info(sig, &kinfo, pid); + +err: + fdput(f); + return ret; +} +#endif /* CONFIG_PROC_FS */ + static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c @@ -168,6 +168,7 @@ COND_SYSCALL(syslog); /* kernel/sched/core.c */ /* kernel/signal.c */ +COND_SYSCALL(pidfd_send_signal); /* kernel/sys.c */ COND_SYSCALL(setregid); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile @@ -32,6 +32,7 @@ TARGETS += net TARGETS += netfilter TARGETS += networking/timestamping TARGETS += nsfs +TARGETS += pidfd TARGETS += powerpc TARGETS += proc TARGETS += pstore diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile @@ -0,0 +1,6 @@ +CFLAGS += -g -I../../../../usr/include/ + +TEST_GEN_PROGS := pidfd_test + +include ../lib.mk + diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c @@ -0,0 +1,381 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <linux/types.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syscall.h> +#include <sys/mount.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "../kselftest.h" + +static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, + unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int signal_received; + +static void set_signal_received_on_sigusr1(int sig) +{ + if (sig == SIGUSR1) + signal_received = 1; +} + +/* + * Straightforward test to see whether pidfd_send_signal() works is to send + * a signal to ourself. + */ +static int test_pidfd_send_signal_simple_success(void) +{ + int pidfd, ret; + const char *test_name = "pidfd_send_signal send SIGUSR1"; + + pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC); + if (pidfd < 0) + ksft_exit_fail_msg( + "%s test: Failed to open process file descriptor\n", + test_name); + + signal(SIGUSR1, set_signal_received_on_sigusr1); + + ret = sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0); + close(pidfd); + if (ret < 0) + ksft_exit_fail_msg("%s test: Failed to send signal\n", + test_name); + + if (signal_received != 1) + ksft_exit_fail_msg("%s test: Failed to receive signal\n", + test_name); + + signal_received = 0; + ksft_test_result_pass("%s test: Sent signal\n", test_name); + return 0; +} + +static int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (ret != pid) + goto again; + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +static int test_pidfd_send_signal_exited_fail(void) +{ + int pidfd, ret, saved_errno; + char buf[256]; + pid_t pid; + const char *test_name = "pidfd_send_signal signal exited process"; + + pid = fork(); + if (pid < 0) + ksft_exit_fail_msg("%s test: Failed to create new process\n", + test_name); + + if (pid == 0) + _exit(EXIT_SUCCESS); + + snprintf(buf, sizeof(buf), "/proc/%d", pid); + + pidfd = open(buf, O_DIRECTORY | O_CLOEXEC); + + (void)wait_for_pid(pid); + + if (pidfd < 0) + ksft_exit_fail_msg( + "%s test: Failed to open process file descriptor\n", + test_name); + + ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); + saved_errno = errno; + close(pidfd); + if (ret == 0) + ksft_exit_fail_msg( + "%s test: Managed to send signal to process even though it should have failed\n", + test_name); + + if (saved_errno != ESRCH) + ksft_exit_fail_msg( + "%s test: Expected to receive ESRCH as errno value but received %d instead\n", + test_name, saved_errno); + + ksft_test_result_pass("%s test: Failed to send signal as expected\n", + test_name); + return 0; +} + +/* + * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c + * That means, when it wraps around any pid < 300 will be skipped. + * So we need to use a pid > 300 in order to test recycling. + */ +#define PID_RECYCLE 1000 + +/* + * Maximum number of cycles we allow. This is equivalent to PID_MAX_DEFAULT. + * If users set a higher limit or we have cycled PIDFD_MAX_DEFAULT number of + * times then we skip the test to not go into an infinite loop or block for a + * long time. + */ +#define PIDFD_MAX_DEFAULT 0x8000 + +/* + * Define a few custom error codes for the child process to clearly indicate + * what is happening. This way we can tell the difference between a system + * error, a test error, etc. + */ +#define PIDFD_PASS 0 +#define PIDFD_FAIL 1 +#define PIDFD_ERROR 2 +#define PIDFD_SKIP 3 +#define PIDFD_XFAIL 4 + +static int test_pidfd_send_signal_recycled_pid_fail(void) +{ + int i, ret; + pid_t pid1; + const char *test_name = "pidfd_send_signal signal recycled pid"; + + ret = unshare(CLONE_NEWPID); + if (ret < 0) + ksft_exit_fail_msg("%s test: Failed to unshare pid namespace\n", + test_name); + + ret = unshare(CLONE_NEWNS); + if (ret < 0) + ksft_exit_fail_msg( + "%s test: Failed to unshare mount namespace\n", + test_name); + + ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0); + if (ret < 0) + ksft_exit_fail_msg("%s test: Failed to remount / private\n", + test_name); + + /* pid 1 in new pid namespace */ + pid1 = fork(); + if (pid1 < 0) + ksft_exit_fail_msg("%s test: Failed to create new process\n", + test_name); + + if (pid1 == 0) { + char buf[256]; + pid_t pid2; + int pidfd = -1; + + (void)umount2("/proc", MNT_DETACH); + ret = mount("proc", "/proc", "proc", 0, NULL); + if (ret < 0) + _exit(PIDFD_ERROR); + + /* grab pid PID_RECYCLE */ + for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) { + pid2 = fork(); + if (pid2 < 0) + _exit(PIDFD_ERROR); + + if (pid2 == 0) + _exit(PIDFD_PASS); + + if (pid2 == PID_RECYCLE) { + snprintf(buf, sizeof(buf), "/proc/%d", pid2); + ksft_print_msg("pid to recycle is %d\n", pid2); + pidfd = open(buf, O_DIRECTORY | O_CLOEXEC); + } + + if (wait_for_pid(pid2)) + _exit(PIDFD_ERROR); + + if (pid2 >= PID_RECYCLE) + break; + } + + /* + * We want to be as predictable as we can so if we haven't been + * able to grab pid PID_RECYCLE skip the test. + */ + if (pid2 != PID_RECYCLE) { + /* skip test */ + close(pidfd); + _exit(PIDFD_SKIP); + } + + if (pidfd < 0) + _exit(PIDFD_ERROR); + + for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) { + char c; + int pipe_fds[2]; + pid_t recycled_pid; + int child_ret = PIDFD_PASS; + + ret = pipe2(pipe_fds, O_CLOEXEC); + if (ret < 0) + _exit(PIDFD_ERROR); + + recycled_pid = fork(); + if (recycled_pid < 0) + _exit(PIDFD_ERROR); + + if (recycled_pid == 0) { + close(pipe_fds[1]); + (void)read(pipe_fds[0], &c, 1); + close(pipe_fds[0]); + + _exit(PIDFD_PASS); + } + + /* + * Stop the child so we can inspect whether we have + * recycled pid PID_RECYCLE. + */ + close(pipe_fds[0]); + ret = kill(recycled_pid, SIGSTOP); + close(pipe_fds[1]); + if (ret) { + (void)wait_for_pid(recycled_pid); + _exit(PIDFD_ERROR); + } + + /* + * We have recycled the pid. Try to signal it. This + * needs to fail since this is a different process than + * the one the pidfd refers to. + */ + if (recycled_pid == PID_RECYCLE) { + ret = sys_pidfd_send_signal(pidfd, SIGCONT, + NULL, 0); + if (ret && errno == ESRCH) + child_ret = PIDFD_XFAIL; + else + child_ret = PIDFD_FAIL; + } + + /* let the process move on */ + ret = kill(recycled_pid, SIGCONT); + if (ret) + (void)kill(recycled_pid, SIGKILL); + + if (wait_for_pid(recycled_pid)) + _exit(PIDFD_ERROR); + + switch (child_ret) { + case PIDFD_FAIL: + /* fallthrough */ + case PIDFD_XFAIL: + _exit(child_ret); + case PIDFD_PASS: + break; + default: + /* not reached */ + _exit(PIDFD_ERROR); + } + + /* + * If the user set a custom pid_max limit we could be + * in the millions. + * Skip the test in this case. + */ + if (recycled_pid > PIDFD_MAX_DEFAULT) + _exit(PIDFD_SKIP); + } + + /* failed to recycle pid */ + _exit(PIDFD_SKIP); + } + + ret = wait_for_pid(pid1); + switch (ret) { + case PIDFD_FAIL: + ksft_exit_fail_msg( + "%s test: Managed to signal recycled pid %d\n", + test_name, PID_RECYCLE); + case PIDFD_PASS: + ksft_exit_fail_msg("%s test: Failed to recycle pid %d\n", + test_name, PID_RECYCLE); + case PIDFD_SKIP: + ksft_print_msg("%s test: Skipping test\n", test_name); + ret = 0; + break; + case PIDFD_XFAIL: + ksft_test_result_pass( + "%s test: Failed to signal recycled pid as expected\n", + test_name); + ret = 0; + break; + default /* PIDFD_ERROR */: + ksft_exit_fail_msg("%s test: Error while running tests\n", + test_name); + } + + return ret; +} + +static int test_pidfd_send_signal_syscall_support(void) +{ + int pidfd, ret; + const char *test_name = "pidfd_send_signal check for support"; + + pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC); + if (pidfd < 0) + ksft_exit_fail_msg( + "%s test: Failed to open process file descriptor\n", + test_name); + + ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); + if (ret < 0) { + /* + * pidfd_send_signal() will currently return ENOSYS when + * CONFIG_PROC_FS is not set. + */ + if (errno == ENOSYS) + ksft_exit_skip( + "%s test: pidfd_send_signal() syscall not supported (Ensure that CONFIG_PROC_FS=y is set)\n", + test_name); + + ksft_exit_fail_msg("%s test: Failed to send signal\n", + test_name); + } + + close(pidfd); + ksft_test_result_pass( + "%s test: pidfd_send_signal() syscall is supported. Tests can be executed\n", + test_name); + return 0; +} + +int main(int argc, char **argv) +{ + ksft_print_header(); + + test_pidfd_send_signal_syscall_support(); + test_pidfd_send_signal_simple_success(); + test_pidfd_send_signal_exited_fail(); + test_pidfd_send_signal_recycled_pid_fail(); + + return ksft_exit_pass(); +}