whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit d9a7fa67b4bfe6ce93ee9aab23ae2e7ca0763e84
parent f218a29c25ad8abdb961435d6b8139f462061364
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Wed,  2 Jan 2019 09:48:13 -0800

Merge branch 'next-seccomp' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security

Pull seccomp updates from James Morris:

 - Add SECCOMP_RET_USER_NOTIF

 - seccomp fixes for sparse warnings and s390 build (Tycho)

* 'next-seccomp' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security:
  seccomp, s390: fix build for syscall type change
  seccomp: fix poor type promotion
  samples: add an example of seccomp user trap
  seccomp: add a return code to trap to userspace
  seccomp: switch system call argument type to void *
  seccomp: hoist struct seccomp_data recalculation higher

Diffstat:
MDocumentation/ioctl/ioctl-number.txt | 1+
MDocumentation/userspace-api/seccomp_filter.rst | 84+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/s390/kernel/compat_wrapper.c | 2+-
Minclude/linux/seccomp.h | 9+++++----
Minclude/linux/syscalls.h | 2+-
Minclude/uapi/linux/seccomp.h | 40+++++++++++++++++++++++++++++++++++++---
Mkernel/seccomp.c | 467++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Msamples/seccomp/.gitignore | 1+
Msamples/seccomp/Makefile | 7++++++-
Asamples/seccomp/user-trap.c | 375+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/testing/selftests/seccomp/seccomp_bpf.c | 447++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
11 files changed, 1411 insertions(+), 24 deletions(-)

diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt @@ -79,6 +79,7 @@ Code Seq#(hex) Include File Comments 0x1b all InfiniBand Subsystem <http://infiniband.sourceforge.net/> 0x20 all drivers/cdrom/cm206.h 0x22 all scsi/sg.h +'!' 00-1F uapi/linux/seccomp.h '#' 00-3F IEEE 1394 Subsystem Block for the entire subsystem '$' 00-0F linux/perf_counter.h, linux/perf_event.h '%' 00-0F include/uapi/linux/stm.h diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst @@ -122,6 +122,11 @@ In precedence order, they are: Results in the lower 16-bits of the return value being passed to userland as the errno without executing the system call. +``SECCOMP_RET_USER_NOTIF``: + Results in a ``struct seccomp_notif`` message sent on the userspace + notification fd, if it is attached, or ``-ENOSYS`` if it is not. See below + on discussion of how to handle user notifications. + ``SECCOMP_RET_TRACE``: When returned, this value will cause the kernel to attempt to notify a ``ptrace()``-based tracer prior to executing the system @@ -183,6 +188,85 @@ The ``samples/seccomp/`` directory contains both an x86-specific example and a more generic example of a higher level macro interface for BPF program generation. +Userspace Notification +====================== + +The ``SECCOMP_RET_USER_NOTIF`` return code lets seccomp filters pass a +particular syscall to userspace to be handled. This may be useful for +applications like container managers, which wish to intercept particular +syscalls (``mount()``, ``finit_module()``, etc.) and change their behavior. + +To acquire a notification FD, use the ``SECCOMP_FILTER_FLAG_NEW_LISTENER`` +argument to the ``seccomp()`` syscall: + +.. code-block:: c + + fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); + +which (on success) will return a listener fd for the filter, which can then be +passed around via ``SCM_RIGHTS`` or similar. Note that filter fds correspond to +a particular filter, and not a particular task. So if this task then forks, +notifications from both tasks will appear on the same filter fd. Reads and +writes to/from a filter fd are also synchronized, so a filter fd can safely +have many readers. + +The interface for a seccomp notification fd consists of two structures: + +.. code-block:: c + + struct seccomp_notif_sizes { + __u16 seccomp_notif; + __u16 seccomp_notif_resp; + __u16 seccomp_data; + }; + + struct seccomp_notif { + __u64 id; + __u32 pid; + __u32 flags; + struct seccomp_data data; + }; + + struct seccomp_notif_resp { + __u64 id; + __s64 val; + __s32 error; + __u32 flags; + }; + +The ``struct seccomp_notif_sizes`` structure can be used to determine the size +of the various structures used in seccomp notifications. The size of ``struct +seccomp_data`` may change in the future, so code should use: + +.. code-block:: c + + struct seccomp_notif_sizes sizes; + seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes); + +to determine the size of the various structures to allocate. See +samples/seccomp/user-trap.c for an example. + +Users can read via ``ioctl(SECCOMP_IOCTL_NOTIF_RECV)`` (or ``poll()``) on a +seccomp notification fd to receive a ``struct seccomp_notif``, which contains +five members: the input length of the structure, a unique-per-filter ``id``, +the ``pid`` of the task which triggered this request (which may be 0 if the +task is in a pid ns not visible from the listener's pid namespace), a ``flags`` +member which for now only has ``SECCOMP_NOTIF_FLAG_SIGNALED``, representing +whether or not the notification is a result of a non-fatal signal, and the +``data`` passed to seccomp. Userspace can then make a decision based on this +information about what to do, and ``ioctl(SECCOMP_IOCTL_NOTIF_SEND)`` a +response, indicating what should be returned to userspace. The ``id`` member of +``struct seccomp_notif_resp`` should be the same ``id`` as in ``struct +seccomp_notif``. + +It is worth noting that ``struct seccomp_data`` contains the values of register +arguments to the syscall, but does not contain pointers to memory. The task's +memory is accessible to suitably privileged traces via ``ptrace()`` or +``/proc/pid/mem``. However, care should be taken to avoid the TOCTOU mentioned +above in this document: all arguments being read from the tracee's memory +should be read into the tracer's memory before any policy decisions are made. +This allows for an atomic decision on syscall arguments. + Sysctls ======= diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c @@ -164,7 +164,7 @@ COMPAT_SYSCALL_WRAP3(finit_module, int, fd, const char __user *, uargs, int, fla COMPAT_SYSCALL_WRAP3(sched_setattr, pid_t, pid, struct sched_attr __user *, attr, unsigned int, flags); COMPAT_SYSCALL_WRAP4(sched_getattr, pid_t, pid, struct sched_attr __user *, attr, unsigned int, size, unsigned int, flags); COMPAT_SYSCALL_WRAP5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags); -COMPAT_SYSCALL_WRAP3(seccomp, unsigned int, op, unsigned int, flags, const char __user *, uargs) +COMPAT_SYSCALL_WRAP3(seccomp, unsigned int, op, unsigned int, flags, void __user *, uargs) COMPAT_SYSCALL_WRAP3(getrandom, char __user *, buf, size_t, count, unsigned int, flags) COMPAT_SYSCALL_WRAP2(memfd_create, const char __user *, uname, unsigned int, flags) COMPAT_SYSCALL_WRAP3(bpf, int, cmd, union bpf_attr *, attr, unsigned int, size); diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h @@ -4,9 +4,10 @@ #include <uapi/linux/seccomp.h> -#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ - SECCOMP_FILTER_FLAG_LOG | \ - SECCOMP_FILTER_FLAG_SPEC_ALLOW) +#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ + SECCOMP_FILTER_FLAG_LOG | \ + SECCOMP_FILTER_FLAG_SPEC_ALLOW | \ + SECCOMP_FILTER_FLAG_NEW_LISTENER) #ifdef CONFIG_SECCOMP @@ -43,7 +44,7 @@ extern void secure_computing_strict(int this_syscall); #endif extern long prctl_get_seccomp(void); -extern long prctl_set_seccomp(unsigned long, char __user *); +extern long prctl_set_seccomp(unsigned long, void __user *); static inline int seccomp_mode(struct seccomp *s) { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h @@ -898,7 +898,7 @@ asmlinkage long sys_renameat2(int olddfd, const char __user *oldname, int newdfd, const char __user *newname, unsigned int flags); asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, - const char __user *uargs); + void __user *uargs); asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h @@ -15,11 +15,13 @@ #define SECCOMP_SET_MODE_STRICT 0 #define SECCOMP_SET_MODE_FILTER 1 #define SECCOMP_GET_ACTION_AVAIL 2 +#define SECCOMP_GET_NOTIF_SIZES 3 /* Valid flags for SECCOMP_SET_MODE_FILTER */ -#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0) -#define SECCOMP_FILTER_FLAG_LOG (1UL << 1) -#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2) +#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0) +#define SECCOMP_FILTER_FLAG_LOG (1UL << 1) +#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2) +#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3) /* * All BPF programs must return a 32-bit value. @@ -35,6 +37,7 @@ #define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ +#define SECCOMP_RET_USER_NOTIF 0x7fc00000U /* notifies userspace */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ #define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ @@ -60,4 +63,35 @@ struct seccomp_data { __u64 args[6]; }; +struct seccomp_notif_sizes { + __u16 seccomp_notif; + __u16 seccomp_notif_resp; + __u16 seccomp_data; +}; + +struct seccomp_notif { + __u64 id; + __u32 pid; + __u32 flags; + struct seccomp_data data; +}; + +struct seccomp_notif_resp { + __u64 id; + __s64 val; + __s32 error; + __u32 flags; +}; + +#define SECCOMP_IOC_MAGIC '!' +#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr) +#define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type) +#define SECCOMP_IOW(nr, type) _IOW(SECCOMP_IOC_MAGIC, nr, type) +#define SECCOMP_IOWR(nr, type) _IOWR(SECCOMP_IOC_MAGIC, nr, type) + +/* Flags for seccomp notification fd ioctl. */ +#define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) +#define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ + struct seccomp_notif_resp) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c @@ -33,12 +33,74 @@ #endif #ifdef CONFIG_SECCOMP_FILTER +#include <linux/file.h> #include <linux/filter.h> #include <linux/pid.h> #include <linux/ptrace.h> #include <linux/security.h> #include <linux/tracehook.h> #include <linux/uaccess.h> +#include <linux/anon_inodes.h> + +enum notify_state { + SECCOMP_NOTIFY_INIT, + SECCOMP_NOTIFY_SENT, + SECCOMP_NOTIFY_REPLIED, +}; + +struct seccomp_knotif { + /* The struct pid of the task whose filter triggered the notification */ + struct task_struct *task; + + /* The "cookie" for this request; this is unique for this filter. */ + u64 id; + + /* + * The seccomp data. This pointer is valid the entire time this + * notification is active, since it comes from __seccomp_filter which + * eclipses the entire lifecycle here. + */ + const struct seccomp_data *data; + + /* + * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a + * struct seccomp_knotif is created and starts out in INIT. Once the + * handler reads the notification off of an FD, it transitions to SENT. + * If a signal is received the state transitions back to INIT and + * another message is sent. When the userspace handler replies, state + * transitions to REPLIED. + */ + enum notify_state state; + + /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */ + int error; + long val; + + /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ + struct completion ready; + + struct list_head list; +}; + +/** + * struct notification - container for seccomp userspace notifications. Since + * most seccomp filters will not have notification listeners attached and this + * structure is fairly large, we store the notification-specific stuff in a + * separate structure. + * + * @request: A semaphore that users of this notification can wait on for + * changes. Actual reads and writes are still controlled with + * filter->notify_lock. + * @next_id: The id of the next request. + * @notifications: A list of struct seccomp_knotif elements. + * @wqh: A wait queue for poll. + */ +struct notification { + struct semaphore request; + u64 next_id; + struct list_head notifications; + wait_queue_head_t wqh; +}; /** * struct seccomp_filter - container for seccomp BPF programs @@ -50,6 +112,8 @@ * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate + * @notif: the struct that holds all notification related information + * @notify_lock: A lock for all notification-related accesses. * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -66,6 +130,8 @@ struct seccomp_filter { bool log; struct seccomp_filter *prev; struct bpf_prog *prog; + struct notification *notif; + struct mutex notify_lock; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -188,7 +254,6 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) static u32 seccomp_run_filters(const struct seccomp_data *sd, struct seccomp_filter **match) { - struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = @@ -198,11 +263,6 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, if (WARN_ON(f == NULL)) return SECCOMP_RET_KILL_PROCESS; - if (!sd) { - populate_seccomp_data(&sd_local); - sd = &sd_local; - } - /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). @@ -392,6 +452,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) if (!sfilter) return ERR_PTR(-ENOMEM); + mutex_init(&sfilter->notify_lock); ret = bpf_prog_create_from_user(&sfilter->prog, fprog, seccomp_check_filter, save_orig); if (ret < 0) { @@ -485,7 +546,6 @@ static long seccomp_attach_filter(unsigned int flags, static void __get_seccomp_filter(struct seccomp_filter *filter) { - /* Reference count is bounded by the number of total processes. */ refcount_inc(&filter->usage); } @@ -556,11 +616,13 @@ static void seccomp_send_sigsys(int syscall, int reason) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) +#define SECCOMP_LOG_USER_NOTIF (1 << 7) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS | SECCOMP_LOG_KILL_THREAD | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | + SECCOMP_LOG_USER_NOTIF | SECCOMP_LOG_TRACE | SECCOMP_LOG_LOG; @@ -581,6 +643,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, case SECCOMP_RET_TRACE: log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; + case SECCOMP_RET_USER_NOTIF: + log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF; + break; case SECCOMP_RET_LOG: log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; @@ -652,12 +717,75 @@ void secure_computing_strict(int this_syscall) #else #ifdef CONFIG_SECCOMP_FILTER +static u64 seccomp_next_notify_id(struct seccomp_filter *filter) +{ + /* + * Note: overflow is ok here, the id just needs to be unique per + * filter. + */ + lockdep_assert_held(&filter->notify_lock); + return filter->notif->next_id++; +} + +static void seccomp_do_user_notification(int this_syscall, + struct seccomp_filter *match, + const struct seccomp_data *sd) +{ + int err; + long ret = 0; + struct seccomp_knotif n = {}; + + mutex_lock(&match->notify_lock); + err = -ENOSYS; + if (!match->notif) + goto out; + + n.task = current; + n.state = SECCOMP_NOTIFY_INIT; + n.data = sd; + n.id = seccomp_next_notify_id(match); + init_completion(&n.ready); + list_add(&n.list, &match->notif->notifications); + + up(&match->notif->request); + wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM); + mutex_unlock(&match->notify_lock); + + /* + * This is where we wait for a reply from userspace. + */ + err = wait_for_completion_interruptible(&n.ready); + mutex_lock(&match->notify_lock); + if (err == 0) { + ret = n.val; + err = n.error; + } + + /* + * Note that it's possible the listener died in between the time when + * we were notified of a respons (or a signal) and when we were able to + * re-acquire the lock, so only delete from the list if the + * notification actually exists. + * + * Also note that this test is only valid because there's no way to + * *reattach* to a notifier right now. If one is added, we'll need to + * keep track of the notif itself and make sure they match here. + */ + if (match->notif) + list_del(&n.list); +out: + mutex_unlock(&match->notify_lock); + syscall_set_return_value(current, task_pt_regs(current), + err, ret); +} + static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { u32 filter_ret, action; struct seccomp_filter *match = NULL; int data; + struct seccomp_data sd_local; /* * Make sure that any changes to mode from another thread have @@ -665,6 +793,11 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ rmb(); + if (!sd) { + populate_seccomp_data(&sd_local); + sd = &sd_local; + } + filter_ret = seccomp_run_filters(sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; @@ -728,6 +861,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; + case SECCOMP_RET_USER_NOTIF: + seccomp_do_user_notification(this_syscall, match, sd); + goto skip; + case SECCOMP_RET_LOG: seccomp_log(this_syscall, 0, action, true); return 0; @@ -834,6 +971,262 @@ out: } #ifdef CONFIG_SECCOMP_FILTER +static int seccomp_notify_release(struct inode *inode, struct file *file) +{ + struct seccomp_filter *filter = file->private_data; + struct seccomp_knotif *knotif; + + mutex_lock(&filter->notify_lock); + + /* + * If this file is being closed because e.g. the task who owned it + * died, let's wake everyone up who was waiting on us. + */ + list_for_each_entry(knotif, &filter->notif->notifications, list) { + if (knotif->state == SECCOMP_NOTIFY_REPLIED) + continue; + + knotif->state = SECCOMP_NOTIFY_REPLIED; + knotif->error = -ENOSYS; + knotif->val = 0; + + complete(&knotif->ready); + } + + kfree(filter->notif); + filter->notif = NULL; + mutex_unlock(&filter->notify_lock); + __put_seccomp_filter(filter); + return 0; +} + +static long seccomp_notify_recv(struct seccomp_filter *filter, + void __user *buf) +{ + struct seccomp_knotif *knotif = NULL, *cur; + struct seccomp_notif unotif; + ssize_t ret; + + memset(&unotif, 0, sizeof(unotif)); + + ret = down_interruptible(&filter->notif->request); + if (ret < 0) + return ret; + + mutex_lock(&filter->notify_lock); + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->state == SECCOMP_NOTIFY_INIT) { + knotif = cur; + break; + } + } + + /* + * If we didn't find a notification, it could be that the task was + * interrupted by a fatal signal between the time we were woken and + * when we were able to acquire the rw lock. + */ + if (!knotif) { + ret = -ENOENT; + goto out; + } + + unotif.id = knotif->id; + unotif.pid = task_pid_vnr(knotif->task); + unotif.data = *(knotif->data); + + knotif->state = SECCOMP_NOTIFY_SENT; + wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM); + ret = 0; +out: + mutex_unlock(&filter->notify_lock); + + if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) { + ret = -EFAULT; + + /* + * Userspace screwed up. To make sure that we keep this + * notification alive, let's reset it back to INIT. It + * may have died when we released the lock, so we need to make + * sure it's still around. + */ + knotif = NULL; + mutex_lock(&filter->notify_lock); + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->id == unotif.id) { + knotif = cur; + break; + } + } + + if (knotif) { + knotif->state = SECCOMP_NOTIFY_INIT; + up(&filter->notif->request); + } + mutex_unlock(&filter->notify_lock); + } + + return ret; +} + +static long seccomp_notify_send(struct seccomp_filter *filter, + void __user *buf) +{ + struct seccomp_notif_resp resp = {}; + struct seccomp_knotif *knotif = NULL, *cur; + long ret; + + if (copy_from_user(&resp, buf, sizeof(resp))) + return -EFAULT; + + if (resp.flags) + return -EINVAL; + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + return ret; + + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->id == resp.id) { + knotif = cur; + break; + } + } + + if (!knotif) { + ret = -ENOENT; + goto out; + } + + /* Allow exactly one reply. */ + if (knotif->state != SECCOMP_NOTIFY_SENT) { + ret = -EINPROGRESS; + goto out; + } + + ret = 0; + knotif->state = SECCOMP_NOTIFY_REPLIED; + knotif->error = resp.error; + knotif->val = resp.val; + complete(&knotif->ready); +out: + mutex_unlock(&filter->notify_lock); + return ret; +} + +static long seccomp_notify_id_valid(struct seccomp_filter *filter, + void __user *buf) +{ + struct seccomp_knotif *knotif = NULL; + u64 id; + long ret; + + if (copy_from_user(&id, buf, sizeof(id))) + return -EFAULT; + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + return ret; + + ret = -ENOENT; + list_for_each_entry(knotif, &filter->notif->notifications, list) { + if (knotif->id == id) { + if (knotif->state == SECCOMP_NOTIFY_SENT) + ret = 0; + goto out; + } + } + +out: + mutex_unlock(&filter->notify_lock); + return ret; +} + +static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct seccomp_filter *filter = file->private_data; + void __user *buf = (void __user *)arg; + + switch (cmd) { + case SECCOMP_IOCTL_NOTIF_RECV: + return seccomp_notify_recv(filter, buf); + case SECCOMP_IOCTL_NOTIF_SEND: + return seccomp_notify_send(filter, buf); + case SECCOMP_IOCTL_NOTIF_ID_VALID: + return seccomp_notify_id_valid(filter, buf); + default: + return -EINVAL; + } +} + +static __poll_t seccomp_notify_poll(struct file *file, + struct poll_table_struct *poll_tab) +{ + struct seccomp_filter *filter = file->private_data; + __poll_t ret = 0; + struct seccomp_knotif *cur; + + poll_wait(file, &filter->notif->wqh, poll_tab); + + if (mutex_lock_interruptible(&filter->notify_lock) < 0) + return EPOLLERR; + + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->state == SECCOMP_NOTIFY_INIT) + ret |= EPOLLIN | EPOLLRDNORM; + if (cur->state == SECCOMP_NOTIFY_SENT) + ret |= EPOLLOUT | EPOLLWRNORM; + if ((ret & EPOLLIN) && (ret & EPOLLOUT)) + break; + } + + mutex_unlock(&filter->notify_lock); + + return ret; +} + +static const struct file_operations seccomp_notify_ops = { + .poll = seccomp_notify_poll, + .release = seccomp_notify_release, + .unlocked_ioctl = seccomp_notify_ioctl, +}; + +static struct file *init_listener(struct seccomp_filter *filter) +{ + struct file *ret = ERR_PTR(-EBUSY); + struct seccomp_filter *cur; + + for (cur = current->seccomp.filter; cur; cur = cur->prev) { + if (cur->notif) + goto out; + } + + ret = ERR_PTR(-ENOMEM); + filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL); + if (!filter->notif) + goto out; + + sema_init(&filter->notif->request, 0); + filter->notif->next_id = get_random_u64(); + INIT_LIST_HEAD(&filter->notif->notifications); + init_waitqueue_head(&filter->notif->wqh); + + ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops, + filter, O_RDWR); + if (IS_ERR(ret)) + goto out_notif; + + /* The file has a reference to it now */ + __get_seccomp_filter(filter); + +out_notif: + if (IS_ERR(ret)) + kfree(filter->notif); +out: + return ret; +} + /** * seccomp_set_mode_filter: internal function for setting seccomp filter * @flags: flags to change filter behavior @@ -853,6 +1246,8 @@ static long seccomp_set_mode_filter(unsigned int flags, const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; struct seccomp_filter *prepared = NULL; long ret = -EINVAL; + int listener = -1; + struct file *listener_f = NULL; /* Validate flags. */ if (flags & ~SECCOMP_FILTER_FLAG_MASK) @@ -863,13 +1258,28 @@ static long seccomp_set_mode_filter(unsigned int flags, if (IS_ERR(prepared)) return PTR_ERR(prepared); + if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { + listener = get_unused_fd_flags(O_CLOEXEC); + if (listener < 0) { + ret = listener; + goto out_free; + } + + listener_f = init_listener(prepared); + if (IS_ERR(listener_f)) { + put_unused_fd(listener); + ret = PTR_ERR(listener_f); + goto out_free; + } + } + /* * Make sure we cannot change seccomp or nnp state via TSYNC * while another thread is in the middle of calling exec. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC && mutex_lock_killable(&current->signal->cred_guard_mutex)) - goto out_free; + goto out_put_fd; spin_lock_irq(&current->sighand->siglock); @@ -887,6 +1297,16 @@ out: spin_unlock_irq(&current->sighand->siglock); if (flags & SECCOMP_FILTER_FLAG_TSYNC) mutex_unlock(&current->signal->cred_guard_mutex); +out_put_fd: + if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { + if (ret < 0) { + fput(listener_f); + put_unused_fd(listener); + } else { + fd_install(listener, listener_f); + ret = listener; + } + } out_free: seccomp_filter_free(prepared); return ret; @@ -911,6 +1331,7 @@ static long seccomp_get_action_avail(const char __user *uaction) case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: + case SECCOMP_RET_USER_NOTIF: case SECCOMP_RET_TRACE: case SECCOMP_RET_LOG: case SECCOMP_RET_ALLOW: @@ -922,9 +1343,23 @@ static long seccomp_get_action_avail(const char __user *uaction) return 0; } +static long seccomp_get_notif_sizes(void __user *usizes) +{ + struct seccomp_notif_sizes sizes = { + .seccomp_notif = sizeof(struct seccomp_notif), + .seccomp_notif_resp = sizeof(struct seccomp_notif_resp), + .seccomp_data = sizeof(struct seccomp_data), + }; + + if (copy_to_user(usizes, &sizes, sizeof(sizes))) + return -EFAULT; + + return 0; +} + /* Common entry point for both prctl and syscall. */ static long do_seccomp(unsigned int op, unsigned int flags, - const char __user *uargs) + void __user *uargs) { switch (op) { case SECCOMP_SET_MODE_STRICT: @@ -938,13 +1373,18 @@ static long do_seccomp(unsigned int op, unsigned int flags, return -EINVAL; return seccomp_get_action_avail(uargs); + case SECCOMP_GET_NOTIF_SIZES: + if (flags != 0) + return -EINVAL; + + return seccomp_get_notif_sizes(uargs); default: return -EINVAL; } } SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, - const char __user *, uargs) + void __user *, uargs) { return do_seccomp(op, flags, uargs); } @@ -956,10 +1396,10 @@ SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, * * Returns 0 on success or -EINVAL on failure. */ -long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter) { unsigned int op; - char __user *uargs; + void __user *uargs; switch (seccomp_mode) { case SECCOMP_MODE_STRICT: @@ -1111,6 +1551,7 @@ long seccomp_get_metadata(struct task_struct *task, #define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" +#define SECCOMP_RET_USER_NOTIF_NAME "user_notif" #define SECCOMP_RET_TRACE_NAME "trace" #define SECCOMP_RET_LOG_NAME "log" #define SECCOMP_RET_ALLOW_NAME "allow" @@ -1120,6 +1561,7 @@ static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_THREAD_NAME " " SECCOMP_RET_TRAP_NAME " " SECCOMP_RET_ERRNO_NAME " " + SECCOMP_RET_USER_NOTIF_NAME " " SECCOMP_RET_TRACE_NAME " " SECCOMP_RET_LOG_NAME " " SECCOMP_RET_ALLOW_NAME; @@ -1134,6 +1576,7 @@ static const struct seccomp_log_name seccomp_log_names[] = { { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, + { SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME }, { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME }, { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore @@ -1,3 +1,4 @@ bpf-direct bpf-fancy dropper +user-trap diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 ifndef CROSS_COMPILE -hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct +hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct user-trap HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include @@ -16,6 +16,10 @@ HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include bpf-direct-objs := bpf-direct.o +HOSTCFLAGS_user-trap.o += -I$(objtree)/usr/include +HOSTCFLAGS_user-trap.o += -idirafter $(objtree)/include +user-trap-objs := user-trap.o + # Try to match the kernel target. ifndef CONFIG_64BIT @@ -33,6 +37,7 @@ HOSTCFLAGS_bpf-fancy.o += $(MFLAG) HOSTLDLIBS_bpf-direct += $(MFLAG) HOSTLDLIBS_bpf-fancy += $(MFLAG) HOSTLDLIBS_dropper += $(MFLAG) +HOSTLDLIBS_user-trap += $(MFLAG) endif always := $(hostprogs-m) endif diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c @@ -0,0 +1,375 @@ +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <string.h> +#include <stddef.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/user.h> +#include <sys/ioctl.h> +#include <sys/ptrace.h> +#include <sys/mount.h> +#include <linux/limits.h> +#include <linux/filter.h> +#include <linux/seccomp.h> + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + +static int seccomp(unsigned int op, unsigned int flags, void *args) +{ + errno = 0; + return syscall(__NR_seccomp, op, flags, args); +} + +static int send_fd(int sock, int fd) +{ + struct msghdr msg = {}; + struct cmsghdr *cmsg; + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c'; + struct iovec io = { + .iov_base = &c, + .iov_len = 1, + }; + + msg.msg_iov = &io; + msg.msg_iovlen = 1; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + *((int *)CMSG_DATA(cmsg)) = fd; + msg.msg_controllen = cmsg->cmsg_len; + + if (sendmsg(sock, &msg, 0) < 0) { + perror("sendmsg"); + return -1; + } + + return 0; +} + +static int recv_fd(int sock) +{ + struct msghdr msg = {}; + struct cmsghdr *cmsg; + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c'; + struct iovec io = { + .iov_base = &c, + .iov_len = 1, + }; + + msg.msg_iov = &io; + msg.msg_iovlen = 1; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + + if (recvmsg(sock, &msg, 0) < 0) { + perror("recvmsg"); + return -1; + } + + cmsg = CMSG_FIRSTHDR(&msg); + + return *((int *)CMSG_DATA(cmsg)); +} + +static int user_trap_syscall(int nr, unsigned int flags) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog); +} + +static int handle_req(struct seccomp_notif *req, + struct seccomp_notif_resp *resp, int listener) +{ + char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX]; + int ret = -1, mem; + + resp->id = req->id; + resp->error = -EPERM; + resp->val = 0; + + if (req->data.nr != __NR_mount) { + fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr); + return -1; + } + + /* Only allow bind mounts. */ + if (!(req->data.args[3] & MS_BIND)) + return 0; + + /* + * Ok, let's read the task's memory to see where they wanted their + * mount to go. + */ + snprintf(path, sizeof(path), "/proc/%d/mem", req->pid); + mem = open(path, O_RDONLY); + if (mem < 0) { + perror("open mem"); + return -1; + } + + /* + * Now we avoid a TOCTOU: we referred to a pid by its pid, but since + * the pid that made the syscall may have died, we need to confirm that + * the pid is still valid after we open its /proc/pid/mem file. We can + * ask the listener fd this as follows. + * + * Note that this check should occur *after* any task-specific + * resources are opened, to make sure that the task has not died and + * we're not wrongly reading someone else's state in order to make + * decisions. + */ + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) { + fprintf(stderr, "task died before we could map its memory\n"); + goto out; + } + + /* + * Phew, we've got the right /proc/pid/mem. Now we can read it. Note + * that to avoid another TOCTOU, we should read all of the pointer args + * before we decide to allow the syscall. + */ + if (lseek(mem, req->data.args[0], SEEK_SET) < 0) { + perror("seek"); + goto out; + } + + ret = read(mem, source, sizeof(source)); + if (ret < 0) { + perror("read"); + goto out; + } + + if (lseek(mem, req->data.args[1], SEEK_SET) < 0) { + perror("seek"); + goto out; + } + + ret = read(mem, target, sizeof(target)); + if (ret < 0) { + perror("read"); + goto out; + } + + /* + * Our policy is to only allow bind mounts inside /tmp. This isn't very + * interesting, because we could do unprivlieged bind mounts with user + * namespaces already, but you get the idea. + */ + if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) { + if (mount(source, target, NULL, req->data.args[3], NULL) < 0) { + ret = -1; + perror("actual mount"); + goto out; + } + resp->error = 0; + } + + /* Even if we didn't allow it because of policy, generating the + * response was be a success, because we want to tell the worker EPERM. + */ + ret = 0; + +out: + close(mem); + return ret; +} + +int main(void) +{ + int sk_pair[2], ret = 1, status, listener; + pid_t worker = 0 , tracer = 0; + + if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) { + perror("socketpair"); + return 1; + } + + worker = fork(); + if (worker < 0) { + perror("fork"); + goto close_pair; + } + + if (worker == 0) { + listener = user_trap_syscall(__NR_mount, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + if (listener < 0) { + perror("seccomp"); + exit(1); + } + + /* + * Drop privileges. We definitely can't mount as uid 1000. + */ + if (setuid(1000) < 0) { + perror("setuid"); + exit(1); + } + + /* + * Send the listener to the parent; also serves as + * synchronization. + */ + if (send_fd(sk_pair[1], listener) < 0) + exit(1); + close(listener); + + if (mkdir("/tmp/foo", 0755) < 0) { + perror("mkdir"); + exit(1); + } + + /* + * Try a bad mount just for grins. + */ + if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) { + fprintf(stderr, "huh? mounted /dev/sda?\n"); + exit(1); + } + + if (errno != EPERM) { + perror("bad error from mount"); + exit(1); + } + + /* + * Ok, we expect this one to succeed. + */ + if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) { + perror("mount"); + exit(1); + } + + exit(0); + } + + /* + * Get the listener from the child. + */ + listener = recv_fd(sk_pair[0]); + if (listener < 0) + goto out_kill; + + /* + * Fork a task to handle the requests. This isn't strictly necessary, + * but it makes the particular writing of this sample easier, since we + * can just wait ofr the tracee to exit and kill the tracer. + */ + tracer = fork(); + if (tracer < 0) { + perror("fork"); + goto out_kill; + } + + if (tracer == 0) { + struct seccomp_notif *req; + struct seccomp_notif_resp *resp; + struct seccomp_notif_sizes sizes; + + if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) { + perror("seccomp(GET_NOTIF_SIZES)"); + goto out_close; + } + + req = malloc(sizes.seccomp_notif); + if (!req) + goto out_close; + memset(req, 0, sizeof(*req)); + + resp = malloc(sizes.seccomp_notif_resp); + if (!resp) + goto out_req; + memset(resp, 0, sizeof(*resp)); + + while (1) { + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) { + perror("ioctl recv"); + goto out_resp; + } + + if (handle_req(req, resp, listener) < 0) + goto out_resp; + + /* + * ENOENT here means that the task may have gotten a + * signal and restarted the syscall. It's up to the + * handler to decide what to do in this case, but for + * the sample code, we just ignore it. Probably + * something better should happen, like undoing the + * mount, or keeping track of the args to make sure we + * don't do it again. + */ + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 && + errno != ENOENT) { + perror("ioctl send"); + goto out_resp; + } + } +out_resp: + free(resp); +out_req: + free(req); +out_close: + close(listener); + exit(1); + } + + close(listener); + + if (waitpid(worker, &status, 0) != worker) { + perror("waitpid"); + goto out_kill; + } + + if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) { + perror("umount2"); + goto out_kill; + } + + if (remove("/tmp/foo") < 0 && errno != ENOENT) { + perror("remove"); + exit(1); + } + + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + fprintf(stderr, "worker exited nonzero\n"); + goto out_kill; + } + + ret = 0; + +out_kill: + if (tracer > 0) + kill(tracer, SIGKILL); + if (worker > 0) + kill(worker, SIGKILL); + +close_pair: + close(sk_pair[0]); + close(sk_pair[1]); + return ret; +} diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -5,6 +5,7 @@ * Test code for seccomp bpf. */ +#define _GNU_SOURCE #include <sys/types.h> /* @@ -40,10 +41,12 @@ #include <sys/fcntl.h> #include <sys/mman.h> #include <sys/times.h> +#include <sys/socket.h> +#include <sys/ioctl.h> -#define _GNU_SOURCE #include <unistd.h> #include <sys/syscall.h> +#include <poll.h> #include "../kselftest_harness.h" @@ -133,6 +136,10 @@ struct seccomp_data { #define SECCOMP_GET_ACTION_AVAIL 2 #endif +#ifndef SECCOMP_GET_NOTIF_SIZES +#define SECCOMP_GET_NOTIF_SIZES 3 +#endif + #ifndef SECCOMP_FILTER_FLAG_TSYNC #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0) #endif @@ -154,6 +161,44 @@ struct seccomp_metadata { }; #endif +#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER +#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3) + +#define SECCOMP_RET_USER_NOTIF 0x7fc00000U + +#define SECCOMP_IOC_MAGIC '!' +#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr) +#define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type) +#define SECCOMP_IOW(nr, type) _IOW(SECCOMP_IOC_MAGIC, nr, type) +#define SECCOMP_IOWR(nr, type) _IOWR(SECCOMP_IOC_MAGIC, nr, type) + +/* Flags for seccomp notification fd ioctl. */ +#define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) +#define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ + struct seccomp_notif_resp) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) + +struct seccomp_notif { + __u64 id; + __u32 pid; + __u32 flags; + struct seccomp_data data; +}; + +struct seccomp_notif_resp { + __u64 id; + __s64 val; + __s32 error; + __u32 flags; +}; + +struct seccomp_notif_sizes { + __u16 seccomp_notif; + __u16 seccomp_notif_resp; + __u16 seccomp_data; +}; +#endif + #ifndef seccomp int seccomp(unsigned int op, unsigned int flags, void *args) { @@ -2077,7 +2122,8 @@ TEST(detect_seccomp_filter_flags) { unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_LOG, - SECCOMP_FILTER_FLAG_SPEC_ALLOW }; + SECCOMP_FILTER_FLAG_SPEC_ALLOW, + SECCOMP_FILTER_FLAG_NEW_LISTENER }; unsigned int flag, all_flags; int i; long ret; @@ -2938,6 +2984,403 @@ skip: ASSERT_EQ(0, kill(pid, SIGKILL)); } +static int user_trap_syscall(int nr, unsigned int flags) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog); +} + +#define USER_NOTIF_MAGIC 116983961184613L +TEST(user_notification_basic) +{ + pid_t pid; + long ret; + int status, listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + struct pollfd pollfd; + + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + pid = fork(); + ASSERT_GE(pid, 0); + + /* Check that we get -ENOSYS with no listener attached */ + if (pid == 0) { + if (user_trap_syscall(__NR_getpid, 0) < 0) + exit(1); + ret = syscall(__NR_getpid); + exit(ret >= 0 || errno != ENOSYS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + /* Add some no-op filters so for grins. */ + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + + /* Check that the basic notification machinery works */ + listener = user_trap_syscall(__NR_getpid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + EXPECT_GE(listener, 0); + + /* Installing a second listener in the chain should EBUSY */ + EXPECT_EQ(user_trap_syscall(__NR_getpid, + SECCOMP_FILTER_FLAG_NEW_LISTENER), + -1); + EXPECT_EQ(errno, EBUSY); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ret = syscall(__NR_getpid); + exit(ret != USER_NOTIF_MAGIC); + } + + pollfd.fd = listener; + pollfd.events = POLLIN | POLLOUT; + + EXPECT_GT(poll(&pollfd, 1, -1), 0); + EXPECT_EQ(pollfd.revents, POLLIN); + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + + pollfd.fd = listener; + pollfd.events = POLLIN | POLLOUT; + + EXPECT_GT(poll(&pollfd, 1, -1), 0); + EXPECT_EQ(pollfd.revents, POLLOUT); + + EXPECT_EQ(req.data.nr, __NR_getpid); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + + /* check that we make sure flags == 0 */ + resp.flags = 1; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1); + EXPECT_EQ(errno, EINVAL); + + resp.flags = 0; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(user_notification_kill_in_middle) +{ + pid_t pid; + long ret; + int listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + listener = user_trap_syscall(__NR_getpid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + EXPECT_GE(listener, 0); + + /* + * Check that nothing bad happens when we kill the task in the middle + * of a syscall. + */ + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ret = syscall(__NR_getpid); + exit(ret != USER_NOTIF_MAGIC); + } + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0); + + EXPECT_EQ(kill(pid, SIGKILL), 0); + EXPECT_EQ(waitpid(pid, NULL, 0), pid); + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1); + + resp.id = req.id; + ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp); + EXPECT_EQ(ret, -1); + EXPECT_EQ(errno, ENOENT); +} + +static int handled = -1; + +static void signal_handler(int signal) +{ + if (write(handled, "c", 1) != 1) + perror("write from signal"); +} + +TEST(user_notification_signal) +{ + pid_t pid; + long ret; + int status, listener, sk_pair[2]; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + char c; + + ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0); + + listener = user_trap_syscall(__NR_gettid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + EXPECT_GE(listener, 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(sk_pair[0]); + handled = sk_pair[1]; + if (signal(SIGUSR1, signal_handler) == SIG_ERR) { + perror("signal"); + exit(1); + } + /* + * ERESTARTSYS behavior is a bit hard to test, because we need + * to rely on a signal that has not yet been handled. Let's at + * least check that the error code gets propagated through, and + * hope that it doesn't break when there is actually a signal :) + */ + ret = syscall(__NR_gettid); + exit(!(ret == -1 && errno == 512)); + } + + close(sk_pair[1]); + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + + EXPECT_EQ(kill(pid, SIGUSR1), 0); + + /* + * Make sure the signal really is delivered, which means we're not + * stuck in the user notification code any more and the notification + * should be dead. + */ + EXPECT_EQ(read(sk_pair[0], &c, 1), 1); + + resp.id = req.id; + resp.error = -EPERM; + resp.val = 0; + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1); + EXPECT_EQ(errno, ENOENT); + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + + resp.id = req.id; + resp.error = -512; /* -ERESTARTSYS */ + resp.val = 0; + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(user_notification_closed_listener) +{ + pid_t pid; + long ret; + int status, listener; + + listener = user_trap_syscall(__NR_getpid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + EXPECT_GE(listener, 0); + + /* + * Check that we get an ENOSYS when the listener is closed. + */ + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) { + close(listener); + ret = syscall(__NR_getpid); + exit(ret != -1 && errno != ENOSYS); + } + + close(listener); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +/* + * Check that a pid in a child namespace still shows up as valid in ours. + */ +TEST(user_notification_child_pid_ns) +{ + pid_t pid; + int status, listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + ASSERT_EQ(unshare(CLONE_NEWPID), 0); + + listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) + exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC); + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + EXPECT_EQ(req.pid, pid); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + close(listener); +} + +/* + * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e. + * invalid. + */ +TEST(user_notification_sibling_pid_ns) +{ + pid_t pid, pid2; + int status, listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ASSERT_EQ(unshare(CLONE_NEWPID), 0); + + pid2 = fork(); + ASSERT_GE(pid2, 0); + + if (pid2 == 0) + exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC); + + EXPECT_EQ(waitpid(pid2, &status, 0), pid2); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + exit(WEXITSTATUS(status)); + } + + /* Create the sibling ns, and sibling in it. */ + EXPECT_EQ(unshare(CLONE_NEWPID), 0); + EXPECT_EQ(errno, 0); + + pid2 = fork(); + EXPECT_GE(pid2, 0); + + if (pid2 == 0) { + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + /* + * The pid should be 0, i.e. the task is in some namespace that + * we can't "see". + */ + ASSERT_EQ(req.pid, 0); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + exit(0); + } + + close(listener); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + EXPECT_EQ(waitpid(pid2, &status, 0), pid2); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(user_notification_fault_recv) +{ + pid_t pid; + int status, listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) + exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC); + + /* Do a bad recv() */ + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1); + EXPECT_EQ(errno, EFAULT); + + /* We should still be able to receive this notification, though. */ + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + EXPECT_EQ(req.pid, pid); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(seccomp_get_notif_sizes) +{ + struct seccomp_notif_sizes sizes; + + EXPECT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0); + EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif)); + EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp)); +} + /* * TODO: * - add microbenchmarks