whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit b50be48c144088f2ab152ea861ba5979bbd65cc9
parent 9e5de623a0cb9374bdcc73c0c098818f0d7ab7e9
Author: Louis Solofrizzo <louis@ne02ptzero.me>
Date:   Mon, 22 Apr 2019 15:21:06 +0200

ukl: Add basic UKL configuration & musl mock to the kernel

This patch introduces several things. First of all, I added the ukl/
subdirectory to the tree, which contains a musl (libc) mock for future
UKL applications. For the moment, I added 4 syscalls: open, read,
write and close. Apart from the O_CLOEXEC option on open, they should
behave normally.
I also added a new boot option, ukl. When this option is present, and
the kernel is compiled in UKL mode, no userspace init is launched, and
the main of the UKL program is run instead.

One should be able to run a little test program with this patch:

    #include <unistd.h>
    #include <fcntl.h>

    #ifndef USERSPACE
    # define main ukl_main
    #endif

    /*
     * Copy /etc/passwd to stdout (fd 1).
     *
     * Returns 0 on success, 1 if the file cannot be opened or a read fails.
     */
    int main(void) {
        char        buf[256];
        int         fd = open("/etc/passwd", O_RDONLY);
        ssize_t     ret;

        if (fd == -1)
            return 1;

        /*
         * Stop on end-of-file (0) as well as on error (negative value).
         * write() is given an explicit length, so the buffer does not need
         * a NUL terminator; terminating at buf[ret] would overflow the
         * buffer whenever read() fills it completely.
         */
        while ((ret = read(fd, buf, sizeof(buf))) > 0)
            write(1, buf, ret);

        close(fd);

        return ret < 0 ? 1 : 0;
    }

Compilation is handled the same way as for kernel modules:

    export ukl-obj-m := main.o

    all:
    	make -C ../whiterose/ UKL=$(PWD) bzImage

Where 'whiterose' is the kernel tree.
One can then run this kernel under qemu with:

    qemu-system-x86_64 -enable-kvm -m 1G -s \
        -kernel ../whiterose/arch/x86/boot/bzImage \
        -append "console=ttyS0 root=/dev/sda ukl quiet" \
        -nographic -hda qemu-image.img

If the disk image has an /etc/passwd file, its contents should be printed
on the console.

Signed-off-by: Louis Solofrizzo <louis@ne02ptzero.me>

Diffstat:
MKconfig | 2++
MMakefile | 3++-
Ainclude/ukl/fcntl.h | 9+++++++++
Ainclude/ukl/unistd.h | 10++++++++++
Minit/main.c | 39++++++++++++++++++++++++++++++++++++---
Aukl/Kconfig | 3+++
Aukl/Makefile | 2++
Aukl/TO_PORT | 333+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aukl/fcntl/open.c | 34++++++++++++++++++++++++++++++++++
Aukl/unistd/close.c | 7+++++++
Aukl/unistd/read.c | 21+++++++++++++++++++++
Aukl/unistd/write.c | 111+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
12 files changed, 570 insertions(+), 4 deletions(-)

diff --git a/Kconfig b/Kconfig @@ -30,3 +30,5 @@ source "crypto/Kconfig" source "lib/Kconfig" source "lib/Kconfig.debug" + +source "ukl/Kconfig" diff --git a/Makefile b/Makefile @@ -437,6 +437,7 @@ LINUXINCLUDE := \ -I$(objtree)/arch/$(SRCARCH)/include/generated \ $(if $(KBUILD_SRC), -I$(srctree)/include) \ -I$(objtree)/include \ + -I$(srctree)/include/ukl \ $(USERINCLUDE) KBUILD_AFLAGS := -D__ASSEMBLY__ -fno-PIE @@ -969,7 +970,7 @@ endif PHONY += prepare0 ifeq ($(KBUILD_EXTMOD),) -core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ +core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ ukl/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff --git a/include/ukl/fcntl.h b/include/ukl/fcntl.h @@ -0,0 +1,9 @@ +#ifndef FCNTL_H +#define FCNTL_H + +#include <stdarg.h> +#include <linux/syscalls.h> + +int open(const char *filename, int flags, ...); + +#endif /* FCNTL_H */ diff --git a/include/ukl/unistd.h b/include/ukl/unistd.h @@ -0,0 +1,10 @@ +#ifndef UNISTD_H +#define UNISTD_H + +#include <linux/kernel.h> + +ssize_t write(int, const void *, size_t); +ssize_t read(int, void *, size_t); +int close(int); + +#endif /* UNISTD_H */ diff --git a/init/main.c b/init/main.c @@ -347,6 +347,16 @@ static int __init init_setup(char *str) } __setup("init=", init_setup); +#ifdef CONFIG_UKL_LINUX +bool ukl_mode = false; +static int __init ukl_kernel(char *str) +{ + ukl_mode = true; + return 1; +} +early_param("ukl", ukl_kernel); +#endif /* CONFIG_UKL_LINUX */ + static int __init rdinit_setup(char *str) { unsigned int i; @@ -1004,13 +1014,27 @@ static void __init do_pre_smp_initcalls(void) do_one_initcall(initcall_from_entry(fn)); } +#ifdef CONFIG_UKL_LINUX +extern int ukl_main(void); + +static int run_ukl_main(void) +{ + printk("Launching Unikernel...\n"); + kthread_run((void*)ukl_main, NULL, "UKL"); + while (1) + cond_resched(); + return 0; +} +#endif /* CONFIG_UKL_LINUX */ + static int 
run_init_process(const char *init_filename) { argv_init[0] = init_filename; pr_info("Run %s as init process\n", init_filename); - return do_execve(getname_kernel(init_filename), - (const char __user *const __user *)argv_init, - (const char __user *const __user *)envp_init); + + return do_execve(getname_kernel(init_filename), + (const char __user *const __user *)argv_init, + (const char __user *const __user *)envp_init); } static int try_to_run_init_process(const char *init_filename) @@ -1083,6 +1107,15 @@ static int __ref kernel_init(void *unused) rcu_end_inkernel_boot(); +#ifdef CONFIG_UKL_LINUX + if (ukl_mode) + { + run_ukl_main(); + return 0; + } + +#endif /* CONFIG_UKL_LINUX */ + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) diff --git a/ukl/Kconfig b/ukl/Kconfig @@ -0,0 +1,3 @@ +config UKL_LINUX + bool "Compile with UKL support" + def_bool y diff --git a/ukl/Makefile b/ukl/Makefile @@ -0,0 +1,2 @@ +ukl-real-obj-m := $(addprefix ../../../../../../../..$(UKL)/,$(ukl-obj-m)) +obj-y := unistd/write.o unistd/read.o unistd/close.o fcntl/open.o $(ukl-real-obj-m) diff --git a/ukl/TO_PORT b/ukl/TO_PORT @@ -0,0 +1,333 @@ +Syscalls to port: +================ +- stat +- fstat +- lstat +- poll +- lseek +- mmap +- mprotect +- munmap +- brk +- rt_sigaction +- rt_sigprocmask +- rt_sigreturn +- ioctl +- pread64 +- pwrite64 +- readv +- writev +- access +- pipe +- select +- sched_yield +- mremap +- msync +- mincore +- madvise +- shmget +- shmat +- shmctl +- dup +- dup2 +- pause +- nanosleep +- getitimer +- alarm +- setitimer +- getpid +- sendfile +- socket +- connect +- accept +- sendto +- recvfrom +- sendmsg +- recvmsg +- shutdown +- bind +- listen +- getsockname +- getpeername +- socketpair +- setsockopt +- getsockopt +- clone +- fork +- vfork +- execve +- exit +- wait4 +- kill +- uname +- semget +- semop +- semctl +- shmdt +- msgget +- msgsnd +- msgrcv +- msgctl +- fcntl +- flock +- fsync +- fdatasync +- truncate +- ftruncate +- 
getdents +- getcwd +- chdir +- fchdir +- rename +- mkdir +- rmdir +- creat +- link +- unlink +- symlink +- readlink +- chmod +- fchmod +- chown +- fchown +- lchown +- umask +- gettimeofday +- getrlimit +- getrusage +- sysinfo +- times +- ptrace +- getuid +- syslog +- getgid +- setuid +- setgid +- geteuid +- getegid +- setpgid +- getppid +- getpgrp +- setsid +- setreuid +- setregid +- getgroups +- setgroups +- setresuid +- getresuid +- setresgid +- getresgid +- getpgid +- setfsuid +- setfsgid +- getsid +- capget +- capset +- rt_sigpending +- rt_sigtimedwait +- rt_sigqueueinfo +- rt_sigsuspend +- sigaltstack +- utime +- mknod +- uselib +- personality +- ustat +- statfs +- fstatfs +- sysfs +- getpriority +- setpriority +- sched_setparam +- sched_getparam +- sched_setscheduler +- sched_getscheduler +- sched_get_priority_max +- sched_get_priority_min +- sched_rr_get_interval +- mlock +- munlock +- mlockall +- munlockall +- vhangup +- modify_ldt +- pivot_root +- _sysctl +- prctl +- arch_prctl +- adjtimex +- setrlimit +- chroot +- sync +- acct +- settimeofday +- mount +- umount2 +- swapon +- swapoff +- reboot +- sethostname +- setdomainname +- iopl +- ioperm +- create_module +- init_module +- delete_module +- get_kernel_syms +- query_module +- quotactl +- nfsservctl +- getpmsg +- putpmsg +- afs_syscall +- tuxcall +- security +- gettid +- readahead +- setxattr +- lsetxattr +- fsetxattr +- getxattr +- lgetxattr +- fgetxattr +- listxattr +- llistxattr +- flistxattr +- removexattr +- lremovexattr +- fremovexattr +- tkill +- time +- futex +- sched_setaffinity +- sched_getaffinity +- set_thread_area +- io_setup +- io_destroy +- io_getevents +- io_submit +- io_cancel +- get_thread_area +- lookup_dcookie +- epoll_create +- epoll_ctl_old +- epoll_wait_old +- remap_file_pages +- getdents64 +- set_tid_address +- restart_syscall +- semtimedop +- fadvise64 +- timer_create +- timer_settime +- timer_gettime +- timer_getoverrun +- timer_delete +- clock_settime +- clock_gettime +- 
clock_getres +- clock_nanosleep +- exit_group +- epoll_wait +- epoll_ctl +- tgkill +- utimes +- vserver +- mbind +- set_mempolicy +- get_mempolicy +- mq_open +- mq_unlink +- mq_timedsend +- mq_timedreceive +- mq_notify +- mq_getsetattr +- kexec_load +- waitid +- add_key +- request_key +- keyctl +- ioprio_set +- ioprio_get +- inotify_init +- inotify_add_watch +- inotify_rm_watch +- migrate_pages +- openat +- mkdirat +- mknodat +- fchownat +- futimesat +- newfstatat +- unlinkat +- renameat +- linkat +- symlinkat +- readlinkat +- fchmodat +- faccessat +- pselect6 +- ppoll +- unshare +- set_robust_list +- get_robust_list +- splice +- tee +- sync_file_range +- vmsplice +- move_pages +- utimensat +- epoll_pwait +- signalfd +- timerfd_create +- eventfd +- fallocate +- timerfd_settime +- timerfd_gettime +- accept4 +- signalfd4 +- eventfd2 +- epoll_create1 +- dup3 +- pipe2 +- inotify_init1 +- preadv +- pwritev +- rt_tgsigqueueinfo +- perf_event_open +- recvmmsg +- fanotify_init +- fanotify_mark +- prlimit64 +- name_to_handle_at +- open_by_handle_at +- clock_adjtime +- syncfs +- sendmmsg +- setns +- getcpu +- process_vm_readv +- process_vm_writev +- kcmp +- finit_module +- sched_setattr +- sched_getattr +- renameat2 +- seccomp +- getrandom +- memfd_create +- kexec_file_load +- bpf +- execveat +- userfaultfd +- membarrier +- mlock2 +- copy_file_range +- preadv2 +- pwritev2 +- pkey_mprotect +- pkey_alloc +- pkey_free +- statx + + diff --git a/ukl/fcntl/open.c b/ukl/fcntl/open.c @@ -0,0 +1,34 @@ +#include <fcntl.h> +#include <linux/syscalls.h> + +#include <stdarg.h> + +int open(const char *filename, int flags, ...) 
+{ + umode_t mode = 0; + mm_segment_t old_fs; + int ret = -1; + + /* Parse the flags */ + if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) + { + va_list ap; + + va_start(ap, flags); + mode = va_arg(ap, umode_t); + va_end(ap); + } + + /* Make the call */ + ret = do_sys_open(AT_FDCWD, filename, flags, mode); + + /*if (fd >= 0 && (flags & O_CLOEXEC))*/ + /*fcntl(fd, F_SETFD, FD_CLOEXEC);*/ + + /** + * Little trick used to mock stdin, stdout and stderr for UKL + * Don't make any sense in kernel space, since a fd could very well + * be 0, so we start at 3. + */ + return ret + 3; +} diff --git a/ukl/unistd/close.c b/ukl/unistd/close.c @@ -0,0 +1,7 @@ +#include <unistd.h> +#include <linux/syscalls.h> + +int close(int fd) +{ + return ksys_close(fd); +} diff --git a/ukl/unistd/read.c b/ukl/unistd/read.c @@ -0,0 +1,21 @@ +#include <unistd.h> +#include <linux/syscalls.h> + +ssize_t read(int fd, void *buf, size_t count) +{ + /* We don't support pipe, stdin, stdout or stderr reading */ + if (fd >= 0 && fd <= 2) + { + printk(KERN_ERR "Unsupported read to %d\n", fd); + return 0; + } + + /** + * Little trick used to mock stdin, stdout and stderr for UKL + * Don't make any sense in kernel space, since a fd could very well + * be 0. 
+ */ + fd -= 3; + + return ksys_read(fd, buf, count); +} diff --git a/ukl/unistd/write.c b/ukl/unistd/write.c @@ -0,0 +1,111 @@ +#include <unistd.h> +#include <linux/syscalls.h> +#include <linux/console.h> + +static DEFINE_RAW_SPINLOCK(console_owner_lock); +static struct task_struct *console_owner; +static bool console_waiter; +#ifdef CONFIG_LOCKDEP +static struct lockdep_map console_owner_dep_map = { + .name = "console_owner" +}; +#endif /* CONFIG_LOCKDEP */ + +static void console_lock_spinning_enable(void) +{ + /* Lock, save the current thread as the console owner, and unlock */ + raw_spin_lock(&console_owner_lock); + console_owner = current; + raw_spin_unlock(&console_owner_lock); + + /* Take the console lock */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +} + +static int console_lock_spinning_disable_and_check(void) +{ + int waiter; + + /* Lock, atomically set the waiter, reset the console owner, and unlock */ + raw_spin_lock(&console_owner_lock); + waiter = READ_ONCE(console_waiter); + console_owner = NULL; + raw_spin_unlock(&console_owner_lock); + + /* No waiters, release the spin lock */ + if (!waiter) + { + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + return 0; + } + + /* The waiter is now free to continue */ + WRITE_ONCE(console_waiter, false); + + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + + /* + * Hand off console_lock to waiter. The waiter will perform + * the up(). After this, the waiter is the console_lock owner. 
+ */ + mutex_release(&console_lock_dep_map, 1, _THIS_IP_); + + return 1; +} + +static void write_to_console(const char *buf, size_t count) +{ + struct console *con; + + /* No console drives to speak of */ + if (!console_drivers) + return; + + /* For each known console */ + for_each_console(con) + { + /* Console is not enabled */ + if (!(con->flags & CON_ENABLED)) + continue; + + /* Cannot write to this console */ + if (!con->write) + continue; + + /* CPU is not ready */ + if (!cpu_online(smp_processor_id()) && + !(con->flags & CON_ANYTIME)) + continue; + else + con->write(con, buf, count); + } +} + +ssize_t write(int fd, const void *buf, size_t count) +{ + /* We don't support pipe writing in KSpace */ + if (fd == 0) + { + printk(KERN_WARNING "Attempt to write to fd 0!\n"); + return 0; + } + + /** + * Writing to stdout or stderr, that don't exist in KSpace, + * so write them directly to the console. + */ + if (fd == 1 || fd == 2) + { + /* Take the console lock, write and release the lock */ + console_lock_spinning_enable(); + write_to_console(buf, count); + console_lock_spinning_disable_and_check(); + } + + /** + * Little trick used to mock stdin, stdout and stderr for UKL + * Don't make any sense in kernel space, since a fd could very well + * be 0. + */ + return ksys_write(fd - 3, buf, count); +}