whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 7b47a9e7c8f672b6fb0b77fca11a63a8a77f5a91
parent dbc2fba3fc46084f502aec53183995a632998dcd
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue, 12 Mar 2019 14:08:19 -0700

Merge branch 'work.mount' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull vfs mount infrastructure updates from Al Viro:
 "The rest of core infrastructure; no new syscalls in that pile, but the
  old parts are switched to new infrastructure. At that point
  conversions of individual filesystems can happen independently; some
  are done here (afs, cgroup, procfs, etc.), there's also a large series
  outside of that pile dealing with NFS (quite a bit of option-parsing
  stuff is getting used there - it's one of the most convoluted
  filesystems in terms of mount-related logics), but NFS bits are the
  next cycle fodder.

  It got seriously simplified since the last cycle; documentation is
  probably the weakest bit at the moment - I considered dropping the
  commit introducing Documentation/filesystems/mount_api.txt (cutting
  the size increase by quarter ;-), but decided that it would be better
  to fix it up after -rc1 instead.

  That pile allows to do followup work in independent branches, which
  should make life much easier for the next cycle. fs/super.c size
  increase is unpleasant; there's a followup series that allows to
  shrink it considerably, but I decided to leave that until the next
  cycle"

* 'work.mount' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (41 commits)
  afs: Use fs_context to pass parameters over automount
  afs: Add fs_context support
  vfs: Add some logging to the core users of the fs_context log
  vfs: Implement logging through fs_context
  vfs: Provide documentation for new mount API
  vfs: Remove kern_mount_data()
  hugetlbfs: Convert to fs_context
  cpuset: Use fs_context
  kernfs, sysfs, cgroup, intel_rdt: Support fs_context
  cgroup: store a reference to cgroup_ns into cgroup_fs_context
  cgroup1_get_tree(): separate "get cgroup_root to use" into a separate helper
  cgroup_do_mount(): massage calling conventions
  cgroup: stash cgroup_root reference into cgroup_fs_context
  cgroup2: switch to option-by-option parsing
  cgroup1: switch to option-by-option parsing
  cgroup: take options parsing into ->parse_monolithic()
  cgroup: fold cgroup1_mount() into cgroup1_get_tree()
  cgroup: start switching to fs_context
  ipc: Convert mqueue fs to fs_context
  proc: Add fs_context support to procfs
  ...

Diffstat:
ADocumentation/filesystems/mount_api.txt | 709+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/x86/kernel/cpu/resctrl/internal.h | 16++++++++++++++++
March/x86/kernel/cpu/resctrl/rdtgroup.c | 185+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mfs/Kconfig | 7+++++++
Mfs/Makefile | 2+-
Mfs/afs/internal.h | 9++++-----
Mfs/afs/mntpt.c | 149+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Mfs/afs/super.c | 430++++++++++++++++++++++++++++++++++++++++---------------------------------------
Mfs/afs/volume.c | 4++--
Mfs/filesystems.c | 4++++
Afs/fs_context.c | 642+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Afs/fs_parser.c | 447+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mfs/hugetlbfs/inode.c | 358++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Mfs/internal.h | 13++++++++++---
Mfs/kernfs/kernfs-internal.h | 1+
Mfs/kernfs/mount.c | 119+++++++++++++++++++++++++------------------------------------------------------
Mfs/mount.h | 5+++++
Mfs/namei.c | 4++--
Mfs/namespace.c | 395+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mfs/pnode.c | 5-----
Mfs/pnode.h | 3+--
Mfs/proc/inode.c | 52+---------------------------------------------------
Mfs/proc/internal.h | 5+----
Mfs/proc/root.c | 236++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
Mfs/super.c | 344+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
Mfs/sysfs/mount.c | 73++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
Minclude/linux/errno.h | 1+
Minclude/linux/fs.h | 14+++++++++++---
Ainclude/linux/fs_context.h | 188+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/linux/fs_parser.h | 151++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/linux/kernfs.h | 39++++++++++++++++++++-------------------
Minclude/linux/lsm_hooks.h | 21+++++++++++++++++++++
Minclude/linux/mount.h | 3+++
Minclude/linux/security.h | 18+++++++++++++++++-
Mipc/mqueue.c | 94++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
Mipc/namespace.c | 2+-
Mkernel/cgroup/cgroup-internal.h | 49++++++++++++++++++++++++++++++++-----------------
Mkernel/cgroup/cgroup-v1.c | 394++++++++++++++++++++++++++++++++++++++++++-------------------------------------
Mkernel/cgroup/cgroup.c | 223+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mkernel/cgroup/cpuset.c | 56++++++++++++++++++++++++++++++++++++++++++--------------
Msecurity/security.c | 10++++++++++
Msecurity/selinux/hooks.c | 88++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Msecurity/selinux/include/security.h | 10+++++-----
Msecurity/smack/smack.h | 19+++++--------------
Msecurity/smack/smack_lsm.c | 92++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
45 files changed, 4357 insertions(+), 1332 deletions(-)

diff --git a/Documentation/filesystems/mount_api.txt b/Documentation/filesystems/mount_api.txt @@ -0,0 +1,709 @@ + ==================== + FILESYSTEM MOUNT API + ==================== + +CONTENTS + + (1) Overview. + + (2) The filesystem context. + + (3) The filesystem context operations. + + (4) Filesystem context security. + + (5) VFS filesystem context operations. + + (6) Parameter description. + + (7) Parameter helper functions. + + +======== +OVERVIEW +======== + +The creation of new mounts is now to be done in a multistep process: + + (1) Create a filesystem context. + + (2) Parse the parameters and attach them to the context. Parameters are + expected to be passed individually from userspace, though legacy binary + parameters can also be handled. + + (3) Validate and pre-process the context. + + (4) Get or create a superblock and mountable root. + + (5) Perform the mount. + + (6) Return an error message attached to the context. + + (7) Destroy the context. + +To support this, the file_system_type struct gains a new field: + + int (*init_fs_context)(struct fs_context *fc); + +which is invoked to set up the filesystem-specific parts of a filesystem +context, including the additional space. + +Note that security initialisation is done *after* the filesystem is called so +that the namespaces may be adjusted first. + + +====================== +THE FILESYSTEM CONTEXT +====================== + +The creation and reconfiguration of a superblock is governed by a filesystem +context. This is represented by the fs_context structure: + + struct fs_context { + const struct fs_context_operations *ops; + struct file_system_type *fs_type; + void *fs_private; + struct dentry *root; + struct user_namespace *user_ns; + struct net *net_ns; + const struct cred *cred; + char *source; + char *subtype; + void *security; + void *s_fs_info; + unsigned int sb_flags; + unsigned int sb_flags_mask; + enum fs_context_purpose purpose:8; + bool sloppy:1; + bool silent:1; + ... + }; + +The fs_context fields are as follows: + + (*) const struct fs_context_operations *ops + + These are operations that can be done on a filesystem context (see + below). This must be set by the ->init_fs_context() file_system_type + operation. + + (*) struct file_system_type *fs_type + + A pointer to the file_system_type of the filesystem that is being + constructed or reconfigured. This retains a reference on the type owner. + + (*) void *fs_private + + A pointer to the file system's private data. This is where the filesystem + will need to store any options it parses. + + (*) struct dentry *root + + A pointer to the root of the mountable tree (and indirectly, the + superblock thereof). This is filled in by the ->get_tree() op. If this + is set, an active reference on root->d_sb must also be held. + + (*) struct user_namespace *user_ns + (*) struct net *net_ns + + There are a subset of the namespaces in use by the invoking process. They + retain references on each namespace. The subscribed namespaces may be + replaced by the filesystem to reflect other sources, such as the parent + mount superblock on an automount. + + (*) const struct cred *cred + + The mounter's credentials. This retains a reference on the credentials. + + (*) char *source + + This specifies the source. It may be a block device (e.g. /dev/sda1) or + something more exotic, such as the "host:/path" that NFS desires. + + (*) char *subtype + + This is a string to be added to the type displayed in /proc/mounts to + qualify it (used by FUSE). This is available for the filesystem to set if + desired. + + (*) void *security + + A place for the LSMs to hang their security data for the superblock. The + relevant security operations are described below. + + (*) void *s_fs_info + + The proposed s_fs_info for a new superblock, set in the superblock by + sget_fc(). This can be used to distinguish superblocks. + + (*) unsigned int sb_flags + (*) unsigned int sb_flags_mask + + Which bits SB_* flags are to be set/cleared in super_block::s_flags. + + (*) enum fs_context_purpose + + This indicates the purpose for which the context is intended. The + available values are: + + FS_CONTEXT_FOR_MOUNT, -- New superblock for explicit mount + FS_CONTEXT_FOR_SUBMOUNT -- New automatic submount of extant mount + FS_CONTEXT_FOR_RECONFIGURE -- Change an existing mount + + (*) bool sloppy + (*) bool silent + + These are set if the sloppy or silent mount options are given. + + [NOTE] sloppy is probably unnecessary when userspace passes over one + option at a time since the error can just be ignored if userspace deems it + to be unimportant. + + [NOTE] silent is probably redundant with sb_flags & SB_SILENT. + +The mount context is created by calling vfs_new_fs_context() or +vfs_dup_fs_context() and is destroyed with put_fs_context(). Note that the +structure is not refcounted. + +VFS, security and filesystem mount options are set individually with +vfs_parse_mount_option(). Options provided by the old mount(2) system call as +a page of data can be parsed with generic_parse_monolithic(). + +When mounting, the filesystem is allowed to take data from any of the pointers +and attach it to the superblock (or whatever), provided it clears the pointer +in the mount context. + +The filesystem is also allowed to allocate resources and pin them with the +mount context. For instance, NFS might pin the appropriate protocol version +module. + + +================================= +THE FILESYSTEM CONTEXT OPERATIONS +================================= + +The filesystem context points to a table of operations: + + struct fs_context_operations { + void (*free)(struct fs_context *fc); + int (*dup)(struct fs_context *fc, struct fs_context *src_fc); + int (*parse_param)(struct fs_context *fc, + struct struct fs_parameter *param); + int (*parse_monolithic)(struct fs_context *fc, void *data); + int (*get_tree)(struct fs_context *fc); + int (*reconfigure)(struct fs_context *fc); + }; + +These operations are invoked by the various stages of the mount procedure to +manage the filesystem context. They are as follows: + + (*) void (*free)(struct fs_context *fc); + + Called to clean up the filesystem-specific part of the filesystem context + when the context is destroyed. It should be aware that parts of the + context may have been removed and NULL'd out by ->get_tree(). + + (*) int (*dup)(struct fs_context *fc, struct fs_context *src_fc); + + Called when a filesystem context has been duplicated to duplicate the + filesystem-private data. An error may be returned to indicate failure to + do this. + + [!] Note that even if this fails, put_fs_context() will be called + immediately thereafter, so ->dup() *must* make the + filesystem-private data safe for ->free(). + + (*) int (*parse_param)(struct fs_context *fc, + struct struct fs_parameter *param); + + Called when a parameter is being added to the filesystem context. param + points to the key name and maybe a value object. VFS-specific options + will have been weeded out and fc->sb_flags updated in the context. + Security options will also have been weeded out and fc->security updated. + + The parameter can be parsed with fs_parse() and fs_lookup_param(). Note + that the source(s) are presented as parameters named "source". + + If successful, 0 should be returned or a negative error code otherwise. + + (*) int (*parse_monolithic)(struct fs_context *fc, void *data); + + Called when the mount(2) system call is invoked to pass the entire data + page in one go. If this is expected to be just a list of "key[=val]" + items separated by commas, then this may be set to NULL. + + The return value is as for ->parse_param(). + + If the filesystem (e.g. NFS) needs to examine the data first and then + finds it's the standard key-val list then it may pass it off to + generic_parse_monolithic(). + + (*) int (*get_tree)(struct fs_context *fc); + + Called to get or create the mountable root and superblock, using the + information stored in the filesystem context (reconfiguration goes via a + different vector). It may detach any resources it desires from the + filesystem context and transfer them to the superblock it creates. + + On success it should set fc->root to the mountable root and return 0. In + the case of an error, it should return a negative error code. + + The phase on a userspace-driven context will be set to only allow this to + be called once on any particular context. + + (*) int (*reconfigure)(struct fs_context *fc); + + Called to effect reconfiguration of a superblock using information stored + in the filesystem context. It may detach any resources it desires from + the filesystem context and transfer them to the superblock. The + superblock can be found from fc->root->d_sb. + + On success it should return 0. In the case of an error, it should return + a negative error code. + + [NOTE] reconfigure is intended as a replacement for remount_fs. + + +=========================== +FILESYSTEM CONTEXT SECURITY +=========================== + +The filesystem context contains a security pointer that the LSMs can use for +building up a security context for the superblock to be mounted. There are a +number of operations used by the new mount code for this purpose: + + (*) int security_fs_context_alloc(struct fs_context *fc, + struct dentry *reference); + + Called to initialise fc->security (which is preset to NULL) and allocate + any resources needed. It should return 0 on success or a negative error + code on failure. + + reference will be non-NULL if the context is being created for superblock + reconfiguration (FS_CONTEXT_FOR_RECONFIGURE) in which case it indicates + the root dentry of the superblock to be reconfigured. It will also be + non-NULL in the case of a submount (FS_CONTEXT_FOR_SUBMOUNT) in which case + it indicates the automount point. + + (*) int security_fs_context_dup(struct fs_context *fc, + struct fs_context *src_fc); + + Called to initialise fc->security (which is preset to NULL) and allocate + any resources needed. The original filesystem context is pointed to by + src_fc and may be used for reference. It should return 0 on success or a + negative error code on failure. + + (*) void security_fs_context_free(struct fs_context *fc); + + Called to clean up anything attached to fc->security. Note that the + contents may have been transferred to a superblock and the pointer cleared + during get_tree. + + (*) int security_fs_context_parse_param(struct fs_context *fc, + struct fs_parameter *param); + + Called for each mount parameter, including the source. The arguments are + as for the ->parse_param() method. It should return 0 to indicate that + the parameter should be passed on to the filesystem, 1 to indicate that + the parameter should be discarded or an error to indicate that the + parameter should be rejected. + + The value pointed to by param may be modified (if a string) or stolen + (provided the value pointer is NULL'd out). If it is stolen, 1 must be + returned to prevent it being passed to the filesystem. + + (*) int security_fs_context_validate(struct fs_context *fc); + + Called after all the options have been parsed to validate the collection + as a whole and to do any necessary allocation so that + security_sb_get_tree() and security_sb_reconfigure() are less likely to + fail. It should return 0 or a negative error code. + + In the case of reconfiguration, the target superblock will be accessible + via fc->root. + + (*) int security_sb_get_tree(struct fs_context *fc); + + Called during the mount procedure to verify that the specified superblock + is allowed to be mounted and to transfer the security data there. It + should return 0 or a negative error code. + + (*) void security_sb_reconfigure(struct fs_context *fc); + + Called to apply any reconfiguration to an LSM's context. It must not + fail. Error checking and resource allocation must be done in advance by + the parameter parsing and validation hooks. + + (*) int security_sb_mountpoint(struct fs_context *fc, struct path *mountpoint, + unsigned int mnt_flags); + + Called during the mount procedure to verify that the root dentry attached + to the context is permitted to be attached to the specified mountpoint. + It should return 0 on success or a negative error code on failure. + + +================================= +VFS FILESYSTEM CONTEXT OPERATIONS +================================= + +There are four operations for creating a filesystem context and +one for destroying a context: + + (*) struct fs_context *vfs_new_fs_context(struct file_system_type *fs_type, + struct dentry *reference, + unsigned int sb_flags, + unsigned int sb_flags_mask, + enum fs_context_purpose purpose); + + Create a filesystem context for a given filesystem type and purpose. This + allocates the filesystem context, sets the superblock flags, initialises + the security and calls fs_type->init_fs_context() to initialise the + filesystem private data. + + reference can be NULL or it may indicate the root dentry of a superblock + that is going to be reconfigured (FS_CONTEXT_FOR_RECONFIGURE) or + the automount point that triggered a submount (FS_CONTEXT_FOR_SUBMOUNT). + This is provided as a source of namespace information. + + (*) struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc); + + Duplicate a filesystem context, copying any options noted and duplicating + or additionally referencing any resources held therein. This is available + for use where a filesystem has to get a mount within a mount, such as NFS4 + does by internally mounting the root of the target server and then doing a + private pathwalk to the target directory. + + The purpose in the new context is inherited from the old one. + + (*) void put_fs_context(struct fs_context *fc); + + Destroy a filesystem context, releasing any resources it holds. This + calls the ->free() operation. This is intended to be called by anyone who + created a filesystem context. + + [!] filesystem contexts are not refcounted, so this causes unconditional + destruction. + +In all the above operations, apart from the put op, the return is a mount +context pointer or a negative error code. + +For the remaining operations, if an error occurs, a negative error code will be +returned. + + (*) int vfs_get_tree(struct fs_context *fc); + + Get or create the mountable root and superblock, using the parameters in + the filesystem context to select/configure the superblock. This invokes + the ->validate() op and then the ->get_tree() op. + + [NOTE] ->validate() could perhaps be rolled into ->get_tree() and + ->reconfigure(). + + (*) struct vfsmount *vfs_create_mount(struct fs_context *fc); + + Create a mount given the parameters in the specified filesystem context. + Note that this does not attach the mount to anything. + + (*) int vfs_parse_fs_param(struct fs_context *fc, + struct fs_parameter *param); + + Supply a single mount parameter to the filesystem context. This include + the specification of the source/device which is specified as the "source" + parameter (which may be specified multiple times if the filesystem + supports that). + + param specifies the parameter key name and the value. The parameter is + first checked to see if it corresponds to a standard mount flag (in which + case it is used to set an SB_xxx flag and consumed) or a security option + (in which case the LSM consumes it) before it is passed on to the + filesystem. + + The parameter value is typed and can be one of: + + fs_value_is_flag, Parameter not given a value. + fs_value_is_string, Value is a string + fs_value_is_blob, Value is a binary blob + fs_value_is_filename, Value is a filename* + dirfd + fs_value_is_filename_empty, Value is a filename* + dirfd + AT_EMPTY_PATH + fs_value_is_file, Value is an open file (file*) + + If there is a value, that value is stored in a union in the struct in one + of param->{string,blob,name,file}. Note that the function may steal and + clear the pointer, but then becomes responsible for disposing of the + object. + + (*) int vfs_parse_fs_string(struct fs_context *fc, char *key, + const char *value, size_t v_size); + + A wrapper around vfs_parse_fs_param() that just passes a constant string. + + (*) int generic_parse_monolithic(struct fs_context *fc, void *data); + + Parse a sys_mount() data page, assuming the form to be a text list + consisting of key[=val] options separated by commas. Each item in the + list is passed to vfs_mount_option(). This is the default when the + ->parse_monolithic() operation is NULL. + + +===================== +PARAMETER DESCRIPTION +===================== + +Parameters are described using structures defined in linux/fs_parser.h. +There's a core description struct that links everything together: + + struct fs_parameter_description { + const char name[16]; + u8 nr_params; + u8 nr_alt_keys; + u8 nr_enums; + bool ignore_unknown; + bool no_source; + const char *const *keys; + const struct constant_table *alt_keys; + const struct fs_parameter_spec *specs; + const struct fs_parameter_enum *enums; + }; + +For example: + + enum afs_param { + Opt_autocell, + Opt_bar, + Opt_dyn, + Opt_foo, + Opt_source, + nr__afs_params + }; + + static const struct fs_parameter_description afs_fs_parameters = { + .name = "kAFS", + .nr_params = nr__afs_params, + .nr_alt_keys = ARRAY_SIZE(afs_param_alt_keys), + .nr_enums = ARRAY_SIZE(afs_param_enums), + .keys = afs_param_keys, + .alt_keys = afs_param_alt_keys, + .specs = afs_param_specs, + .enums = afs_param_enums, + }; + +The members are as follows: + + (1) const char name[16]; + + The name to be used in error messages generated by the parse helper + functions. + + (2) u8 nr_params; + + The number of discrete parameter identifiers. This indicates the number + of elements in the ->types[] array and also limits the values that may be + used in the values that the ->keys[] array maps to. + + It is expected that, for example, two parameters that are related, say + "acl" and "noacl" with have the same ID, but will be flagged to indicate + that one is the inverse of the other. The value can then be picked out + from the parse result. + + (3) const struct fs_parameter_specification *specs; + + Table of parameter specifications, where the entries are of type: + + struct fs_parameter_type { + enum fs_parameter_spec type:8; + u8 flags; + }; + + and the parameter identifier is the index to the array. 'type' indicates + the desired value type and must be one of: + + TYPE NAME EXPECTED VALUE RESULT IN + ======================= ======================= ===================== + fs_param_is_flag No value n/a + fs_param_is_bool Boolean value result->boolean + fs_param_is_u32 32-bit unsigned int result->uint_32 + fs_param_is_u32_octal 32-bit octal int result->uint_32 + fs_param_is_u32_hex 32-bit hex int result->uint_32 + fs_param_is_s32 32-bit signed int result->int_32 + fs_param_is_enum Enum value name result->uint_32 + fs_param_is_string Arbitrary string param->string + fs_param_is_blob Binary blob param->blob + fs_param_is_blockdev Blockdev path * Needs lookup + fs_param_is_path Path * Needs lookup + fs_param_is_fd File descriptor param->file + + And each parameter can be qualified with 'flags': + + fs_param_v_optional The value is optional + fs_param_neg_with_no If key name is prefixed with "no", it is false + fs_param_neg_with_empty If value is "", it is false + fs_param_deprecated The parameter is deprecated. + + For example: + + static const struct fs_parameter_spec afs_param_specs[nr__afs_params] = { + [Opt_autocell] = { fs_param_is flag }, + [Opt_bar] = { fs_param_is_enum }, + [Opt_dyn] = { fs_param_is flag }, + [Opt_foo] = { fs_param_is_bool, fs_param_neg_with_no }, + [Opt_source] = { fs_param_is_string }, + }; + + Note that if the value is of fs_param_is_bool type, fs_parse() will try + to match any string value against "0", "1", "no", "yes", "false", "true". + + [!] NOTE that the table must be sorted according to primary key name so + that ->keys[] is also sorted. + + (4) const char *const *keys; + + Table of primary key names for the parameters. There must be one entry + per defined parameter. The table is optional if ->nr_params is 0. The + table is just an array of names e.g.: + + static const char *const afs_param_keys[nr__afs_params] = { + [Opt_autocell] = "autocell", + [Opt_bar] = "bar", + [Opt_dyn] = "dyn", + [Opt_foo] = "foo", + [Opt_source] = "source", + }; + + [!] NOTE that the table must be sorted such that the table can be searched + with bsearch() using strcmp(). This means that the Opt_* values must + correspond to the entries in this table. + + (5) const struct constant_table *alt_keys; + u8 nr_alt_keys; + + Table of additional key names and their mappings to parameter ID plus the + number of elements in the table. This is optional. The table is just an + array of { name, integer } pairs, e.g.: + + static const struct constant_table afs_param_keys[] = { + { "baz", Opt_bar }, + { "dynamic", Opt_dyn }, + }; + + [!] NOTE that the table must be sorted such that strcmp() can be used with + bsearch() to search the entries. + + The parameter ID can also be fs_param_key_removed to indicate that a + deprecated parameter has been removed and that an error will be given. + This differs from fs_param_deprecated where the parameter may still have + an effect. + + Further, the behaviour of the parameter may differ when an alternate name + is used (for instance with NFS, "v3", "v4.2", etc. are alternate names). + + (6) const struct fs_parameter_enum *enums; + u8 nr_enums; + + Table of enum value names to integer mappings and the number of elements + stored therein. This is of type: + + struct fs_parameter_enum { + u8 param_id; + char name[14]; + u8 value; + }; + + Where the array is an unsorted list of { parameter ID, name }-keyed + elements that indicate the value to map to, e.g.: + + static const struct fs_parameter_enum afs_param_enums[] = { + { Opt_bar, "x", 1}, + { Opt_bar, "y", 23}, + { Opt_bar, "z", 42}, + }; + + If a parameter of type fs_param_is_enum is encountered, fs_parse() will + try to look the value up in the enum table and the result will be stored + in the parse result. + + (7) bool no_source; + + If this is set, fs_parse() will ignore any "source" parameter and not + pass it to the filesystem. + +The parser should be pointed to by the parser pointer in the file_system_type +struct as this will provide validation on registration (if +CONFIG_VALIDATE_FS_PARSER=y) and will allow the description to be queried from +userspace using the fsinfo() syscall. + + +========================== +PARAMETER HELPER FUNCTIONS +========================== + +A number of helper functions are provided to help a filesystem or an LSM +process the parameters it is given. + + (*) int lookup_constant(const struct constant_table tbl[], + const char *name, int not_found); + + Look up a constant by name in a table of name -> integer mappings. The + table is an array of elements of the following type: + + struct constant_table { + const char *name; + int value; + }; + + and it must be sorted such that it can be searched using bsearch() using + strcmp(). If a match is found, the corresponding value is returned. If a + match isn't found, the not_found value is returned instead. + + (*) bool validate_constant_table(const struct constant_table *tbl, + size_t tbl_size, + int low, int high, int special); + + Validate a constant table. Checks that all the elements are appropriately + ordered, that there are no duplicates and that the values are between low + and high inclusive, though provision is made for one allowable special + value outside of that range. If no special value is required, special + should just be set to lie inside the low-to-high range. + + If all is good, true is returned. If the table is invalid, errors are + logged to dmesg, the stack is dumped and false is returned. + + (*) int fs_parse(struct fs_context *fc, + const struct fs_param_parser *parser, + struct fs_parameter *param, + struct fs_param_parse_result *result); + + This is the main interpreter of parameters. It uses the parameter + description (parser) to look up the name of the parameter to use and to + convert that to a parameter ID (stored in result->key). + + If successful, and if the parameter type indicates the result is a + boolean, integer or enum type, the value is converted by this function and + the result stored in result->{boolean,int_32,uint_32}. + + If a match isn't initially made, the key is prefixed with "no" and no + value is present then an attempt will be made to look up the key with the + prefix removed. If this matches a parameter for which the type has flag + fs_param_neg_with_no set, then a match will be made and the value will be + set to false/0/NULL. + + If the parameter is successfully matched and, optionally, parsed + correctly, 1 is returned. If the parameter isn't matched and + parser->ignore_unknown is set, then 0 is returned. Otherwise -EINVAL is + returned. + + (*) bool fs_validate_description(const struct fs_parameter_description *desc); + + This is validates the parameter description. It returns true if the + description is good and false if it is not. + + (*) int fs_lookup_param(struct fs_context *fc, + struct fs_parameter *value, + bool want_bdev, + struct path *_path); + + This takes a parameter that carries a string or filename type and attempts + to do a path lookup on it. If the parameter expects a blockdev, a check + is made that the inode actually represents one. + + Returns 0 if successful and *_path will be set; returns a negative error + code if not. diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/kernfs.h> +#include <linux/fs_context.h> #include <linux/jump_label.h> #define MSR_IA32_L3_QOS_CFG 0xc81 @@ -40,6 +41,21 @@ #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) + +struct rdt_fs_context { + struct kernfs_fs_context kfc; + bool enable_cdpl2; + bool enable_cdpl3; + bool enable_mba_mbps; +}; + +static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + return container_of(kfc, struct rdt_fs_context, kfc); +} + DECLARE_STATIC_KEY_FALSE(rdt_enable_key); /** diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -24,6 +24,7 @@ #include <linux/cpu.h> #include <linux/debugfs.h> #include <linux/fs.h> +#include <linux/fs_parser.h> #include <linux/sysfs.h> #include <linux/kernfs.h> #include <linux/seq_buf.h> @@ -32,6 +33,7 @@ #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/task_work.h> +#include <linux/user_namespace.h> #include <uapi/linux/magic.h> @@ -1858,46 +1860,6 @@ static void cdp_disable_all(void) cdpl2_disable(); } -static int parse_rdtgroupfs_options(char *data) -{ - char *token, *o = data; - int ret = 0; - - while ((token = strsep(&o, ",")) != NULL) { - if (!*token) { - ret = -EINVAL; - goto out; - } - - if (!strcmp(token, "cdp")) { - ret = cdpl3_enable(); - if (ret) - goto out; - } else if (!strcmp(token, "cdpl2")) { - ret = cdpl2_enable(); - if (ret) - goto out; - } else if (!strcmp(token, "mba_MBps")) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - ret = set_mba_sc(true); - else - ret = -EINVAL; - if (ret) - goto out; - } else { - ret = -EINVAL; - goto out; - } - } - - return 0; - -out: - pr_err("Invalid mount option \"%s\"\n", token); - - return ret; -} - /* * We don't allow rdtgroup directories to be created anywhere * except the root directory. Thus when looking for the rdtgroup @@ -1969,13 +1931,27 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, struct kernfs_node **mon_data_kn); -static struct dentry *rdt_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +static int rdt_enable_ctx(struct rdt_fs_context *ctx) +{ + int ret = 0; + + if (ctx->enable_cdpl2) + ret = cdpl2_enable(); + + if (!ret && ctx->enable_cdpl3) + ret = cdpl3_enable(); + + if (!ret && ctx->enable_mba_mbps) + ret = set_mba_sc(true); + + return ret; +} + +static int rdt_get_tree(struct fs_context *fc) { + struct rdt_fs_context *ctx = rdt_fc2context(fc); struct rdt_domain *dom; struct rdt_resource *r; - struct dentry *dentry; int ret; cpus_read_lock(); @@ -1984,53 +1960,42 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, * resctrl file system can only be mounted once. */ if (static_branch_unlikely(&rdt_enable_key)) { - dentry = ERR_PTR(-EBUSY); + ret = -EBUSY; goto out; } - ret = parse_rdtgroupfs_options(data); - if (ret) { - dentry = ERR_PTR(ret); + ret = rdt_enable_ctx(ctx); + if (ret < 0) goto out_cdp; - } closid_init(); ret = rdtgroup_create_info_dir(rdtgroup_default.kn); - if (ret) { - dentry = ERR_PTR(ret); - goto out_cdp; - } + if (ret < 0) + goto out_mba; if (rdt_mon_capable) { ret = mongroup_create_dir(rdtgroup_default.kn, NULL, "mon_groups", &kn_mongrp); - if (ret) { - dentry = ERR_PTR(ret); + if (ret < 0) goto out_info; - } kernfs_get(kn_mongrp); ret = mkdir_mondata_all(rdtgroup_default.kn, &rdtgroup_default, &kn_mondata); - if (ret) { - dentry = ERR_PTR(ret); + if (ret < 0) goto out_mongrp; - } kernfs_get(kn_mondata); rdtgroup_default.mon.mon_data_kn = kn_mondata; } ret = rdt_pseudo_lock_init(); - if (ret) { - dentry = ERR_PTR(ret); + if (ret) goto out_mondata; - } - dentry = kernfs_mount(fs_type, flags, rdt_root, - RDTGROUP_SUPER_MAGIC, NULL); - if (IS_ERR(dentry)) + ret = kernfs_get_tree(fc); + if (ret < 0) goto out_psl; if (rdt_alloc_capable) @@ -2059,14 +2024,95 @@ out_mongrp: kernfs_remove(kn_mongrp); out_info: kernfs_remove(kn_info); +out_mba: + if (ctx->enable_mba_mbps) + set_mba_sc(false); out_cdp: cdp_disable_all(); out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); cpus_read_unlock(); + return ret; +} + +enum rdt_param { + Opt_cdp, + Opt_cdpl2, + Opt_mba_mpbs, + nr__rdt_params +}; + +static const struct fs_parameter_spec rdt_param_specs[] = { + fsparam_flag("cdp", Opt_cdp), + fsparam_flag("cdpl2", Opt_cdpl2), + fsparam_flag("mba_mpbs", Opt_mba_mpbs), + {} +}; + +static const struct fs_parameter_description rdt_fs_parameters = { + .name = "rdt", + .specs = rdt_param_specs, +}; + +static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, &rdt_fs_parameters, param, &result); + if (opt < 0) + return opt; - return dentry; + switch (opt) { + case Opt_cdp: + ctx->enable_cdpl3 = true; + return 0; + case Opt_cdpl2: + ctx->enable_cdpl2 = true; + return 0; + case Opt_mba_mpbs: + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -EINVAL; + ctx->enable_mba_mbps = true; + return 0; + } + + return -EINVAL; +} + +static void rdt_fs_context_free(struct fs_context *fc) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + + kernfs_free_fs_context(fc); + kfree(ctx); +} + +static const struct fs_context_operations rdt_fs_context_ops = { + .free = rdt_fs_context_free, + .parse_param = rdt_parse_param, + .get_tree = rdt_get_tree, +}; + +static int rdt_init_fs_context(struct fs_context *fc) +{ + struct rdt_fs_context *ctx; + + ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->kfc.root = rdt_root; + ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; + fc->fs_private = &ctx->kfc; + fc->ops = &rdt_fs_context_ops; + if (fc->user_ns) + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(&init_user_ns); + fc->global = true; + return 0; } static int reset_all_ctrls(struct rdt_resource *r) @@ -2239,9 +2285,10 @@ static void rdt_kill_sb(struct super_block *sb) } static struct file_system_type rdt_fs_type = { - .name = "resctrl", - .mount = rdt_mount, - .kill_sb = rdt_kill_sb, + .name = "resctrl", + .init_fs_context = rdt_init_fs_context, + .parameters = &rdt_fs_parameters, + .kill_sb = rdt_kill_sb, }; static int mon_addfile(struct kernfs_node *parent_kn, const char *name, diff --git a/fs/Kconfig b/fs/Kconfig @@ -8,6 +8,13 @@ menu "File systems" config DCACHE_WORD_ACCESS bool +config VALIDATE_FS_PARSER + bool "Validate filesystem parameter description" + default y + help + Enable this to perform validation of the parameter description for a + filesystem when it is registered. + if BLOCK config FS_IOMAP diff --git a/fs/Makefile b/fs/Makefile @@ -13,7 +13,7 @@ obj-y := open.o read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o + fs_types.o fs_context.o fs_parser.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/afs/internal.h b/fs/afs/internal.h @@ -36,15 +36,14 @@ struct pagevec; struct afs_call; -struct afs_mount_params { - bool rwpath; /* T if the parent should be considered R/W */ +struct afs_fs_context { bool force; /* T to force cell type */ bool autocell; /* T if set auto mount operation */ bool dyn_root; /* T if dynamic root */ + bool no_cell; /* T if the source is "none" (for dynroot) */ afs_voltype_t type; /* type of volume requested */ - int volnamesz; /* size of volume name */ + unsigned int volnamesz; /* size of volume name */ const char *volname; /* name of volume to mount */ - struct net *net_ns; /* Network namespace in effect */ struct afs_net *net; /* the AFS net namespace stuff */ struct afs_cell *cell; /* cell in which to find volume */ struct afs_volume *volume; /* volume record */ @@ -1274,7 +1273,7 @@ static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume) return volume; } -extern struct afs_volume *afs_create_volume(struct afs_mount_params *); +extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern void afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); extern void afs_put_volume(struct afs_cell *, struct afs_volume *); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c @@ -17,6 +17,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/gfp.h> +#include <linux/fs_context.h> #include "internal.h" @@ -47,6 +48,8 @@ static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); static unsigned long afs_mntpt_expiry_timeout = 10 * 60; +static const char afs_root_volume[] = "root.cell"; + /* * no valid lookup procedure on this sort of dir */ @@ -68,108 +71,112 @@ static int afs_mntpt_open(struct inode *inode, struct file *file) } /* - * create a vfsmount to be automounted + * Set the parameters for the proposed superblock. */ -static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) { - struct afs_super_info *as; - struct vfsmount *mnt; - struct afs_vnode *vnode; - struct page *page; - char *devname, *options; - bool rwpath = false; + struct afs_fs_context *ctx = fc->fs_private; + struct afs_super_info *src_as = AFS_FS_S(mntpt->d_sb); + struct afs_vnode *vnode = AFS_FS_I(d_inode(mntpt)); + struct afs_cell *cell; + const char *p; int ret; - _enter("{%pd}", mntpt); - - BUG_ON(!d_inode(mntpt)); - - ret = -ENOMEM; - devname = (char *) get_zeroed_page(GFP_KERNEL); - if (!devname) - goto error_no_devname; - - options = (char *) get_zeroed_page(GFP_KERNEL); - if (!options) - goto error_no_options; + if (fc->net_ns != src_as->net_ns) { + put_net(fc->net_ns); + fc->net_ns = get_net(src_as->net_ns); + } - vnode = AFS_FS_I(d_inode(mntpt)); + if (src_as->volume && src_as->volume->type == AFSVL_RWVOL) { + ctx->type = AFSVL_RWVOL; + ctx->force = true; + } + if (ctx->cell) { + afs_put_cell(ctx->net, ctx->cell); + ctx->cell = NULL; + } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { /* if the directory is a pseudo directory, use the d_name */ - static const char afs_root_cell[] = ":root.cell."; unsigned size = mntpt->d_name.len; - ret = -ENOENT; - if (size < 2 || size > AFS_MAXCELLNAME) - goto error_no_page; + if (size < 2) + return -ENOENT; + p = mntpt->d_name.name; if (mntpt->d_name.name[0] == '.') { - devname[0] = '%'; - memcpy(devname + 1, mntpt->d_name.name + 1, size - 1); - memcpy(devname + size, afs_root_cell, - sizeof(afs_root_cell)); - rwpath = true; - } else { - devname[0] = '#'; - memcpy(devname + 1, mntpt->d_name.name, size); - memcpy(devname + size + 1, afs_root_cell, - sizeof(afs_root_cell)); + size--; + p++; + ctx->type = AFSVL_RWVOL; + ctx->force = true; } + if (size > AFS_MAXCELLNAME) + return -ENAMETOOLONG; + + cell = afs_lookup_cell(ctx->net, p, size, NULL, false); + if (IS_ERR(cell)) { + pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); + return PTR_ERR(cell); + } + ctx->cell = cell; + + ctx->volname = afs_root_volume; + ctx->volnamesz = sizeof(afs_root_volume) - 1; } else { /* read the contents of the AFS special symlink */ + struct page *page; loff_t size = i_size_read(d_inode(mntpt)); char *buf; - ret = -EINVAL; + if (src_as->cell) + ctx->cell = afs_get_cell(src_as->cell); + if (size > PAGE_SIZE - 1) - goto error_no_page; + return -EINVAL; page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto error_no_page; - } + if (IS_ERR(page)) + return PTR_ERR(page); if (PageError(page)) { ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt); - goto error; + put_page(page); + return ret; } - buf = kmap_atomic(page); - memcpy(devname, buf, size); - kunmap_atomic(buf); + buf = kmap(page); + ret = vfs_parse_fs_string(fc, "source", buf, size); + kunmap(page); put_page(page); - page = NULL; + if (ret < 0) + return ret; } - /* work out what options we want */ - as = AFS_FS_S(mntpt->d_sb); - if (as->cell) { - memcpy(options, "cell=", 5); - strcpy(options + 5, as->cell->name); - if ((as->volume && as->volume->type == AFSVL_RWVOL) || rwpath) - strcat(options, ",rwpath"); - } + return 0; +} - /* try and do the mount */ - _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_submount(mntpt, &afs_fs_type, devname, options); - _debug("--- mount result %p ---", mnt); +/* + * create a vfsmount to be automounted + */ +static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +{ + struct fs_context *fc; + struct vfsmount *mnt; + int ret; - free_page((unsigned long) devname); - free_page((unsigned long) options); - _leave(" = %p", mnt); - return mnt; + BUG_ON(!d_inode(mntpt)); -error: - put_page(page); -error_no_page: - free_page((unsigned long) options); -error_no_options: - free_page((unsigned long) devname); -error_no_devname: - _leave(" = %d", ret); - return ERR_PTR(ret); + fc = fs_context_for_submount(&afs_fs_type, mntpt); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + ret = afs_mntpt_set_params(fc, mntpt); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + + put_fs_context(fc); + return mnt; } /* diff --git a/fs/afs/super.c b/fs/afs/super.c @@ -1,6 +1,6 @@ /* AFS superblock handling * - * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved. + * Copyright (c) 2002, 2007, 2018 Red Hat, Inc. All rights reserved. * * This software may be freely redistributed under the terms of the * GNU General Public License. @@ -21,7 +21,7 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/statfs.h> #include <linux/sched.h> #include <linux/nsproxy.h> @@ -30,21 +30,22 @@ #include "internal.h" static void afs_i_init_once(void *foo); -static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data); static void afs_kill_super(struct super_block *sb); static struct inode *afs_alloc_inode(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); static int afs_show_devname(struct seq_file *m, struct dentry *root); static int afs_show_options(struct seq_file *m, struct dentry *root); +static int afs_init_fs_context(struct fs_context *fc); +static const struct fs_parameter_description afs_fs_parameters; struct file_system_type afs_fs_type = { - .owner = THIS_MODULE, - .name = "afs", - .mount = afs_mount, - .kill_sb = afs_kill_super, - .fs_flags = 0, + .owner = THIS_MODULE, + .name = "afs", + .init_fs_context = afs_init_fs_context, + .parameters = &afs_fs_parameters, + .kill_sb = afs_kill_super, + .fs_flags = 0, }; MODULE_ALIAS_FS("afs"); @@ -63,22 +64,22 @@ static const struct super_operations afs_super_ops = { static struct kmem_cache *afs_inode_cachep; static atomic_t afs_count_active_inodes; -enum { - afs_no_opt, - afs_opt_cell, - afs_opt_dyn, - afs_opt_rwpath, - afs_opt_vol, - afs_opt_autocell, +enum afs_param { + Opt_autocell, + Opt_dyn, + Opt_source, }; -static const match_table_t afs_options_list = { - { afs_opt_cell, "cell=%s" }, - { afs_opt_dyn, "dyn" }, - { afs_opt_rwpath, "rwpath" }, - { afs_opt_vol, "vol=%s" }, - { afs_opt_autocell, "autocell" }, - { afs_no_opt, NULL }, +static const struct fs_parameter_spec afs_param_specs[] = { + fsparam_flag ("autocell", Opt_autocell), + fsparam_flag ("dyn", Opt_dyn), + fsparam_string("source", Opt_source), + {} +}; + +static const struct fs_parameter_description afs_fs_parameters = { + .name = "kAFS", + .specs = afs_param_specs, }; /* @@ -190,84 +191,23 @@ static int afs_show_options(struct seq_file *m, struct dentry *root) } /* - * parse the mount options - * - this function has been shamelessly adapted from the ext3 fs which - * shamelessly adapted it from the msdos fs - */ -static int afs_parse_options(struct afs_mount_params *params, - char *options, const char **devname) -{ - struct afs_cell *cell; - substring_t args[MAX_OPT_ARGS]; - char *p; - int token; - - _enter("%s", options); - - options[PAGE_SIZE - 1] = 0; - - while ((p = strsep(&options, ","))) { - if (!*p) - continue; - - token = match_token(p, afs_options_list, args); - switch (token) { - case afs_opt_cell: - rcu_read_lock(); - cell = afs_lookup_cell_rcu(params->net, - args[0].from, - args[0].to - args[0].from); - rcu_read_unlock(); - if (IS_ERR(cell)) - return PTR_ERR(cell); - afs_put_cell(params->net, params->cell); - params->cell = cell; - break; - - case afs_opt_rwpath: - params->rwpath = true; - break; - - case afs_opt_vol: - *devname = args[0].from; - break; - - case afs_opt_autocell: - params->autocell = true; - break; - - case afs_opt_dyn: - params->dyn_root = true; - break; - - default: - printk(KERN_ERR "kAFS:" - " Unknown or invalid mount option: '%s'\n", p); - return -EINVAL; - } - } - - _leave(" = 0"); - return 0; -} - -/* - * parse a device name to get cell name, volume name, volume type and R/W - * selector - * - this can be one of the following: + * Parse the source name to get cell name, volume name, volume type and R/W + * selector. + * + * This can be one of the following: * "%[cell:]volume[.]" R/W volume - * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0), - * or R/W (rwpath=1) volume + * "#[cell:]volume[.]" R/O or R/W volume (R/O parent), + * or R/W (R/W parent) volume * "%[cell:]volume.readonly" R/O volume * "#[cell:]volume.readonly" R/O volume * "%[cell:]volume.backup" Backup volume * "#[cell:]volume.backup" Backup volume */ -static int afs_parse_device_name(struct afs_mount_params *params, - const char *name) +static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) { + struct afs_fs_context *ctx = fc->fs_private; struct afs_cell *cell; - const char *cellname, *suffix; + const char *cellname, *suffix, *name = param->string; int cellnamesz; _enter(",%s", name); @@ -278,69 +218,149 @@ static int afs_parse_device_name(struct afs_mount_params *params, } if ((name[0] != '%' && name[0] != '#') || !name[1]) { + /* To use dynroot, we don't want to have to provide a source */ + if (strcmp(name, "none") == 0) { + ctx->no_cell = true; + return 0; + } printk(KERN_ERR "kAFS: unparsable volume name\n"); return -EINVAL; } /* determine the type of volume we're looking for */ - params->type = AFSVL_ROVOL; - params->force = false; - if (params->rwpath || name[0] == '%') { - params->type = AFSVL_RWVOL; - params->force = true; + if (name[0] == '%') { + ctx->type = AFSVL_RWVOL; + ctx->force = true; } name++; /* split the cell name out if there is one */ - params->volname = strchr(name, ':'); - if (params->volname) { + ctx->volname = strchr(name, ':'); + if (ctx->volname) { cellname = name; - cellnamesz = params->volname - name; - params->volname++; + cellnamesz = ctx->volname - name; + ctx->volname++; } else { - params->volname = name; + ctx->volname = name; cellname = NULL; cellnamesz = 0; } /* the volume type is further affected by a possible suffix */ - suffix = strrchr(params->volname, '.'); + suffix = strrchr(ctx->volname, '.'); if (suffix) { if (strcmp(suffix, ".readonly") == 0) { - params->type = AFSVL_ROVOL; - params->force = true; + ctx->type = AFSVL_ROVOL; + ctx->force = true; } else if (strcmp(suffix, ".backup") == 0) { - params->type = AFSVL_BACKVOL; - params->force = true; + ctx->type = AFSVL_BACKVOL; + ctx->force = true; } else if (suffix[1] == 0) { } else { suffix = NULL; } } - params->volnamesz = suffix ? - suffix - params->volname : strlen(params->volname); + ctx->volnamesz = suffix ? + suffix - ctx->volname : strlen(ctx->volname); _debug("cell %*.*s [%p]", - cellnamesz, cellnamesz, cellname ?: "", params->cell); + cellnamesz, cellnamesz, cellname ?: "", ctx->cell); /* lookup the cell record */ - if (cellname || !params->cell) { - cell = afs_lookup_cell(params->net, cellname, cellnamesz, + if (cellname) { + cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, NULL, false); if (IS_ERR(cell)) { - printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n", + pr_err("kAFS: unable to lookup cell '%*.*s'\n", cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_put_cell(params->net, params->cell); - params->cell = cell; + afs_put_cell(ctx->net, ctx->cell); + ctx->cell = cell; } _debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s", - params->cell->name, params->cell, - params->volnamesz, params->volnamesz, params->volname, - suffix ?: "-", params->type, params->force ? " FORCE" : ""); + ctx->cell->name, ctx->cell, + ctx->volnamesz, ctx->volnamesz, ctx->volname, + suffix ?: "-", ctx->type, ctx->force ? " FORCE" : ""); + + fc->source = param->string; + param->string = NULL; + return 0; +} + +/* + * Parse a single mount parameter. + */ +static int afs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct fs_parse_result result; + struct afs_fs_context *ctx = fc->fs_private; + int opt; + + opt = fs_parse(fc, &afs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_source: + return afs_parse_source(fc, param); + + case Opt_autocell: + ctx->autocell = true; + break; + + case Opt_dyn: + ctx->dyn_root = true; + break; + + default: + return -EINVAL; + } + + _leave(" = 0"); + return 0; +} + +/* + * Validate the options, get the cell key and look up the volume. + */ +static int afs_validate_fc(struct fs_context *fc) +{ + struct afs_fs_context *ctx = fc->fs_private; + struct afs_volume *volume; + struct key *key; + + if (!ctx->dyn_root) { + if (ctx->no_cell) { + pr_warn("kAFS: Can only specify source 'none' with -o dyn\n"); + return -EINVAL; + } + + if (!ctx->cell) { + pr_warn("kAFS: No cell specified\n"); + return -EDESTADDRREQ; + } + + /* We try to do the mount securely. */ + key = afs_request_key(ctx->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + + ctx->key = key; + + if (ctx->volume) { + afs_put_volume(ctx->cell, ctx->volume); + ctx->volume = NULL; + } + + volume = afs_create_volume(ctx); + if (IS_ERR(volume)) + return PTR_ERR(volume); + + ctx->volume = volume; + } return 0; } @@ -348,39 +368,34 @@ static int afs_parse_device_name(struct afs_mount_params *params, /* * check a superblock to see if it's the one we're looking for */ -static int afs_test_super(struct super_block *sb, void *data) +static int afs_test_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as1 = data; + struct afs_fs_context *ctx = fc->fs_private; struct afs_super_info *as = AFS_FS_S(sb); - return (as->net_ns == as1->net_ns && + return (as->net_ns == fc->net_ns && as->volume && - as->volume->vid == as1->volume->vid && + as->volume->vid == ctx->volume->vid && !as->dyn_root); } -static int afs_dynroot_test_super(struct super_block *sb, void *data) +static int afs_dynroot_test_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as1 = data; struct afs_super_info *as = AFS_FS_S(sb); - return (as->net_ns == as1->net_ns && + return (as->net_ns == fc->net_ns && as->dyn_root); } -static int afs_set_super(struct super_block *sb, void *data) +static int afs_set_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as = data; - - sb->s_fs_info = as; return set_anon_super(sb, NULL); } /* * fill in the superblock */ -static int afs_fill_super(struct super_block *sb, - struct afs_mount_params *params) +static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) { struct afs_super_info *as = AFS_FS_S(sb); struct afs_fid fid; @@ -412,13 +427,13 @@ static int afs_fill_super(struct super_block *sb, fid.vnode = 1; fid.vnode_hi = 0; fid.unique = 1; - inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL); + inode = afs_iget(sb, ctx->key, &fid, NULL, NULL, NULL); } if (IS_ERR(inode)) return PTR_ERR(inode); - if (params->autocell || params->dyn_root) + if (ctx->autocell || as->dyn_root) set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); ret = -ENOMEM; @@ -443,17 +458,20 @@ error: return ret; } -static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params) +static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) { + struct afs_fs_context *ctx = fc->fs_private; struct afs_super_info *as; as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); if (as) { - as->net_ns = get_net(params->net_ns); - if (params->dyn_root) + as->net_ns = get_net(fc->net_ns); + if (ctx->dyn_root) { as->dyn_root = true; - else - as->cell = afs_get_cell(params->cell); + } else { + as->cell = afs_get_cell(ctx->cell); + as->volume = __afs_get_volume(ctx->volume); + } } return as; } @@ -475,7 +493,7 @@ static void afs_kill_super(struct super_block *sb) if (as->dyn_root) afs_dynroot_depopulate(sb); - + /* Clear the callback interests (which will do ilookup5) before * deactivating the superblock. */ @@ -488,111 +506,103 @@ static void afs_kill_super(struct super_block *sb) } /* - * get an AFS superblock + * Get an AFS superblock and root directory. */ -static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *options) +static int afs_get_tree(struct fs_context *fc) { - struct afs_mount_params params; + struct afs_fs_context *ctx = fc->fs_private; struct super_block *sb; - struct afs_volume *candidate; - struct key *key; struct afs_super_info *as; int ret; - _enter(",,%s,%p", dev_name, options); - - memset(&params, 0, sizeof(params)); - - ret = -EINVAL; - if (current->nsproxy->net_ns != &init_net) + ret = afs_validate_fc(fc); + if (ret) goto error; - params.net_ns = current->nsproxy->net_ns; - params.net = afs_net(params.net_ns); - - /* parse the options and device name */ - if (options) { - ret = afs_parse_options(&params, options, &dev_name); - if (ret < 0) - goto error; - } - - if (!params.dyn_root) { - ret = afs_parse_device_name(&params, dev_name); - if (ret < 0) - goto error; - /* try and do the mount securely */ - key = afs_request_key(params.cell); - if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - ret = PTR_ERR(key); - goto error; - } - params.key = key; - } + _enter(""); /* allocate a superblock info record */ ret = -ENOMEM; - as = afs_alloc_sbi(&params); + as = afs_alloc_sbi(fc); if (!as) - goto error_key; - - if (!params.dyn_root) { - /* Assume we're going to need a volume record; at the very - * least we can use it to update the volume record if we have - * one already. This checks that the volume exists within the - * cell. - */ - candidate = afs_create_volume(&params); - if (IS_ERR(candidate)) { - ret = PTR_ERR(candidate); - goto error_as; - } - - as->volume = candidate; - } + goto error; + fc->s_fs_info = as; /* allocate a deviceless superblock */ - sb = sget(fs_type, - as->dyn_root ? afs_dynroot_test_super : afs_test_super, - afs_set_super, flags, as); + sb = sget_fc(fc, + as->dyn_root ? afs_dynroot_test_super : afs_test_super, + afs_set_super); if (IS_ERR(sb)) { ret = PTR_ERR(sb); - goto error_as; + goto error; } if (!sb->s_root) { /* initial superblock/root creation */ _debug("create"); - ret = afs_fill_super(sb, &params); + ret = afs_fill_super(sb, ctx); if (ret < 0) goto error_sb; - as = NULL; sb->s_flags |= SB_ACTIVE; } else { _debug("reuse"); ASSERTCMP(sb->s_flags, &, SB_ACTIVE); - afs_destroy_sbi(as); - as = NULL; } - afs_put_cell(params.net, params.cell); - key_put(params.key); + fc->root = dget(sb->s_root); _leave(" = 0 [%p]", sb); - return dget(sb->s_root); + return 0; error_sb: deactivate_locked_super(sb); - goto error_key; -error_as: - afs_destroy_sbi(as); -error_key: - key_put(params.key); error: - afs_put_cell(params.net, params.cell); _leave(" = %d", ret); - return ERR_PTR(ret); + return ret; +} + +static void afs_free_fc(struct fs_context *fc) +{ + struct afs_fs_context *ctx = fc->fs_private; + + afs_destroy_sbi(fc->s_fs_info); + afs_put_volume(ctx->cell, ctx->volume); + afs_put_cell(ctx->net, ctx->cell); + key_put(ctx->key); + kfree(ctx); +} + +static const struct fs_context_operations afs_context_ops = { + .free = afs_free_fc, + .parse_param = afs_parse_param, + .get_tree = afs_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int afs_init_fs_context(struct fs_context *fc) +{ + struct afs_fs_context *ctx; + struct afs_cell *cell; + + ctx = kzalloc(sizeof(struct afs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->type = AFSVL_ROVOL; + ctx->net = afs_net(fc->net_ns); + + /* Default to the workstation cell. */ + rcu_read_lock(); + cell = afs_lookup_cell_rcu(ctx->net, NULL, 0); + rcu_read_unlock(); + if (IS_ERR(cell)) + cell = NULL; + ctx->cell = cell; + + fc->fs_private = ctx; + fc->ops = &afs_context_ops; + return 0; } /* diff --git a/fs/afs/volume.c b/fs/afs/volume.c @@ -21,7 +21,7 @@ static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" }; /* * Allocate a volume record and load it up from a vldb record. */ -static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params, +static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, struct afs_vldb_entry *vldb, unsigned long type_mask) { @@ -113,7 +113,7 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell, * - Rule 3: If parent volume is R/W, then only mount R/W volume unless * explicitly told otherwise */ -struct afs_volume *afs_create_volume(struct afs_mount_params *params) +struct afs_volume *afs_create_volume(struct afs_fs_context *params) { struct afs_vldb_entry *vldb; struct afs_volume *volume; diff --git a/fs/filesystems.c b/fs/filesystems.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/fs_parser.h> /* * Handling of filesystem drivers list. @@ -73,6 +74,9 @@ int register_filesystem(struct file_system_type * fs) int res = 0; struct file_system_type ** p; + if (fs->parameters && !fs_validate_description(fs->parameters)) + return -EINVAL; + BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; diff --git a/fs/fs_context.c b/fs/fs_context.c @@ -0,0 +1,642 @@ +/* Provide a way to create a superblock configuration context within the kernel + * that allows a superblock to be set up prior to mounting. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/nsproxy.h> +#include <linux/slab.h> +#include <linux/magic.h> +#include <linux/security.h> +#include <linux/mnt_namespace.h> +#include <linux/pid_namespace.h> +#include <linux/user_namespace.h> +#include <net/net_namespace.h> +#include "mount.h" +#include "internal.h" + +enum legacy_fs_param { + LEGACY_FS_UNSET_PARAMS, + LEGACY_FS_MONOLITHIC_PARAMS, + LEGACY_FS_INDIVIDUAL_PARAMS, +}; + +struct legacy_fs_context { + char *legacy_data; /* Data page for legacy filesystems */ + size_t data_size; + enum legacy_fs_param param_type; +}; + +static int legacy_init_fs_context(struct fs_context *fc); + +static const struct constant_table common_set_sb_flag[] = { + { "dirsync", SB_DIRSYNC }, + { "lazytime", SB_LAZYTIME }, + { "mand", SB_MANDLOCK }, + { "posixacl", SB_POSIXACL }, + { "ro", SB_RDONLY }, + { "sync", SB_SYNCHRONOUS }, +}; + +static const struct constant_table common_clear_sb_flag[] = { + { "async", SB_SYNCHRONOUS }, + { "nolazytime", SB_LAZYTIME }, + { "nomand", SB_MANDLOCK }, + { "rw", SB_RDONLY }, + { "silent", SB_SILENT }, +}; + +static const char *const forbidden_sb_flag[] = { + "bind", + "dev", + "exec", + "move", + "noatime", + "nodev", + "nodiratime", + "noexec", + "norelatime", + "nostrictatime", + "nosuid", + "private", + "rec", + "relatime", + "remount", + "shared", + "slave", + "strictatime", + "suid", + "unbindable", +}; + +/* + * Check for a common mount option that manipulates s_flags. + */ +static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) +{ + unsigned int token; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++) + if (strcmp(key, forbidden_sb_flag[i]) == 0) + return -EINVAL; + + token = lookup_constant(common_set_sb_flag, key, 0); + if (token) { + fc->sb_flags |= token; + fc->sb_flags_mask |= token; + return 0; + } + + token = lookup_constant(common_clear_sb_flag, key, 0); + if (token) { + fc->sb_flags &= ~token; + fc->sb_flags_mask |= token; + return 0; + } + + return -ENOPARAM; +} + +/** + * vfs_parse_fs_param - Add a single parameter to a superblock config + * @fc: The filesystem context to modify + * @param: The parameter + * + * A single mount option in string form is applied to the filesystem context + * being set up. Certain standard options (for example "ro") are translated + * into flag bits without going to the filesystem. The active security module + * is allowed to observe and poach options. Any other options are passed over + * to the filesystem to parse. + * + * This may be called multiple times for a context. + * + * Returns 0 on success and a negative error code on failure. In the event of + * failure, supplementary error information may have been set. + */ +int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) +{ + int ret; + + if (!param->key) + return invalf(fc, "Unnamed parameter\n"); + + ret = vfs_parse_sb_flag(fc, param->key); + if (ret != -ENOPARAM) + return ret; + + ret = security_fs_context_parse_param(fc, param); + if (ret != -ENOPARAM) + /* Param belongs to the LSM or is disallowed by the LSM; so + * don't pass to the FS. + */ + return ret; + + if (fc->ops->parse_param) { + ret = fc->ops->parse_param(fc, param); + if (ret != -ENOPARAM) + return ret; + } + + /* If the filesystem doesn't take any arguments, give it the + * default handling of source. + */ + if (strcmp(param->key, "source") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Non-string source"); + if (fc->source) + return invalf(fc, "VFS: Multiple sources"); + fc->source = param->string; + param->string = NULL; + return 0; + } + + return invalf(fc, "%s: Unknown parameter '%s'", + fc->fs_type->name, param->key); +} +EXPORT_SYMBOL(vfs_parse_fs_param); + +/** + * vfs_parse_fs_string - Convenience function to just parse a string. + */ +int vfs_parse_fs_string(struct fs_context *fc, const char *key, + const char *value, size_t v_size) +{ + int ret; + + struct fs_parameter param = { + .key = key, + .type = fs_value_is_string, + .size = v_size, + }; + + if (v_size > 0) { + param.string = kmemdup_nul(value, v_size, GFP_KERNEL); + if (!param.string) + return -ENOMEM; + } + + ret = vfs_parse_fs_param(fc, &param); + kfree(param.string); + return ret; +} +EXPORT_SYMBOL(vfs_parse_fs_string); + +/** + * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data + * @ctx: The superblock configuration to fill in. + * @data: The data to parse + * + * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be + * called from the ->monolithic_mount_data() fs_context operation. + * + * Returns 0 on success or the error returned by the ->parse_option() fs_context + * operation on failure. + */ +int generic_parse_monolithic(struct fs_context *fc, void *data) +{ + char *options = data, *key; + int ret = 0; + + if (!options) + return 0; + + ret = security_sb_eat_lsm_opts(options, &fc->security); + if (ret) + return ret; + + while ((key = strsep(&options, ",")) != NULL) { + if (*key) { + size_t v_len = 0; + char *value = strchr(key, '='); + + if (value) { + if (value == key) + continue; + *value++ = 0; + v_len = strlen(value); + } + ret = vfs_parse_fs_string(fc, key, value, v_len); + if (ret < 0) + break; + } + } + + return ret; +} +EXPORT_SYMBOL(generic_parse_monolithic); + +/** + * alloc_fs_context - Create a filesystem context. + * @fs_type: The filesystem type. + * @reference: The dentry from which this one derives (or NULL) + * @sb_flags: Filesystem/superblock flags (SB_*) + * @sb_flags_mask: Applicable members of @sb_flags + * @purpose: The purpose that this configuration shall be used for. + * + * Open a filesystem and create a mount context. The mount context is + * initialised with the supplied flags and, if a submount/automount from + * another superblock (referred to by @reference) is supplied, may have + * parameters such as namespaces copied across from that superblock. + */ +static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, + struct dentry *reference, + unsigned int sb_flags, + unsigned int sb_flags_mask, + enum fs_context_purpose purpose) +{ + int (*init_fs_context)(struct fs_context *); + struct fs_context *fc; + int ret = -ENOMEM; + + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + if (!fc) + return ERR_PTR(-ENOMEM); + + fc->purpose = purpose; + fc->sb_flags = sb_flags; + fc->sb_flags_mask = sb_flags_mask; + fc->fs_type = get_filesystem(fs_type); + fc->cred = get_current_cred(); + fc->net_ns = get_net(current->nsproxy->net_ns); + + switch (purpose) { + case FS_CONTEXT_FOR_MOUNT: + fc->user_ns = get_user_ns(fc->cred->user_ns); + break; + case FS_CONTEXT_FOR_SUBMOUNT: + fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); + break; + case FS_CONTEXT_FOR_RECONFIGURE: + /* We don't pin any namespaces as the superblock's + * subscriptions cannot be changed at this point. + */ + atomic_inc(&reference->d_sb->s_active); + fc->root = dget(reference); + break; + } + + /* TODO: Make all filesystems support this unconditionally */ + init_fs_context = fc->fs_type->init_fs_context; + if (!init_fs_context) + init_fs_context = legacy_init_fs_context; + + ret = init_fs_context(fc); + if (ret < 0) + goto err_fc; + fc->need_free = true; + return fc; + +err_fc: + put_fs_context(fc); + return ERR_PTR(ret); +} + +struct fs_context *fs_context_for_mount(struct file_system_type *fs_type, + unsigned int sb_flags) +{ + return alloc_fs_context(fs_type, NULL, sb_flags, 0, + FS_CONTEXT_FOR_MOUNT); +} +EXPORT_SYMBOL(fs_context_for_mount); + +struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, + unsigned int sb_flags, + unsigned int sb_flags_mask) +{ + return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags, + sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE); +} +EXPORT_SYMBOL(fs_context_for_reconfigure); + +struct fs_context *fs_context_for_submount(struct file_system_type *type, + struct dentry *reference) +{ + return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); +} +EXPORT_SYMBOL(fs_context_for_submount); + +void fc_drop_locked(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + dput(fc->root); + fc->root = NULL; + deactivate_locked_super(sb); +} + +static void legacy_fs_context_free(struct fs_context *fc); + +/** + * vfs_dup_fc_config: Duplicate a filesystem context. + * @src_fc: The context to copy. + */ +struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) +{ + struct fs_context *fc; + int ret; + + if (!src_fc->ops->dup) + return ERR_PTR(-EOPNOTSUPP); + + fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL); + if (!fc) + return ERR_PTR(-ENOMEM); + + fc->fs_private = NULL; + fc->s_fs_info = NULL; + fc->source = NULL; + fc->security = NULL; + get_filesystem(fc->fs_type); + get_net(fc->net_ns); + get_user_ns(fc->user_ns); + get_cred(fc->cred); + + /* Can't call put until we've called ->dup */ + ret = fc->ops->dup(fc, src_fc); + if (ret < 0) + goto err_fc; + + ret = security_fs_context_dup(fc, src_fc); + if (ret < 0) + goto err_fc; + return fc; + +err_fc: + put_fs_context(fc); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(vfs_dup_fs_context); + +#ifdef CONFIG_PRINTK +/** + * logfc - Log a message to a filesystem context + * @fc: The filesystem context to log to. + * @fmt: The format of the buffer. + */ +void logfc(struct fs_context *fc, const char *fmt, ...) +{ + va_list va; + + va_start(va, fmt); + + switch (fmt[0]) { + case 'w': + vprintk_emit(0, LOGLEVEL_WARNING, NULL, 0, fmt, va); + break; + case 'e': + vprintk_emit(0, LOGLEVEL_ERR, NULL, 0, fmt, va); + break; + default: + vprintk_emit(0, LOGLEVEL_NOTICE, NULL, 0, fmt, va); + break; + } + + pr_cont("\n"); + va_end(va); +} +EXPORT_SYMBOL(logfc); +#endif + +/** + * put_fs_context - Dispose of a superblock configuration context. + * @fc: The context to dispose of. + */ +void put_fs_context(struct fs_context *fc) +{ + struct super_block *sb; + + if (fc->root) { + sb = fc->root->d_sb; + dput(fc->root); + fc->root = NULL; + deactivate_super(sb); + } + + if (fc->need_free && fc->ops && fc->ops->free) + fc->ops->free(fc); + + security_free_mnt_opts(&fc->security); + put_net(fc->net_ns); + put_user_ns(fc->user_ns); + put_cred(fc->cred); + kfree(fc->subtype); + put_filesystem(fc->fs_type); + kfree(fc->source); + kfree(fc); +} +EXPORT_SYMBOL(put_fs_context); + +/* + * Free the config for a filesystem that doesn't support fs_context. + */ +static void legacy_fs_context_free(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + + if (ctx) { + if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) + kfree(ctx->legacy_data); + kfree(ctx); + } +} + +/* + * Duplicate a legacy config. + */ +static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) +{ + struct legacy_fs_context *ctx; + struct legacy_fs_context *src_ctx = src_fc->fs_private; + + ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) { + ctx->legacy_data = kmemdup(src_ctx->legacy_data, + src_ctx->data_size, GFP_KERNEL); + if (!ctx->legacy_data) { + kfree(ctx); + return -ENOMEM; + } + } + + fc->fs_private = ctx; + return 0; +} + +/* + * Add a parameter to a legacy config. We build up a comma-separated list of + * options. + */ +static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct legacy_fs_context *ctx = fc->fs_private; + unsigned int size = ctx->data_size; + size_t len = 0; + + if (strcmp(param->key, "source") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Legacy: Non-string source"); + if (fc->source) + return invalf(fc, "VFS: Legacy: Multiple sources"); + fc->source = param->string; + param->string = NULL; + return 0; + } + + if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) && + strcmp(param->key, "subtype") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Legacy: Non-string subtype"); + if (fc->subtype) + return invalf(fc, "VFS: Legacy: Multiple subtype"); + fc->subtype = param->string; + param->string = NULL; + return 0; + } + + if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) + return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); + + switch (param->type) { + case fs_value_is_string: + len = 1 + param->size; + /* Fall through */ + case fs_value_is_flag: + len += strlen(param->key); + break; + default: + return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported", + param->key); + } + + if (len > PAGE_SIZE - 2 - size) + return invalf(fc, "VFS: Legacy: Cumulative options too large"); + if (strchr(param->key, ',') || + (param->type == fs_value_is_string && + memchr(param->string, ',', param->size))) + return invalf(fc, "VFS: Legacy: Option '%s' contained comma", + param->key); + if (!ctx->legacy_data) { + ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!ctx->legacy_data) + return -ENOMEM; + } + + ctx->legacy_data[size++] = ','; + len = strlen(param->key); + memcpy(ctx->legacy_data + size, param->key, len); + size += len; + if (param->type == fs_value_is_string) { + ctx->legacy_data[size++] = '='; + memcpy(ctx->legacy_data + size, param->string, param->size); + size += param->size; + } + ctx->legacy_data[size] = '\0'; + ctx->data_size = size; + ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS; + return 0; +} + +/* + * Add monolithic mount data. + */ +static int legacy_parse_monolithic(struct fs_context *fc, void *data) +{ + struct legacy_fs_context *ctx = fc->fs_private; + + if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) { + pr_warn("VFS: Can't mix monolithic and individual options\n"); + return -EINVAL; + } + + ctx->legacy_data = data; + ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS; + if (!ctx->legacy_data) + return 0; + + if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA) + return 0; + return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security); +} + +/* + * Get a mountable root with the legacy mount command. + */ +static int legacy_get_tree(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + struct super_block *sb; + struct dentry *root; + + root = fc->fs_type->mount(fc->fs_type, fc->sb_flags, + fc->source, ctx->legacy_data); + if (IS_ERR(root)) + return PTR_ERR(root); + + sb = root->d_sb; + BUG_ON(!sb); + + fc->root = root; + return 0; +} + +/* + * Handle remount. + */ +static int legacy_reconfigure(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; + + if (!sb->s_op->remount_fs) + return 0; + + return sb->s_op->remount_fs(sb, &fc->sb_flags, + ctx ? ctx->legacy_data : NULL); +} + +const struct fs_context_operations legacy_fs_context_ops = { + .free = legacy_fs_context_free, + .dup = legacy_fs_context_dup, + .parse_param = legacy_parse_param, + .parse_monolithic = legacy_parse_monolithic, + .get_tree = legacy_get_tree, + .reconfigure = legacy_reconfigure, +}; + +/* + * Initialise a legacy context for a filesystem that doesn't support + * fs_context. + */ +static int legacy_init_fs_context(struct fs_context *fc) +{ + fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL); + if (!fc->fs_private) + return -ENOMEM; + fc->ops = &legacy_fs_context_ops; + return 0; +} + +int parse_monolithic_mount_data(struct fs_context *fc, void *data) +{ + int (*monolithic_mount_data)(struct fs_context *, void *); + + monolithic_mount_data = fc->ops->parse_monolithic; + if (!monolithic_mount_data) + monolithic_mount_data = generic_parse_monolithic; + + return monolithic_mount_data(fc, data); +} diff --git a/fs/fs_parser.c b/fs/fs_parser.c @@ -0,0 +1,447 @@ +/* Filesystem parameter parser. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/export.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/slab.h> +#include <linux/security.h> +#include <linux/namei.h> +#include "internal.h" + +static const struct constant_table bool_names[] = { + { "0", false }, + { "1", true }, + { "false", false }, + { "no", false }, + { "true", true }, + { "yes", true }, +}; + +/** + * lookup_constant - Look up a constant by name in an ordered table + * @tbl: The table of constants to search. + * @tbl_size: The size of the table. + * @name: The name to look up. + * @not_found: The value to return if the name is not found. + */ +int __lookup_constant(const struct constant_table *tbl, size_t tbl_size, + const char *name, int not_found) +{ + unsigned int i; + + for (i = 0; i < tbl_size; i++) + if (strcmp(name, tbl[i].name) == 0) + return tbl[i].value; + + return not_found; +} +EXPORT_SYMBOL(__lookup_constant); + +static const struct fs_parameter_spec *fs_lookup_key( + const struct fs_parameter_description *desc, + const char *name) +{ + const struct fs_parameter_spec *p; + + if (!desc->specs) + return NULL; + + for (p = desc->specs; p->name; p++) + if (strcmp(p->name, name) == 0) + return p; + + return NULL; +} + +/* + * fs_parse - Parse a filesystem configuration parameter + * @fc: The filesystem context to log errors through. + * @desc: The parameter description to use. + * @param: The parameter. + * @result: Where to place the result of the parse + * + * Parse a filesystem configuration parameter and attempt a conversion for a + * simple parameter for which this is requested. If successful, the determined + * parameter ID is placed into @result->key, the desired type is indicated in + * @result->t and any converted value is placed into an appropriate member of + * the union in @result. + * + * The function returns the parameter number if the parameter was matched, + * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that + * unknown parameters are okay and -EINVAL if there was a conversion issue or + * the parameter wasn't recognised and unknowns aren't okay. + */ +int fs_parse(struct fs_context *fc, + const struct fs_parameter_description *desc, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + const struct fs_parameter_spec *p; + const struct fs_parameter_enum *e; + int ret = -ENOPARAM, b; + + result->has_value = !!param->string; + result->negated = false; + result->uint_64 = 0; + + p = fs_lookup_key(desc, param->key); + if (!p) { + /* If we didn't find something that looks like "noxxx", see if + * "xxx" takes the "no"-form negative - but only if there + * wasn't an value. + */ + if (result->has_value) + goto unknown_parameter; + if (param->key[0] != 'n' || param->key[1] != 'o' || !param->key[2]) + goto unknown_parameter; + + p = fs_lookup_key(desc, param->key + 2); + if (!p) + goto unknown_parameter; + if (!(p->flags & fs_param_neg_with_no)) + goto unknown_parameter; + result->boolean = false; + result->negated = true; + } + + if (p->flags & fs_param_deprecated) + warnf(fc, "%s: Deprecated parameter '%s'", + desc->name, param->key); + + if (result->negated) + goto okay; + + /* Certain parameter types only take a string and convert it. */ + switch (p->type) { + case __fs_param_wasnt_defined: + return -EINVAL; + case fs_param_is_u32: + case fs_param_is_u32_octal: + case fs_param_is_u32_hex: + case fs_param_is_s32: + case fs_param_is_u64: + case fs_param_is_enum: + case fs_param_is_string: + if (param->type != fs_value_is_string) + goto bad_value; + if (!result->has_value) { + if (p->flags & fs_param_v_optional) + goto okay; + goto bad_value; + } + /* Fall through */ + default: + break; + } + + /* Try to turn the type we were given into the type desired by the + * parameter and give an error if we can't. + */ + switch (p->type) { + case fs_param_is_flag: + if (param->type != fs_value_is_flag && + (param->type != fs_value_is_string || result->has_value)) + return invalf(fc, "%s: Unexpected value for '%s'", + desc->name, param->key); + result->boolean = true; + goto okay; + + case fs_param_is_bool: + switch (param->type) { + case fs_value_is_flag: + result->boolean = true; + goto okay; + case fs_value_is_string: + if (param->size == 0) { + result->boolean = true; + goto okay; + } + b = lookup_constant(bool_names, param->string, -1); + if (b == -1) + goto bad_value; + result->boolean = b; + goto okay; + default: + goto bad_value; + } + + case fs_param_is_u32: + ret = kstrtouint(param->string, 0, &result->uint_32); + goto maybe_okay; + case fs_param_is_u32_octal: + ret = kstrtouint(param->string, 8, &result->uint_32); + goto maybe_okay; + case fs_param_is_u32_hex: + ret = kstrtouint(param->string, 16, &result->uint_32); + goto maybe_okay; + case fs_param_is_s32: + ret = kstrtoint(param->string, 0, &result->int_32); + goto maybe_okay; + case fs_param_is_u64: + ret = kstrtoull(param->string, 0, &result->uint_64); + goto maybe_okay; + + case fs_param_is_enum: + for (e = desc->enums; e->name[0]; e++) { + if (e->opt == p->opt && + strcmp(e->name, param->string) == 0) { + result->uint_32 = e->value; + goto okay; + } + } + goto bad_value; + + case fs_param_is_string: + goto okay; + case fs_param_is_blob: + if (param->type != fs_value_is_blob) + goto bad_value; + goto okay; + + case fs_param_is_fd: { + if (param->type != fs_value_is_file) + goto bad_value; + goto okay; + } + + case fs_param_is_blockdev: + case fs_param_is_path: + goto okay; + default: + BUG(); + } + +maybe_okay: + if (ret < 0) + goto bad_value; +okay: + return p->opt; + +bad_value: + return invalf(fc, "%s: Bad value for '%s'", desc->name, param->key); +unknown_parameter: + return -ENOPARAM; +} +EXPORT_SYMBOL(fs_parse); + +/** + * fs_lookup_param - Look up a path referred to by a parameter + * @fc: The filesystem context to log errors through. + * @param: The parameter. + * @want_bdev: T if want a blockdev + * @_path: The result of the lookup + */ +int fs_lookup_param(struct fs_context *fc, + struct fs_parameter *param, + bool want_bdev, + struct path *_path) +{ + struct filename *f; + unsigned int flags = 0; + bool put_f; + int ret; + + switch (param->type) { + case fs_value_is_string: + f = getname_kernel(param->string); + if (IS_ERR(f)) + return PTR_ERR(f); + put_f = true; + break; + case fs_value_is_filename_empty: + flags = LOOKUP_EMPTY; + /* Fall through */ + case fs_value_is_filename: + f = param->name; + put_f = false; + break; + default: + return invalf(fc, "%s: not usable as path", param->key); + } + + ret = filename_lookup(param->dirfd, f, flags, _path, NULL); + if (ret < 0) { + errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name); + goto out; + } + + if (want_bdev && + !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) { + path_put(_path); + _path->dentry = NULL; + _path->mnt = NULL; + errorf(fc, "%s: Non-blockdev passed as '%s'", + param->key, f->name); + ret = -ENOTBLK; + } + +out: + if (put_f) + putname(f); + return ret; +} +EXPORT_SYMBOL(fs_lookup_param); + +#ifdef CONFIG_VALIDATE_FS_PARSER +/** + * validate_constant_table - Validate a constant table + * @name: Name to use in reporting + * @tbl: The constant table to validate. + * @tbl_size: The size of the table. + * @low: The lowest permissible value. + * @high: The highest permissible value. + * @special: One special permissible value outside of the range. + */ +bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, + int low, int high, int special) +{ + size_t i; + bool good = true; + + if (tbl_size == 0) { + pr_warn("VALIDATE C-TBL: Empty\n"); + return true; + } + + for (i = 0; i < tbl_size; i++) { + if (!tbl[i].name) { + pr_err("VALIDATE C-TBL[%zu]: Null\n", i); + good = false; + } else if (i > 0 && tbl[i - 1].name) { + int c = strcmp(tbl[i-1].name, tbl[i].name); + + if (c == 0) { + pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n", + i, tbl[i].name); + good = false; + } + if (c > 0) { + pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n", + i, tbl[i-1].name, tbl[i].name); + good = false; + } + } + + if (tbl[i].value != special && + (tbl[i].value < low || tbl[i].value > high)) { + pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n", + i, tbl[i].name, tbl[i].value, low, high); + good = false; + } + } + + return good; +} + +/** + * fs_validate_description - Validate a parameter description + * @desc: The parameter description to validate. + */ +bool fs_validate_description(const struct fs_parameter_description *desc) +{ + const struct fs_parameter_spec *param, *p2; + const struct fs_parameter_enum *e; + const char *name = desc->name; + unsigned int nr_params = 0; + bool good = true, enums = false; + + pr_notice("*** VALIDATE %s ***\n", name); + + if (!name[0]) { + pr_err("VALIDATE Parser: No name\n"); + name = "Unknown"; + good = false; + } + + if (desc->specs) { + for (param = desc->specs; param->name; param++) { + enum fs_parameter_type t = param->type; + + /* Check that the type is in range */ + if (t == __fs_param_wasnt_defined || + t >= nr__fs_parameter_type) { + pr_err("VALIDATE %s: PARAM[%s] Bad type %u\n", + name, param->name, t); + good = false; + } else if (t == fs_param_is_enum) { + enums = true; + } + + /* Check for duplicate parameter names */ + for (p2 = desc->specs; p2 < param; p2++) { + if (strcmp(param->name, p2->name) == 0) { + pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n", + name, param->name); + good = false; + } + } + } + + nr_params = param - desc->specs; + } + + if (desc->enums) { + if (!nr_params) { + pr_err("VALIDATE %s: Enum table but no parameters\n", + name); + good = false; + goto no_enums; + } + if (!enums) { + pr_err("VALIDATE %s: Enum table but no enum-type values\n", + name); + good = false; + goto no_enums; + } + + for (e = desc->enums; e->name[0]; e++) { + /* Check that all entries in the enum table have at + * least one parameter that uses them. + */ + for (param = desc->specs; param->name; param++) { + if (param->opt == e->opt && + param->type != fs_param_is_enum) { + pr_err("VALIDATE %s: e[%lu] enum val for %s\n", + name, e - desc->enums, param->name); + good = false; + } + } + } + + /* Check that all enum-type parameters have at least one enum + * value in the enum table. + */ + for (param = desc->specs; param->name; param++) { + if (param->type != fs_param_is_enum) + continue; + for (e = desc->enums; e->name[0]; e++) + if (e->opt == param->opt) + break; + if (!e->name[0]) { + pr_err("VALIDATE %s: PARAM[%s] enum with no values\n", + name, param->name); + good = false; + } + } + } else { + if (enums) { + pr_err("VALIDATE %s: enum-type values, but no enum table\n", + name); + good = false; + goto no_enums; + } + } + +no_enums: + return good; +} +#endif /* CONFIG_VALIDATE_FS_PARSER */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c @@ -27,7 +27,7 @@ #include <linux/backing-dev.h> #include <linux/hugetlb.h> #include <linux/pagevec.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/dnotify.h> @@ -45,11 +45,17 @@ const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; -struct hugetlbfs_config { +enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; + +struct hugetlbfs_fs_context { struct hstate *hstate; + unsigned long long max_size_opt; + unsigned long long min_size_opt; long max_hpages; long nr_inodes; long min_hpages; + enum hugetlbfs_size_type max_val_type; + enum hugetlbfs_size_type min_val_type; kuid_t uid; kgid_t gid; umode_t mode; @@ -57,22 +63,30 @@ struct hugetlbfs_config { int sysctl_hugetlb_shm_group; -enum { - Opt_size, Opt_nr_inodes, - Opt_mode, Opt_uid, Opt_gid, - Opt_pagesize, Opt_min_size, - Opt_err, +enum hugetlb_param { + Opt_gid, + Opt_min_size, + Opt_mode, + Opt_nr_inodes, + Opt_pagesize, + Opt_size, + Opt_uid, }; -static const match_table_t tokens = { - {Opt_size, "size=%s"}, - {Opt_nr_inodes, "nr_inodes=%s"}, - {Opt_mode, "mode=%o"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_pagesize, "pagesize=%s"}, - {Opt_min_size, "min_size=%s"}, - {Opt_err, NULL}, +static const struct fs_parameter_spec hugetlb_param_specs[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_string("min_size", Opt_min_size), + fsparam_u32 ("mode", Opt_mode), + fsparam_string("nr_inodes", Opt_nr_inodes), + fsparam_string("pagesize", Opt_pagesize), + fsparam_string("size", Opt_size), + fsparam_u32 ("uid", Opt_uid), + {} +}; + +static const struct fs_parameter_description hugetlb_fs_parameters = { + .name = "hugetlbfs", + .specs = hugetlb_param_specs, }; #ifdef CONFIG_NUMA @@ -708,16 +722,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) } static struct inode *hugetlbfs_get_root(struct super_block *sb, - struct hugetlbfs_config *config) + struct hugetlbfs_fs_context *ctx) { struct inode *inode; inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_mode = S_IFDIR | config->mode; - inode->i_uid = config->uid; - inode->i_gid = config->gid; + inode->i_mode = S_IFDIR | ctx->mode; + inode->i_uid = ctx->uid; + inode->i_gid = ctx->gid; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -1093,8 +1107,6 @@ static const struct super_operations hugetlbfs_ops = { .show_options = hugetlbfs_show_options, }; -enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; - /* * Convert size option passed from command line to number of huge pages * in the pool specified by hstate. Size option could be in bytes @@ -1117,170 +1129,151 @@ hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, return size_opt; } -static int -hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) +/* + * Parse one mount parameter. + */ +static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p, *rest; - substring_t args[MAX_OPT_ARGS]; - int option; - unsigned long long max_size_opt = 0, min_size_opt = 0; - enum hugetlbfs_size_type max_val_type = NO_SIZE, min_val_type = NO_SIZE; - - if (!options) + struct hugetlbfs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + char *rest; + unsigned long ps; + int opt; + + opt = fs_parse(fc, &hugetlb_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + ctx->uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(ctx->uid)) + goto bad_val; return 0; - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; + case Opt_gid: + ctx->gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(ctx->gid)) + goto bad_val; + return 0; - token = match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - goto bad_val; - pconfig->uid = make_kuid(current_user_ns(), option); - if (!uid_valid(pconfig->uid)) - goto bad_val; - break; + case Opt_mode: + ctx->mode = result.uint_32 & 01777U; + return 0; - case Opt_gid: - if (match_int(&args[0], &option)) - goto bad_val; - pconfig->gid = make_kgid(current_user_ns(), option); - if (!gid_valid(pconfig->gid)) - goto bad_val; - break; + case Opt_size: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->max_size_opt = memparse(param->string, &rest); + ctx->max_val_type = SIZE_STD; + if (*rest == '%') + ctx->max_val_type = SIZE_PERCENT; + return 0; - case Opt_mode: - if (match_octal(&args[0], &option)) - goto bad_val; - pconfig->mode = option & 01777U; - break; + case Opt_nr_inodes: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->nr_inodes = memparse(param->string, &rest); + return 0; - case Opt_size: { - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - max_size_opt = memparse(args[0].from, &rest); - max_val_type = SIZE_STD; - if (*rest == '%') - max_val_type = SIZE_PERCENT; - break; + case Opt_pagesize: + ps = memparse(param->string, &rest); + ctx->hstate = size_to_hstate(ps); + if (!ctx->hstate) { + pr_err("Unsupported page size %lu MB\n", ps >> 20); + return -EINVAL; } + return 0; - case Opt_nr_inodes: - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - pconfig->nr_inodes = memparse(args[0].from, &rest); - break; + case Opt_min_size: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->min_size_opt = memparse(param->string, &rest); + ctx->min_val_type = SIZE_STD; + if (*rest == '%') + ctx->min_val_type = SIZE_PERCENT; + return 0; - case Opt_pagesize: { - unsigned long ps; - ps = memparse(args[0].from, &rest); - pconfig->hstate = size_to_hstate(ps); - if (!pconfig->hstate) { - pr_err("Unsupported page size %lu MB\n", - ps >> 20); - return -EINVAL; - } - break; - } + default: + return -EINVAL; + } - case Opt_min_size: { - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - min_size_opt = memparse(args[0].from, &rest); - min_val_type = SIZE_STD; - if (*rest == '%') - min_val_type = SIZE_PERCENT; - break; - } +bad_val: + return invalf(fc, "hugetlbfs: Bad value '%s' for mount option '%s'\n", + param->string, param->key); +} - default: - pr_err("Bad mount option: \"%s\"\n", p); - return -EINVAL; - break; - } - } +/* + * Validate the parsed options. + */ +static int hugetlbfs_validate(struct fs_context *fc) +{ + struct hugetlbfs_fs_context *ctx = fc->fs_private; /* * Use huge page pool size (in hstate) to convert the size * options to number of huge pages. If NO_SIZE, -1 is returned. */ - pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, - max_size_opt, max_val_type); - pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, - min_size_opt, min_val_type); + ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate, + ctx->max_size_opt, + ctx->max_val_type); + ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate, + ctx->min_size_opt, + ctx->min_val_type); /* * If max_size was specified, then min_size must be smaller */ - if (max_val_type > NO_SIZE && - pconfig->min_hpages > pconfig->max_hpages) { - pr_err("minimum size can not be greater than maximum size\n"); + if (ctx->max_val_type > NO_SIZE && + ctx->min_hpages > ctx->max_hpages) { + pr_err("Minimum size can not be greater than maximum size\n"); return -EINVAL; } return 0; - -bad_val: - pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); - return -EINVAL; } static int -hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) +hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) { - int ret; - struct hugetlbfs_config config; + struct hugetlbfs_fs_context *ctx = fc->fs_private; struct hugetlbfs_sb_info *sbinfo; - config.max_hpages = -1; /* No limit on size by default */ - config.nr_inodes = -1; /* No limit on number of inodes by default */ - config.uid = current_fsuid(); - config.gid = current_fsgid(); - config.mode = 0755; - config.hstate = &default_hstate; - config.min_hpages = -1; /* No default minimum size */ - ret = hugetlbfs_parse_options(data, &config); - if (ret) - return ret; - sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); if (!sbinfo) return -ENOMEM; sb->s_fs_info = sbinfo; - sbinfo->hstate = config.hstate; spin_lock_init(&sbinfo->stat_lock); - sbinfo->max_inodes = config.nr_inodes; - sbinfo->free_inodes = config.nr_inodes; - sbinfo->spool = NULL; - sbinfo->uid = config.uid; - sbinfo->gid = config.gid; - sbinfo->mode = config.mode; + sbinfo->hstate = ctx->hstate; + sbinfo->max_inodes = ctx->nr_inodes; + sbinfo->free_inodes = ctx->nr_inodes; + sbinfo->spool = NULL; + sbinfo->uid = ctx->uid; + sbinfo->gid = ctx->gid; + sbinfo->mode = ctx->mode; /* * Allocate and initialize subpool if maximum or minimum size is * specified. Any needed reservations (for minimim size) are taken * taken when the subpool is created. */ - if (config.max_hpages != -1 || config.min_hpages != -1) { - sbinfo->spool = hugepage_new_subpool(config.hstate, - config.max_hpages, - config.min_hpages); + if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { + sbinfo->spool = hugepage_new_subpool(ctx->hstate, + ctx->max_hpages, + ctx->min_hpages); if (!sbinfo->spool) goto out_free; } sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = huge_page_size(config.hstate); - sb->s_blocksize_bits = huge_page_shift(config.hstate); + sb->s_blocksize = huge_page_size(ctx->hstate); + sb->s_blocksize_bits = huge_page_shift(ctx->hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; - sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); + sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx)); if (!sb->s_root) goto out_free; return 0; @@ -1290,16 +1283,52 @@ out_free: return -ENOMEM; } -static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hugetlbfs_get_tree(struct fs_context *fc) +{ + int err = hugetlbfs_validate(fc); + if (err) + return err; + return vfs_get_super(fc, vfs_get_independent_super, hugetlbfs_fill_super); +} + +static void hugetlbfs_fs_context_free(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations hugetlbfs_fs_context_ops = { + .free = hugetlbfs_fs_context_free, + .parse_param = hugetlbfs_parse_param, + .get_tree = hugetlbfs_get_tree, +}; + +static int hugetlbfs_init_fs_context(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); + struct hugetlbfs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max_hpages = -1; /* No limit on size by default */ + ctx->nr_inodes = -1; /* No limit on number of inodes by default */ + ctx->uid = current_fsuid(); + ctx->gid = current_fsgid(); + ctx->mode = 0755; + ctx->hstate = &default_hstate; + ctx->min_hpages = -1; /* No default minimum size */ + ctx->max_val_type = NO_SIZE; + ctx->min_val_type = NO_SIZE; + fc->fs_private = ctx; + fc->ops = &hugetlbfs_fs_context_ops; + return 0; } static struct file_system_type hugetlbfs_fs_type = { - .name = "hugetlbfs", - .mount = hugetlbfs_mount, - .kill_sb = kill_litter_super, + .name = "hugetlbfs", + .init_fs_context = hugetlbfs_init_fs_context, + .parameters = &hugetlb_fs_parameters, + .kill_sb = kill_litter_super, }; static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1384,8 +1413,29 @@ out: return file; } +static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) +{ + struct fs_context *fc; + struct vfsmount *mnt; + + fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT); + if (IS_ERR(fc)) { + mnt = ERR_CAST(fc); + } else { + struct hugetlbfs_fs_context *ctx = fc->fs_private; + ctx->hstate = h; + mnt = fc_mount(fc); + put_fs_context(fc); + } + if (IS_ERR(mnt)) + pr_err("Cannot mount internal hugetlbfs for page size %uK", + 1U << (h->order + PAGE_SHIFT - 10)); + return mnt; +} + static int __init init_hugetlbfs_fs(void) { + struct vfsmount *mnt; struct hstate *h; int error; int i; @@ -1408,24 +1458,16 @@ static int __init init_hugetlbfs_fs(void) i = 0; for_each_hstate(h) { - char buf[50]; - unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); - - snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); - hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, - buf); - - if (IS_ERR(hugetlbfs_vfsmount[i])) { - pr_err("Cannot mount internal hugetlbfs for " - "page size %uK", ps_kb); - error = PTR_ERR(hugetlbfs_vfsmount[i]); - hugetlbfs_vfsmount[i] = NULL; + mnt = mount_one_hugetlbfs(h); + if (IS_ERR(mnt) && i == 0) { + error = PTR_ERR(mnt); + goto out; } + hugetlbfs_vfsmount[i] = mnt; i++; } - /* Non default hstates are optional */ - if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) - return 0; + + return 0; out: kmem_cache_destroy(hugetlbfs_inode_cachep); diff --git a/fs/internal.h b/fs/internal.h @@ -17,6 +17,7 @@ struct linux_binprm; struct path; struct mount; struct shrink_control; +struct fs_context; /* * block_dev.c @@ -52,8 +53,16 @@ int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, extern void __init chrdev_init(void); /* + * fs_context.c + */ +extern int parse_monolithic_mount_data(struct fs_context *, void *); +extern void fc_drop_locked(struct fs_context *); + +/* * namei.c */ +extern int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); @@ -99,10 +108,8 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *); /* * super.c */ -extern int do_remount_sb(struct super_block *, int, void *, int); +extern int reconfigure_super(struct fs_context *); extern bool trylock_super(struct super_block *sb); -extern struct dentry *mount_fs(struct file_system_type *, - int, const char *, void *); extern struct super_block *user_get_super(dev_t); /* diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h @@ -17,6 +17,7 @@ #include <linux/xattr.h> #include <linux/kernfs.h> +#include <linux/fs_context.h> struct kernfs_iattrs { struct iattr ia_iattr; diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c @@ -22,16 +22,6 @@ struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; -static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct kernfs_root *root = kernfs_info(sb)->root; - struct kernfs_syscall_ops *scops = root->syscall_ops; - - if (scops && scops->remount_fs) - return scops->remount_fs(root, flags, data); - return 0; -} - static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); @@ -60,7 +50,6 @@ const struct super_operations kernfs_sops = { .drop_inode = generic_delete_inode, .evict_inode = kernfs_evict_inode, - .remount_fs = kernfs_sop_remount_fs, .show_options = kernfs_sop_show_options, .show_path = kernfs_sop_show_path, }; @@ -222,7 +211,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, } while (true); } -static int kernfs_fill_super(struct super_block *sb, unsigned long magic) +static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; @@ -233,7 +222,7 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - sb->s_magic = magic; + sb->s_magic = kfc->magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) @@ -263,21 +252,20 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) return 0; } -static int kernfs_test_super(struct super_block *sb, void *data) +static int kernfs_test_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_super_info *sb_info = kernfs_info(sb); - struct kernfs_super_info *info = data; + struct kernfs_super_info *info = fc->s_fs_info; return sb_info->root == info->root && sb_info->ns == info->ns; } -static int kernfs_set_super(struct super_block *sb, void *data) +static int kernfs_set_super(struct super_block *sb, struct fs_context *fc) { - int error; - error = set_anon_super(sb, data); - if (!error) - sb->s_fs_info = data; - return error; + struct kernfs_fs_context *kfc = fc->fs_private; + + kfc->ns_tag = NULL; + return set_anon_super_fc(sb, fc); } /** @@ -294,63 +282,60 @@ const void *kernfs_super_ns(struct super_block *sb) } /** - * kernfs_mount_ns - kernfs mount helper - * @fs_type: file_system_type of the fs being mounted - * @flags: mount flags specified for the mount - * @root: kernfs_root of the hierarchy being mounted - * @magic: file system specific magic number - * @new_sb_created: tell the caller if we allocated a new superblock - * @ns: optional namespace tag of the mount + * kernfs_get_tree - kernfs filesystem access/retrieval helper + * @fc: The filesystem context. * - * This is to be called from each kernfs user's file_system_type->mount() - * implementation, which should pass through the specified @fs_type and - * @flags, and specify the hierarchy and namespace tag to mount via @root - * and @ns, respectively. - * - * The return value can be passed to the vfs layer verbatim. + * This is to be called from each kernfs user's fs_context->ops->get_tree() + * implementation, which should set the specified ->@fs_type and ->@flags, and + * specify the hierarchy and namespace tag to mount via ->@root and ->@ns, + * respectively. */ -struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, unsigned long magic, - bool *new_sb_created, const void *ns) +int kernfs_get_tree(struct fs_context *fc) { + struct kernfs_fs_context *kfc = fc->fs_private; struct super_block *sb; struct kernfs_super_info *info; int error; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - info->root = root; - info->ns = ns; + info->root = kfc->root; + info->ns = kfc->ns_tag; INIT_LIST_HEAD(&info->node); - sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, - &init_user_ns, info); - if (IS_ERR(sb) || sb->s_fs_info != info) - kfree(info); + fc->s_fs_info = info; + sb = sget_fc(fc, kernfs_test_super, kernfs_set_super); if (IS_ERR(sb)) - return ERR_CAST(sb); - - if (new_sb_created) - *new_sb_created = !sb->s_root; + return PTR_ERR(sb); if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); - error = kernfs_fill_super(sb, magic); + kfc->new_sb_created = true; + + error = kernfs_fill_super(sb, kfc); if (error) { deactivate_locked_super(sb); - return ERR_PTR(error); + return error; } sb->s_flags |= SB_ACTIVE; mutex_lock(&kernfs_mutex); - list_add(&info->node, &root->supers); + list_add(&info->node, &info->root->supers); mutex_unlock(&kernfs_mutex); } - return dget(sb->s_root); + fc->root = dget(sb->s_root); + return 0; +} + +void kernfs_free_fs_context(struct fs_context *fc) +{ + /* Note that we don't deal with kfc->ns_tag here. */ + kfree(fc->s_fs_info); + fc->s_fs_info = NULL; } /** @@ -377,36 +362,6 @@ void kernfs_kill_sb(struct super_block *sb) kfree(info); } -/** - * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root - * @kernfs_root: the kernfs_root in question - * @ns: the namespace tag - * - * Pin the superblock so the superblock won't be destroyed in subsequent - * operations. This can be used to block ->kill_sb() which may be useful - * for kernfs users which dynamically manage superblocks. - * - * Returns NULL if there's no superblock associated to this kernfs_root, or - * -EINVAL if the superblock is being freed. - */ -struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) -{ - struct kernfs_super_info *info; - struct super_block *sb = NULL; - - mutex_lock(&kernfs_mutex); - list_for_each_entry(info, &root->supers, node) { - if (info->ns == ns) { - sb = info->sb; - if (!atomic_inc_not_zero(&info->sb->s_active)) - sb = ERR_PTR(-EINVAL); - break; - } - } - mutex_unlock(&kernfs_mutex); - return sb; -} - void __init kernfs_init(void) { diff --git a/fs/mount.h b/fs/mount.h @@ -146,3 +146,8 @@ static inline bool is_local_mountpoint(struct dentry *dentry) return __is_local_mountpoint(dentry); } + +static inline bool is_anon_ns(struct mnt_namespace *ns) +{ + return ns->seq == 0; +} diff --git a/fs/namei.c b/fs/namei.c @@ -2331,8 +2331,8 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path return err; } -static int filename_lookup(int dfd, struct filename *name, unsigned flags, - struct path *path, struct path *root) +int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root) { int retval; struct nameidata nd; diff --git a/fs/namespace.c b/fs/namespace.c @@ -27,6 +27,7 @@ #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> +#include <linux/fs_context.h> #include "pnode.h" #include "internal.h" @@ -940,38 +941,81 @@ static struct mount *skip_mnt_tree(struct mount *p) return p; } -struct vfsmount * -vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) +/** + * vfs_create_mount - Create a mount for a configured superblock + * @fc: The configuration context with the superblock attached + * + * Create a mount to an already configured superblock. If necessary, the + * caller should invoke vfs_get_tree() before calling this. + * + * Note that this does not attach the mount to anything. + */ +struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; - struct dentry *root; - if (!type) - return ERR_PTR(-ENODEV); + if (!fc->root) + return ERR_PTR(-EINVAL); - mnt = alloc_vfsmnt(name); + mnt = alloc_vfsmnt(fc->source ?: "none"); if (!mnt) return ERR_PTR(-ENOMEM); - if (flags & SB_KERNMOUNT) + if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; - root = mount_fs(type, flags, name, data); - if (IS_ERR(root)) { - mnt_free_id(mnt); - free_vfsmnt(mnt); - return ERR_CAST(root); - } + atomic_inc(&fc->root->d_sb->s_active); + mnt->mnt.mnt_sb = fc->root->d_sb; + mnt->mnt.mnt_root = dget(fc->root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; - mnt->mnt.mnt_root = root; - mnt->mnt.mnt_sb = root->d_sb; - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); + list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); return &mnt->mnt; } +EXPORT_SYMBOL(vfs_create_mount); + +struct vfsmount *fc_mount(struct fs_context *fc) +{ + int err = vfs_get_tree(fc); + if (!err) { + up_write(&fc->root->d_sb->s_umount); + return vfs_create_mount(fc); + } + return ERR_PTR(err); +} +EXPORT_SYMBOL(fc_mount); + +struct vfsmount *vfs_kern_mount(struct file_system_type *type, + int flags, const char *name, + void *data) +{ + struct fs_context *fc; + struct vfsmount *mnt; + int ret = 0; + + if (!type) + return ERR_PTR(-EINVAL); + + fc = fs_context_for_mount(type, flags); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + if (name) + ret = vfs_parse_fs_string(fc, "source", + name, strlen(name)); + if (!ret) + ret = parse_monolithic_mount_data(fc, data); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + + put_fs_context(fc); + return mnt; +} EXPORT_SYMBOL_GPL(vfs_kern_mount); struct vfsmount * @@ -1013,27 +1057,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags = old->mnt.mnt_flags; mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); - /* Don't allow unprivileged users to change mount flags */ - if (flag & CL_UNPRIVILEGED) { - mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; - - if (mnt->mnt.mnt_flags & MNT_READONLY) - mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; - - if (mnt->mnt.mnt_flags & MNT_NODEV) - mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; - - if (mnt->mnt.mnt_flags & MNT_NOSUID) - mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; - - if (mnt->mnt.mnt_flags & MNT_NOEXEC) - mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; - } - - /* Don't allow unprivileged users to reveal what is under a mount */ - if ((flag & CL_UNPRIVILEGED) && - (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire))) - mnt->mnt.mnt_flags |= MNT_LOCKED; atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; @@ -1464,6 +1487,29 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) static void shrink_submounts(struct mount *mnt); +static int do_umount_root(struct super_block *sb) +{ + int ret = 0; + + down_write(&sb->s_umount); + if (!sb_rdonly(sb)) { + struct fs_context *fc; + + fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, + SB_RDONLY); + if (IS_ERR(fc)) { + ret = PTR_ERR(fc); + } else { + ret = parse_monolithic_mount_data(fc, NULL); + if (!ret) + ret = reconfigure_super(fc); + put_fs_context(fc); + } + } + up_write(&sb->s_umount); + return ret; +} + static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; @@ -1529,11 +1575,7 @@ static int do_umount(struct mount *mnt, int flags) */ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; - down_write(&sb->s_umount); - if (!sb_rdonly(sb)) - retval = do_remount_sb(sb, SB_RDONLY, NULL, 0); - up_write(&sb->s_umount); - return retval; + return do_umount_root(sb); } namespace_lock(); @@ -1839,6 +1881,33 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, return 0; } +static void lock_mnt_tree(struct mount *mnt) +{ + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) { + int flags = p->mnt.mnt_flags; + /* Don't allow unprivileged users to change mount flags */ + flags |= MNT_LOCK_ATIME; + + if (flags & MNT_READONLY) + flags |= MNT_LOCK_READONLY; + + if (flags & MNT_NODEV) + flags |= MNT_LOCK_NODEV; + + if (flags & MNT_NOSUID) + flags |= MNT_LOCK_NOSUID; + + if (flags & MNT_NOEXEC) + flags |= MNT_LOCK_NOEXEC; + /* Don't allow unprivileged users to reveal what is under a mount */ + if (list_empty(&p->mnt_expire)) + flags |= MNT_LOCKED; + p->mnt.mnt_flags = flags; + } +} + static void cleanup_group_ids(struct mount *mnt, struct mount *end) { struct mount *p; @@ -1956,6 +2025,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct mountpoint *dest_mp, struct path *parent_path) { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct mountpoint *smp; @@ -2006,6 +2076,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, child->mnt_mountpoint); if (q) mnt_change_mountpoint(child, smp, q); + /* Notice when we are propagating across user namespaces */ + if (child->mnt_parent->mnt_ns->user_ns != user_ns) + lock_mnt_tree(child); commit_tree(child); } put_mountpoint(smp); @@ -2313,7 +2386,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, int err; struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); - void *sec_opts = NULL; + struct fs_context *fc; if (!check_mnt(mnt)) return -EINVAL; @@ -2324,24 +2397,22 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; - if (data && !(sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)) { - err = security_sb_eat_lsm_opts(data, &sec_opts); - if (err) - return err; - } - err = security_sb_remount(sb, sec_opts); - security_free_mnt_opts(&sec_opts); - if (err) - return err; + fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); + if (IS_ERR(fc)) + return PTR_ERR(fc); - down_write(&sb->s_umount); - err = -EPERM; - if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { - err = do_remount_sb(sb, sb_flags, data, 0); - if (!err) - set_mount_attributes(mnt, mnt_flags); + err = parse_monolithic_mount_data(fc, data); + if (!err) { + down_write(&sb->s_umount); + err = -EPERM; + if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + err = reconfigure_super(fc); + if (!err) + set_mount_attributes(mnt, mnt_flags); + } + up_write(&sb->s_umount); } - up_write(&sb->s_umount); + put_fs_context(fc); return err; } @@ -2425,29 +2496,6 @@ out: return err; } -static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) -{ - int err; - const char *subtype = strchr(fstype, '.'); - if (subtype) { - subtype++; - err = -EINVAL; - if (!subtype[0]) - goto err; - } else - subtype = ""; - - mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); - err = -ENOMEM; - if (!mnt->mnt_sb->s_subtype) - goto err; - return mnt; - - err: - mntput(mnt); - return ERR_PTR(err); -} - /* * add a mount into a namespace's mount tree */ @@ -2492,7 +2540,39 @@ unlock: return err; } -static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags); +static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); + +/* + * Create a new mount using a superblock configuration and request it + * be added to the namespace tree. + */ +static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, + unsigned int mnt_flags) +{ + struct vfsmount *mnt; + struct super_block *sb = fc->root->d_sb; + int error; + + error = security_sb_kern_mount(sb); + if (!error && mount_too_revealing(sb, &mnt_flags)) + error = -EPERM; + + if (unlikely(error)) { + fc_drop_locked(fc); + return error; + } + + up_write(&sb->s_umount); + + mnt = vfs_create_mount(fc); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + if (error < 0) + mntput(mnt); + return error; +} /* * create a new mount for userspace and request it to be added into the @@ -2502,8 +2582,9 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; - struct vfsmount *mnt; - int err; + struct fs_context *fc; + const char *subtype = NULL; + int err = 0; if (!fstype) return -EINVAL; @@ -2512,23 +2593,37 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, if (!type) return -ENODEV; - mnt = vfs_kern_mount(type, sb_flags, name, data); - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && - !mnt->mnt_sb->s_subtype) - mnt = fs_set_subtype(mnt, fstype); + if (type->fs_flags & FS_HAS_SUBTYPE) { + subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + if (!*subtype) { + put_filesystem(type); + return -EINVAL; + } + } else { + subtype = ""; + } + } + fc = fs_context_for_mount(type, sb_flags); put_filesystem(type); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - if (mount_too_revealing(mnt, &mnt_flags)) { - mntput(mnt); - return -EPERM; - } + if (IS_ERR(fc)) + return PTR_ERR(fc); + + if (subtype) + err = vfs_parse_fs_string(fc, "subtype", + subtype, strlen(subtype)); + if (!err && name) + err = vfs_parse_fs_string(fc, "source", name, strlen(name)); + if (!err) + err = parse_monolithic_mount_data(fc, data); + if (!err) + err = vfs_get_tree(fc); + if (!err) + err = do_new_mount_fc(fc, path, mnt_flags); - err = do_add_mount(real_mount(mnt), path, mnt_flags); - if (err) - mntput(mnt); + put_fs_context(fc); return err; } @@ -2863,7 +2958,8 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { - ns_free_inum(&ns->ns); + if (!is_anon_ns(ns)) + ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kfree(ns); @@ -2878,7 +2974,7 @@ static void free_mnt_ns(struct mnt_namespace *ns) */ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); -static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; struct ucounts *ucounts; @@ -2888,28 +2984,27 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - dec_mnt_namespaces(ucounts); - return ERR_PTR(ret); + if (!anon) { + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + dec_mnt_namespaces(ucounts); + return ERR_PTR(ret); + } } new_ns->ns.ops = &mntns_operations; - new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); + if (!anon) + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); - new_ns->root = NULL; INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); - new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; - new_ns->mounts = 0; - new_ns->pending_mounts = 0; return new_ns; } @@ -2933,7 +3028,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, old = ns->root; - new_ns = alloc_mnt_ns(user_ns); + new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; @@ -2941,13 +3036,18 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; + copy_flags |= CL_SHARED_TO_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); free_mnt_ns(new_ns); return ERR_CAST(new); } + if (user_ns != ns->user_ns) { + lock_mount_hash(); + lock_mnt_tree(new); + unlock_mount_hash(); + } new_ns->root = new; list_add_tail(&new_ns->list, &new->mnt_list); @@ -2988,37 +3088,25 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, return new_ns; } -/** - * create_mnt_ns - creates a private namespace and adds a root filesystem - * @mnt: pointer to the new root filesystem mountpoint - */ -static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) -{ - struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); - if (!IS_ERR(new_ns)) { - struct mount *mnt = real_mount(m); - mnt->mnt_ns = new_ns; - new_ns->root = mnt; - new_ns->mounts++; - list_add(&mnt->mnt_list, &new_ns->list); - } else { - mntput(m); - } - return new_ns; -} - -struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) +struct dentry *mount_subtree(struct vfsmount *m, const char *name) { + struct mount *mnt = real_mount(m); struct mnt_namespace *ns; struct super_block *s; struct path path; int err; - ns = create_mnt_ns(mnt); - if (IS_ERR(ns)) + ns = alloc_mnt_ns(&init_user_ns, true); + if (IS_ERR(ns)) { + mntput(m); return ERR_CAST(ns); + } + mnt->mnt_ns = ns; + ns->root = mnt; + ns->mounts++; + list_add(&mnt->mnt_list, &ns->list); - err = vfs_path_lookup(mnt->mnt_root, mnt, + err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); put_mnt_ns(ns); @@ -3228,6 +3316,7 @@ out0: static void __init init_mount_tree(void) { struct vfsmount *mnt; + struct mount *m; struct mnt_namespace *ns; struct path root; struct file_system_type *type; @@ -3240,10 +3329,14 @@ static void __init init_mount_tree(void) if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = create_mnt_ns(mnt); + ns = alloc_mnt_ns(&init_user_ns, false); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); - + m = real_mount(mnt); + m->mnt_ns = ns; + ns->root = m; + ns->mounts = 1; + list_add(&m->mnt_list, &ns->list); init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); @@ -3297,10 +3390,10 @@ void put_mnt_ns(struct mnt_namespace *ns) free_mnt_ns(ns); } -struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) +struct vfsmount *kern_mount(struct file_system_type *type) { struct vfsmount *mnt; - mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data); + mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until @@ -3310,7 +3403,7 @@ struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) } return mnt; } -EXPORT_SYMBOL_GPL(kern_mount_data); +EXPORT_SYMBOL_GPL(kern_mount); void kern_unmount(struct vfsmount *mnt) { @@ -3352,7 +3445,8 @@ bool current_chrooted(void) return chrooted; } -static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, +static bool mnt_already_visible(struct mnt_namespace *ns, + const struct super_block *sb, int *new_mnt_flags) { int new_flags = *new_mnt_flags; @@ -3364,7 +3458,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type) + if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; /* This mount is not fully visible if it's root directory @@ -3415,7 +3509,7 @@ found: return visible; } -static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) +static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; @@ -3425,7 +3519,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) return false; /* Can this filesystem be too revealing? */ - s_iflags = mnt->mnt_sb->s_iflags; + s_iflags = sb->s_iflags; if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; @@ -3435,7 +3529,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) return true; } - return !mnt_already_visible(ns, mnt, new_mnt_flags); + return !mnt_already_visible(ns, sb, new_mnt_flags); } bool mnt_may_suid(struct vfsmount *mnt) @@ -3484,6 +3578,9 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; + if (is_anon_ns(mnt_ns)) + return -EINVAL; + if (fs->users != 1) return -EINVAL; diff --git a/fs/pnode.c b/fs/pnode.c @@ -214,7 +214,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin) } /* all accesses are serialized by namespace_sem */ -static struct user_namespace *user_ns; static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct mountpoint *mp; static struct hlist_head *list; @@ -260,9 +259,6 @@ static int propagate_one(struct mount *m) type |= CL_MAKE_SHARED; } - /* Notice when we are propagating across user namespaces */ - if (m->mnt_ns->user_ns != user_ns) - type |= CL_UNPRIVILEGED; child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); @@ -303,7 +299,6 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, * propagate_one(); everything is serialized by namespace_sem, * so globals will do just fine. */ - user_ns = current->nsproxy->mnt_ns->user_ns; last_dest = dest_mnt; first_source = source_mnt; last_source = source_mnt; diff --git a/fs/pnode.h b/fs/pnode.h @@ -27,8 +27,7 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 -#define CL_UNPRIVILEGED 0x40 -#define CL_COPY_MNT_NS_FILE 0x80 +#define CL_COPY_MNT_NS_FILE 0x40 #define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) diff --git a/fs/proc/inode.c b/fs/proc/inode.c @@ -24,7 +24,6 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> -#include <linux/magic.h> #include <linux/uaccess.h> @@ -122,13 +121,12 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) return 0; } -static const struct super_operations proc_sops = { +const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, - .remount_fs = proc_remount, .show_options = proc_show_options, }; @@ -488,51 +486,3 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) pde_put(de); return inode; } - -int proc_fill_super(struct super_block *s, void *data, int silent) -{ - struct pid_namespace *ns = get_pid_ns(s->s_fs_info); - struct inode *root_inode; - int ret; - - if (!proc_parse_options(data, ns)) - return -EINVAL; - - /* User space would break if executables or devices appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; - s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; - s->s_blocksize = 1024; - s->s_blocksize_bits = 10; - s->s_magic = PROC_SUPER_MAGIC; - s->s_op = &proc_sops; - s->s_time_gran = 1; - - /* - * procfs isn't actually a stacking filesystem; however, there is - * too much magic going on inside it to permit stacking things on - * top of it - */ - s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; - - /* procfs dentries and inodes don't require IO to create */ - s->s_shrink.seeks = 0; - - pde_get(&proc_root); - root_inode = proc_get_inode(s, &proc_root); - if (!root_inode) { - pr_err("proc_fill_super: get root inode failed\n"); - return -ENOMEM; - } - - s->s_root = d_make_root(root_inode); - if (!s->s_root) { - pr_err("proc_fill_super: allocate dentry failed\n"); - return -ENOMEM; - } - - ret = proc_setup_self(s); - if (ret) { - return ret; - } - return proc_setup_thread_self(s); -} diff --git a/fs/proc/internal.h b/fs/proc/internal.h @@ -207,13 +207,12 @@ struct pde_opener { struct completion *c; } __randomize_layout; extern const struct inode_operations proc_link_inode_operations; - extern const struct inode_operations proc_pid_link_inode_operations; +extern const struct super_operations proc_sops; void proc_init_kmemcache(void); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); -extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); /* @@ -271,10 +270,8 @@ static inline void proc_tty_init(void) {} * root.c */ extern struct proc_dir_entry proc_root; -extern int proc_parse_options(char *options, struct pid_namespace *pid); extern void proc_self_init(void); -extern int proc_remount(struct super_block *, int *, char *); /* * task_[no]mmu.c diff --git a/fs/proc/root.c b/fs/proc/root.c @@ -19,86 +19,178 @@ #include <linux/module.h> #include <linux/bitops.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pid_namespace.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/cred.h> +#include <linux/magic.h> +#include <linux/slab.h> #include "internal.h" -enum { - Opt_gid, Opt_hidepid, Opt_err, +struct proc_fs_context { + struct pid_namespace *pid_ns; + unsigned int mask; + int hidepid; + int gid; }; -static const match_table_t tokens = { - {Opt_hidepid, "hidepid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_err, NULL}, +enum proc_param { + Opt_gid, + Opt_hidepid, }; -int proc_parse_options(char *options, struct pid_namespace *pid) +static const struct fs_parameter_spec proc_param_specs[] = { + fsparam_u32("gid", Opt_gid), + fsparam_u32("hidepid", Opt_hidepid), + {} +}; + +static const struct fs_parameter_description proc_fs_parameters = { + .name = "proc", + .specs = proc_param_specs, +}; + +static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - args[0].to = args[0].from = NULL; - token = match_token(p, tokens, args); - switch (token) { - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - pid->pid_gid = make_kgid(current_user_ns(), option); - break; - case Opt_hidepid: - if (match_int(&args[0], &option)) - return 0; - if (option < HIDEPID_OFF || - option > HIDEPID_INVISIBLE) { - pr_err("proc: hidepid value must be between 0 and 2.\n"); - return 0; - } - pid->hide_pid = option; - break; - default: - pr_err("proc: unrecognized mount option \"%s\" " - "or missing value\n", p); - return 0; - } + struct proc_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, &proc_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_gid: + ctx->gid = result.uint_32; + break; + + case Opt_hidepid: + ctx->hidepid = result.uint_32; + if (ctx->hidepid < HIDEPID_OFF || + ctx->hidepid > HIDEPID_INVISIBLE) + return invalf(fc, "proc: hidepid value must be between 0 and 2.\n"); + break; + + default: + return -EINVAL; } - return 1; + ctx->mask |= 1 << opt; + return 0; } -int proc_remount(struct super_block *sb, int *flags, char *data) +static void proc_apply_options(struct super_block *s, + struct fs_context *fc, + struct pid_namespace *pid_ns, + struct user_namespace *user_ns) { + struct proc_fs_context *ctx = fc->fs_private; + + if (ctx->mask & (1 << Opt_gid)) + pid_ns->pid_gid = make_kgid(user_ns, ctx->gid); + if (ctx->mask & (1 << Opt_hidepid)) + pid_ns->hide_pid = ctx->hidepid; +} + +static int proc_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info); + struct inode *root_inode; + int ret; + + proc_apply_options(s, fc, pid_ns, current_user_ns()); + + /* User space would break if executables or devices appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; + s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = PROC_SUPER_MAGIC; + s->s_op = &proc_sops; + s->s_time_gran = 1; + + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + + /* procfs dentries and inodes don't require IO to create */ + s->s_shrink.seeks = 0; + + pde_get(&proc_root); + root_inode = proc_get_inode(s, &proc_root); + if (!root_inode) { + pr_err("proc_fill_super: get root inode failed\n"); + return -ENOMEM; + } + + s->s_root = d_make_root(root_inode); + if (!s->s_root) { + pr_err("proc_fill_super: allocate dentry failed\n"); + return -ENOMEM; + } + + ret = proc_setup_self(s); + if (ret) { + return ret; + } + return proc_setup_thread_self(s); +} + +static int proc_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; struct pid_namespace *pid = sb->s_fs_info; sync_filesystem(sb); - return !proc_parse_options(data, pid); + + proc_apply_options(sb, fc, pid, current_user_ns()); + return 0; } -static struct dentry *proc_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int proc_get_tree(struct fs_context *fc) { - struct pid_namespace *ns; + struct proc_fs_context *ctx = fc->fs_private; - if (flags & SB_KERNMOUNT) { - ns = data; - data = NULL; - } else { - ns = task_active_pid_ns(current); - } + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + fc->s_fs_info = ctx->pid_ns; + return vfs_get_super(fc, vfs_get_keyed_super, proc_fill_super); +} - return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super); +static void proc_fs_context_free(struct fs_context *fc) +{ + struct proc_fs_context *ctx = fc->fs_private; + + if (ctx->pid_ns) + put_pid_ns(ctx->pid_ns); + kfree(ctx); +} + +static const struct fs_context_operations proc_fs_context_ops = { + .free = proc_fs_context_free, + .parse_param = proc_parse_param, + .get_tree = proc_get_tree, + .reconfigure = proc_reconfigure, +}; + +static int proc_init_fs_context(struct fs_context *fc) +{ + struct proc_fs_context *ctx; + + ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); + fc->fs_private = ctx; + fc->ops = &proc_fs_context_ops; + return 0; } static void proc_kill_sb(struct super_block *sb) @@ -115,10 +207,11 @@ static void proc_kill_sb(struct super_block *sb) } static struct file_system_type proc_fs_type = { - .name = "proc", - .mount = proc_mount, - .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "proc", + .init_fs_context = proc_init_fs_context, + .parameters = &proc_fs_parameters, + .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -156,7 +249,7 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr { if (!proc_pid_lookup(dentry, flags)) return NULL; - + return proc_lookup(dir, dentry, flags); } @@ -209,9 +302,28 @@ struct proc_dir_entry proc_root = { int pid_ns_prepare_proc(struct pid_namespace *ns) { + struct proc_fs_context *ctx; + struct fs_context *fc; struct vfsmount *mnt; - mnt = kern_mount_data(&proc_fs_type, ns); + fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + if (fc->user_ns != ns->user_ns) { + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ns->user_ns); + } + + ctx = fc->fs_private; + if (ctx->pid_ns != ns) { + put_pid_ns(ctx->pid_ns); + get_pid_ns(ns); + ctx->pid_ns = ns; + } + + mnt = fc_mount(fc); + put_fs_context(fc); if (IS_ERR(mnt)) return PTR_ERR(mnt); diff --git a/fs/super.c b/fs/super.c @@ -35,6 +35,7 @@ #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" @@ -476,6 +477,94 @@ void generic_shutdown_super(struct super_block *sb) EXPORT_SYMBOL(generic_shutdown_super); /** + * sget_fc - Find or create a superblock + * @fc: Filesystem context. + * @test: Comparison callback + * @set: Setup callback + * + * Find or create a superblock using the parameters stored in the filesystem + * context and the two callback functions. + * + * If an extant superblock is matched, then that will be returned with an + * elevated reference count that the caller must transfer or discard. + * + * If no match is made, a new superblock will be allocated and basic + * initialisation will be performed (s_type, s_fs_info and s_id will be set and + * the set() callback will be invoked), the superblock will be published and it + * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE + * as yet unset. + */ +struct super_block *sget_fc(struct fs_context *fc, + int (*test)(struct super_block *, struct fs_context *), + int (*set)(struct super_block *, struct fs_context *)) +{ + struct super_block *s = NULL; + struct super_block *old; + struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; + int err; + + if (!(fc->sb_flags & SB_KERNMOUNT) && + fc->purpose != FS_CONTEXT_FOR_SUBMOUNT) { + /* Don't allow mounting unless the caller has CAP_SYS_ADMIN + * over the namespace. + */ + if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } else { + if (!ns_capable(fc->user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } + } + +retry: + spin_lock(&sb_lock); + if (test) { + hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { + if (test(old, fc)) + goto share_extant_sb; + } + } + if (!s) { + spin_unlock(&sb_lock); + s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); + if (!s) + return ERR_PTR(-ENOMEM); + goto retry; + } + + s->s_fs_info = fc->s_fs_info; + err = set(s, fc); + if (err) { + s->s_fs_info = NULL; + spin_unlock(&sb_lock); + destroy_unused_super(s); + return ERR_PTR(err); + } + fc->s_fs_info = NULL; + s->s_type = fc->fs_type; + strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id)); + list_add_tail(&s->s_list, &super_blocks); + hlist_add_head(&s->s_instances, &s->s_type->fs_supers); + spin_unlock(&sb_lock); + get_filesystem(s->s_type); + register_shrinker_prepared(&s->s_shrink); + return s; + +share_extant_sb: + if (user_ns != old->s_user_ns) { + spin_unlock(&sb_lock); + destroy_unused_super(s); + return ERR_PTR(-EBUSY); + } + if (!grab_super(old)) + goto retry; + destroy_unused_super(s); + return old; +} +EXPORT_SYMBOL(sget_fc); + +/** * sget_userns - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback @@ -835,28 +924,35 @@ rescan: } /** - * do_remount_sb - asks filesystem to change mount options. - * @sb: superblock in question - * @sb_flags: revised superblock flags - * @data: the rest of options - * @force: whether or not to force the change + * reconfigure_super - asks filesystem to change superblock parameters + * @fc: The superblock and configuration * - * Alters the mount options of a mounted file system. + * Alters the configuration parameters of a live superblock. */ -int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) +int reconfigure_super(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; int retval; - int remount_ro; + bool remount_ro = false; + bool force = fc->sb_flags & SB_FORCE; + if (fc->sb_flags_mask & ~MS_RMT_MASK) + return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; + retval = security_sb_remount(sb, fc->security); + if (retval) + return retval; + + if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK - if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) - return -EACCES; + if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) + return -EACCES; #endif - remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); + remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); + } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { @@ -867,13 +963,14 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; - remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); + remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); - /* If we are remounting RDONLY and current sb is read/write, - make sure there are no rw files opened */ + /* If we are reconfiguring to RDONLY and current sb is read/write, + * make sure there are no files open for writing. + */ if (remount_ro) { if (force) { sb->s_readonly_remount = 1; @@ -885,8 +982,8 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) } } - if (sb->s_op->remount_fs) { - retval = sb->s_op->remount_fs(sb, &sb_flags, data); + if (fc->ops->reconfigure) { + retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; @@ -895,7 +992,9 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) sb->s_type->name, retval); } } - sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK); + + WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | + (fc->sb_flags & fc->sb_flags_mask))); /* Needs to be ordered wrt mnt_is_readonly() */ smp_wmb(); sb->s_readonly_remount = 0; @@ -922,10 +1021,15 @@ static void do_emergency_remount_callback(struct super_block *sb) down_write(&sb->s_umount); if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && !sb_rdonly(sb)) { - /* - * What lock protects sb->s_flags?? - */ - do_remount_sb(sb, SB_RDONLY, NULL, 1); + struct fs_context *fc; + + fc = fs_context_for_reconfigure(sb->s_root, + SB_RDONLY | SB_FORCE, SB_RDONLY); + if (!IS_ERR(fc)) { + if (parse_monolithic_mount_data(fc, NULL) == 0) + (void)reconfigure_super(fc); + put_fs_context(fc); + } } up_write(&sb->s_umount); } @@ -1087,6 +1191,89 @@ struct dentry *mount_ns(struct file_system_type *fs_type, EXPORT_SYMBOL(mount_ns); +int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) +{ + return set_anon_super(sb, NULL); +} +EXPORT_SYMBOL(set_anon_super_fc); + +static int test_keyed_super(struct super_block *sb, struct fs_context *fc) +{ + return sb->s_fs_info == fc->s_fs_info; +} + +static int test_single_super(struct super_block *s, struct fs_context *fc) +{ + return 1; +} + +/** + * vfs_get_super - Get a superblock with a search key set in s_fs_info. + * @fc: The filesystem context holding the parameters + * @keying: How to distinguish superblocks + * @fill_super: Helper to initialise a new superblock + * + * Search for a superblock and create a new one if not found. The search + * criterion is controlled by @keying. If the search fails, a new superblock + * is created and @fill_super() is called to initialise it. + * + * @keying can take one of a number of values: + * + * (1) vfs_get_single_super - Only one superblock of this type may exist on the + * system. This is typically used for special system filesystems. + * + * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have + * distinct keys (where the key is in s_fs_info). Searching for the same + * key again will turn up the superblock for that key. + * + * (3) vfs_get_independent_super - Multiple superblocks may exist and are + * unkeyed. Each call will get a new superblock. + * + * A permissions check is made by sget_fc() unless we're getting a superblock + * for a kernel-internal mount or a submount. + */ +int vfs_get_super(struct fs_context *fc, + enum vfs_get_super_keying keying, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) +{ + int (*test)(struct super_block *, struct fs_context *); + struct super_block *sb; + + switch (keying) { + case vfs_get_single_super: + test = test_single_super; + break; + case vfs_get_keyed_super: + test = test_keyed_super; + break; + case vfs_get_independent_super: + test = NULL; + break; + default: + BUG(); + } + + sb = sget_fc(fc, test, set_anon_super_fc); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + int err = fill_super(sb, fc); + if (err) { + deactivate_locked_super(sb); + return err; + } + + sb->s_flags |= SB_ACTIVE; + } + + BUG_ON(fc->root); + fc->root = dget(sb->s_root); + return 0; +} +EXPORT_SYMBOL(vfs_get_super); + #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { @@ -1212,6 +1399,31 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); +static int reconfigure_single(struct super_block *s, + int flags, void *data) +{ + struct fs_context *fc; + int ret; + + /* The caller really need to be passing fc down into mount_single(), + * then a chunk of this can be removed. [Bollocks -- AV] + * Better yet, reconfiguration shouldn't happen, but rather the second + * mount should be rejected if the parameters are not compatible. + */ + fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + ret = parse_monolithic_mount_data(fc, data); + if (ret < 0) + goto out; + + ret = reconfigure_super(fc); +out: + put_fs_context(fc); + return ret; +} + static int compare_single(struct super_block *s, void *p) { return 1; @@ -1229,41 +1441,64 @@ struct dentry *mount_single(struct file_system_type *fs_type, return ERR_CAST(s); if (!s->s_root) { error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= SB_ACTIVE; + if (!error) + s->s_flags |= SB_ACTIVE; } else { - do_remount_sb(s, flags, data, 0); + error = reconfigure_single(s, flags, data); + } + if (unlikely(error)) { + deactivate_locked_super(s); + return ERR_PTR(error); } return dget(s->s_root); } EXPORT_SYMBOL(mount_single); -struct dentry * -mount_fs(struct file_system_type *type, int flags, const char *name, void *data) +/** + * vfs_get_tree - Get the mountable root + * @fc: The superblock configuration context. + * + * The filesystem is invoked to get or create a superblock which can then later + * be used for mounting. The filesystem places a pointer to the root to be + * used for mounting in @fc->root. + */ +int vfs_get_tree(struct fs_context *fc) { - struct dentry *root; struct super_block *sb; - int error = -ENOMEM; - void *sec_opts = NULL; + int error; - if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { - error = security_sb_eat_lsm_opts(data, &sec_opts); - if (error) - return ERR_PTR(error); + if (fc->fs_type->fs_flags & FS_REQUIRES_DEV && !fc->source) { + errorf(fc, "Filesystem requires source device"); + return -ENOENT; } - root = type->mount(type, flags, name, data); - if (IS_ERR(root)) { - error = PTR_ERR(root); - goto out_free_secdata; + if (fc->root) + return -EBUSY; + + /* Get the mountable root in fc->root, with a ref on the root and a ref + * on the superblock. + */ + error = fc->ops->get_tree(fc); + if (error < 0) + return error; + + if (!fc->root) { + pr_err("Filesystem %s get_tree() didn't set fc->root\n", + fc->fs_type->name); + /* We don't know what the locking state of the superblock is - + * if there is a superblock. + */ + BUG(); } - sb = root->d_sb; - BUG_ON(!sb); + + sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); + if (fc->subtype && !sb->s_subtype) { + sb->s_subtype = fc->subtype; + fc->subtype = NULL; + } + /* * Write barrier is for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is the @@ -1273,14 +1508,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) smp_wmb(); sb->s_flags |= SB_BORN; - error = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL); - if (error) - goto out_sb; - - if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT))) { - error = security_sb_kern_mount(sb); - if (error) - goto out_sb; + error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); + if (unlikely(error)) { + fc_drop_locked(fc); + return error; } /* @@ -1290,18 +1521,11 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " - "negative value (%lld)\n", type->name, sb->s_maxbytes); + "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); - up_write(&sb->s_umount); - security_free_mnt_opts(&sec_opts); - return root; -out_sb: - dput(root); - deactivate_locked_super(sb); -out_free_secdata: - security_free_mnt_opts(&sec_opts); - return ERR_PTR(error); + return 0; } +EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c @@ -13,34 +13,69 @@ #include <linux/magic.h> #include <linux/mount.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> +#include <net/net_namespace.h> #include "sysfs.h" static struct kernfs_root *sysfs_root; struct kernfs_node *sysfs_root_kn; -static struct dentry *sysfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sysfs_get_tree(struct fs_context *fc) { - struct dentry *root; - void *ns; - bool new_sb = false; + struct kernfs_fs_context *kfc = fc->fs_private; + int ret; - if (!(flags & SB_KERNMOUNT)) { + ret = kernfs_get_tree(fc); + if (ret) + return ret; + + if (kfc->new_sb_created) + fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; + return 0; +} + +static void sysfs_fs_context_free(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + if (kfc->ns_tag) + kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag); + kernfs_free_fs_context(fc); + kfree(kfc); +} + +static const struct fs_context_operations sysfs_fs_context_ops = { + .free = sysfs_fs_context_free, + .get_tree = sysfs_get_tree, +}; + +static int sysfs_init_fs_context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc; + struct net *netns; + + if (!(fc->sb_flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) - return ERR_PTR(-EPERM); + return -EPERM; } - ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, - SYSFS_MAGIC, &new_sb, ns); - if (!new_sb) - kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); - else if (!IS_ERR(root)) - root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; + kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL); + if (!kfc) + return -ENOMEM; - return root; + kfc->ns_tag = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + kfc->root = sysfs_root; + kfc->magic = SYSFS_MAGIC; + fc->fs_private = kfc; + fc->ops = &sysfs_fs_context_ops; + if (fc->user_ns) + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(netns->user_ns); + fc->global = true; + return 0; } static void sysfs_kill_sb(struct super_block *sb) @@ -52,10 +87,10 @@ static void sysfs_kill_sb(struct super_block *sb) } static struct file_system_type sysfs_fs_type = { - .name = "sysfs", - .mount = sysfs_mount, - .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "sysfs", + .init_fs_context = sysfs_init_fs_context, + .kill_sb = sysfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/include/linux/errno.h b/include/linux/errno.h @@ -18,6 +18,7 @@ #define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */ #define EPROBE_DEFER 517 /* Driver requests probe retry */ #define EOPENSTALE 518 /* open found a stale dentry */ +#define ENOPARAM 519 /* Parameter not supported */ /* Defined for the NFSv3 protocol */ #define EBADHANDLE 521 /* Illegal NFS file handle */ diff --git a/include/linux/fs.h b/include/linux/fs.h @@ -64,6 +64,8 @@ struct workqueue_struct; struct iov_iter; struct fscrypt_info; struct fscrypt_operations; +struct fs_context; +struct fs_parameter_description; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -1349,6 +1351,7 @@ extern int send_sigurg(struct fown_struct *fown); /* These sb flags are internal to the kernel */ #define SB_SUBMOUNT (1<<26) +#define SB_FORCE (1<<27) #define SB_NOSEC (1<<28) #define SB_BORN (1<<29) #define SB_ACTIVE (1<<30) @@ -1459,7 +1462,7 @@ struct super_block { * Filesystem subtype. If non-empty the filesystem type field * in /proc/mounts will be "type.subtype" */ - char *s_subtype; + const char *s_subtype; const struct dentry_operations *s_d_op; /* default d_op for dentries */ @@ -2170,6 +2173,8 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ + int (*init_fs_context)(struct fs_context *); + const struct fs_parameter_description *parameters; struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); void (*kill_sb) (struct super_block *); @@ -2225,8 +2230,12 @@ void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); +int set_anon_super_fc(struct super_block *s, struct fs_context *fc); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); +struct super_block *sget_fc(struct fs_context *fc, + int (*test)(struct super_block *, struct fs_context *), + int (*set)(struct super_block *, struct fs_context *)); struct super_block *sget_userns(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), @@ -2269,8 +2278,7 @@ mount_pseudo(struct file_system_type *fs_type, char *name, extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); -extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); -#define kern_mount(type) kern_mount_data(type, NULL) +extern struct vfsmount *kern_mount(struct file_system_type *); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h @@ -0,0 +1,188 @@ +/* Filesystem superblock creation and reconfiguration context. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _LINUX_FS_CONTEXT_H +#define _LINUX_FS_CONTEXT_H + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/security.h> + +struct cred; +struct dentry; +struct file_operations; +struct file_system_type; +struct mnt_namespace; +struct net; +struct pid_namespace; +struct super_block; +struct user_namespace; +struct vfsmount; +struct path; + +enum fs_context_purpose { + FS_CONTEXT_FOR_MOUNT, /* New superblock for explicit mount */ + FS_CONTEXT_FOR_SUBMOUNT, /* New superblock for automatic submount */ + FS_CONTEXT_FOR_RECONFIGURE, /* Superblock reconfiguration (remount) */ +}; + +/* + * Type of parameter value. + */ +enum fs_value_type { + fs_value_is_undefined, + fs_value_is_flag, /* Value not given a value */ + fs_value_is_string, /* Value is a string */ + fs_value_is_blob, /* Value is a binary blob */ + fs_value_is_filename, /* Value is a filename* + dirfd */ + fs_value_is_filename_empty, /* Value is a filename* + dirfd + AT_EMPTY_PATH */ + fs_value_is_file, /* Value is a file* */ +}; + +/* + * Configuration parameter. + */ +struct fs_parameter { + const char *key; /* Parameter name */ + enum fs_value_type type:8; /* The type of value here */ + union { + char *string; + void *blob; + struct filename *name; + struct file *file; + }; + size_t size; + int dirfd; +}; + +/* + * Filesystem context for holding the parameters used in the creation or + * reconfiguration of a superblock. + * + * Superblock creation fills in ->root whereas reconfiguration begins with this + * already set. + * + * See Documentation/filesystems/mounting.txt + */ +struct fs_context { + const struct fs_context_operations *ops; + struct file_system_type *fs_type; + void *fs_private; /* The filesystem's context */ + struct dentry *root; /* The root and superblock */ + struct user_namespace *user_ns; /* The user namespace for this mount */ + struct net *net_ns; /* The network namespace for this mount */ + const struct cred *cred; /* The mounter's credentials */ + const char *source; /* The source name (eg. dev path) */ + const char *subtype; /* The subtype to set on the superblock */ + void *security; /* Linux S&M options */ + void *s_fs_info; /* Proposed s_fs_info */ + unsigned int sb_flags; /* Proposed superblock flags (SB_*) */ + unsigned int sb_flags_mask; /* Superblock flags that were changed */ + unsigned int lsm_flags; /* Information flags from the fs to the LSM */ + enum fs_context_purpose purpose:8; + bool need_free:1; /* Need to call ops->free() */ + bool global:1; /* Goes into &init_user_ns */ +}; + +struct fs_context_operations { + void (*free)(struct fs_context *fc); + int (*dup)(struct fs_context *fc, struct fs_context *src_fc); + int (*parse_param)(struct fs_context *fc, struct fs_parameter *param); + int (*parse_monolithic)(struct fs_context *fc, void *data); + int (*get_tree)(struct fs_context *fc); + int (*reconfigure)(struct fs_context *fc); +}; + +/* + * fs_context manipulation functions. + */ +extern struct fs_context *fs_context_for_mount(struct file_system_type *fs_type, + unsigned int sb_flags); +extern struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, + unsigned int sb_flags, + unsigned int sb_flags_mask); +extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_type, + struct dentry *reference); + +extern struct fs_context *vfs_dup_fs_context(struct fs_context *fc); +extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param); +extern int vfs_parse_fs_string(struct fs_context *fc, const char *key, + const char *value, size_t v_size); +extern int generic_parse_monolithic(struct fs_context *fc, void *data); +extern int vfs_get_tree(struct fs_context *fc); +extern void put_fs_context(struct fs_context *fc); + +/* + * sget() wrapper to be called from the ->get_tree() op. + */ +enum vfs_get_super_keying { + vfs_get_single_super, /* Only one such superblock may exist */ + vfs_get_keyed_super, /* Superblocks with different s_fs_info keys may exist */ + vfs_get_independent_super, /* Multiple independent superblocks may exist */ +}; +extern int vfs_get_super(struct fs_context *fc, + enum vfs_get_super_keying keying, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)); + +extern const struct file_operations fscontext_fops; + +#ifdef CONFIG_PRINTK +extern __attribute__((format(printf, 2, 3))) +void logfc(struct fs_context *fc, const char *fmt, ...); +#else +static inline __attribute__((format(printf, 2, 3))) +void logfc(struct fs_context *fc, const char *fmt, ...) +{ +} +#endif + +/** + * infof - Store supplementary informational message + * @fc: The context in which to log the informational message + * @fmt: The format string + * + * Store the supplementary informational message for the process if the process + * has enabled the facility. + */ +#define infof(fc, fmt, ...) ({ logfc(fc, "i "fmt, ## __VA_ARGS__); }) + +/** + * warnf - Store supplementary warning message + * @fc: The context in which to log the error message + * @fmt: The format string + * + * Store the supplementary warning message for the process if the process has + * enabled the facility. + */ +#define warnf(fc, fmt, ...) ({ logfc(fc, "w "fmt, ## __VA_ARGS__); }) + +/** + * errorf - Store supplementary error message + * @fc: The context in which to log the error message + * @fmt: The format string + * + * Store the supplementary error message for the process if the process has + * enabled the facility. + */ +#define errorf(fc, fmt, ...) ({ logfc(fc, "e "fmt, ## __VA_ARGS__); }) + +/** + * invalf - Store supplementary invalid argument error message + * @fc: The context in which to log the error message + * @fmt: The format string + * + * Store the supplementary error message for the process if the process has + * enabled the facility and return -EINVAL. + */ +#define invalf(fc, fmt, ...) ({ errorf(fc, fmt, ## __VA_ARGS__); -EINVAL; }) + +#endif /* _LINUX_FS_CONTEXT_H */ diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h @@ -0,0 +1,151 @@ +/* Filesystem parameter description and parser + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _LINUX_FS_PARSER_H +#define _LINUX_FS_PARSER_H + +#include <linux/fs_context.h> + +struct path; + +struct constant_table { + const char *name; + int value; +}; + +/* + * The type of parameter expected. + */ +enum fs_parameter_type { + __fs_param_wasnt_defined, + fs_param_is_flag, + fs_param_is_bool, + fs_param_is_u32, + fs_param_is_u32_octal, + fs_param_is_u32_hex, + fs_param_is_s32, + fs_param_is_u64, + fs_param_is_enum, + fs_param_is_string, + fs_param_is_blob, + fs_param_is_blockdev, + fs_param_is_path, + fs_param_is_fd, + nr__fs_parameter_type, +}; + +/* + * Specification of the type of value a parameter wants. + * + * Note that the fsparam_flag(), fsparam_string(), fsparam_u32(), ... macros + * should be used to generate elements of this type. + */ +struct fs_parameter_spec { + const char *name; + u8 opt; /* Option number (returned by fs_parse()) */ + enum fs_parameter_type type:8; /* The desired parameter type */ + unsigned short flags; +#define fs_param_v_optional 0x0001 /* The value is optional */ +#define fs_param_neg_with_no 0x0002 /* "noxxx" is negative param */ +#define fs_param_neg_with_empty 0x0004 /* "xxx=" is negative param */ +#define fs_param_deprecated 0x0008 /* The param is deprecated */ +}; + +struct fs_parameter_enum { + u8 opt; /* Option number (as fs_parameter_spec::opt) */ + char name[14]; + u8 value; +}; + +struct fs_parameter_description { + const char name[16]; /* Name for logging purposes */ + const struct fs_parameter_spec *specs; /* List of param specifications */ + const struct fs_parameter_enum *enums; /* Enum values */ +}; + +/* + * Result of parse. + */ +struct fs_parse_result { + bool negated; /* T if param was "noxxx" */ + bool has_value; /* T if value supplied to param */ + union { + bool boolean; /* For spec_bool */ + int int_32; /* For spec_s32/spec_enum */ + unsigned int uint_32; /* For spec_u32{,_octal,_hex}/spec_enum */ + u64 uint_64; /* For spec_u64 */ + }; +}; + +extern int fs_parse(struct fs_context *fc, + const struct fs_parameter_description *desc, + struct fs_parameter *value, + struct fs_parse_result *result); +extern int fs_lookup_param(struct fs_context *fc, + struct fs_parameter *param, + bool want_bdev, + struct path *_path); + +extern int __lookup_constant(const struct constant_table tbl[], size_t tbl_size, + const char *name, int not_found); +#define lookup_constant(t, n, nf) __lookup_constant(t, ARRAY_SIZE(t), (n), (nf)) + +#ifdef CONFIG_VALIDATE_FS_PARSER +extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, + int low, int high, int special); +extern bool fs_validate_description(const struct fs_parameter_description *desc); +#else +static inline bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, + int low, int high, int special) +{ return true; } +static inline bool fs_validate_description(const struct fs_parameter_description *desc) +{ return true; } +#endif + +/* + * Parameter type, name, index and flags element constructors. Use as: + * + * fsparam_xxxx("foo", Opt_foo) + * + * If existing helpers are not enough, direct use of __fsparam() would + * work, but any such case is probably a sign that new helper is needed. + * Helpers will remain stable; low-level implementation may change. + */ +#define __fsparam(TYPE, NAME, OPT, FLAGS) \ + { \ + .name = NAME, \ + .opt = OPT, \ + .type = TYPE, \ + .flags = FLAGS \ + } + +#define fsparam_flag(NAME, OPT) __fsparam(fs_param_is_flag, NAME, OPT, 0) +#define fsparam_flag_no(NAME, OPT) \ + __fsparam(fs_param_is_flag, NAME, OPT, \ + fs_param_neg_with_no) +#define fsparam_bool(NAME, OPT) __fsparam(fs_param_is_bool, NAME, OPT, 0) +#define fsparam_u32(NAME, OPT) __fsparam(fs_param_is_u32, NAME, OPT, 0) +#define fsparam_u32oct(NAME, OPT) \ + __fsparam(fs_param_is_u32_octal, NAME, OPT, 0) +#define fsparam_u32hex(NAME, OPT) \ + __fsparam(fs_param_is_u32_hex, NAME, OPT, 0) +#define fsparam_s32(NAME, OPT) __fsparam(fs_param_is_s32, NAME, OPT, 0) +#define fsparam_u64(NAME, OPT) __fsparam(fs_param_is_u64, NAME, OPT, 0) +#define fsparam_enum(NAME, OPT) __fsparam(fs_param_is_enum, NAME, OPT, 0) +#define fsparam_string(NAME, OPT) \ + __fsparam(fs_param_is_string, NAME, OPT, 0) +#define fsparam_blob(NAME, OPT) __fsparam(fs_param_is_blob, NAME, OPT, 0) +#define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0) +#define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0) +#define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0) + + +#endif /* _LINUX_FS_PARSER_H */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h @@ -26,7 +26,9 @@ struct vm_area_struct; struct super_block; struct file_system_type; struct poll_table_struct; +struct fs_context; +struct kernfs_fs_context; struct kernfs_open_node; struct kernfs_iattrs; @@ -168,7 +170,6 @@ struct kernfs_node { * kernfs_node parameter. */ struct kernfs_syscall_ops { - int (*remount_fs)(struct kernfs_root *root, int *flags, char *data); int (*show_options)(struct seq_file *sf, struct kernfs_root *root); int (*mkdir)(struct kernfs_node *parent, const char *name, @@ -272,6 +273,18 @@ struct kernfs_ops { #endif }; +/* + * The kernfs superblock creation/mount parameter context. + */ +struct kernfs_fs_context { + struct kernfs_root *root; /* Root of the hierarchy being mounted */ + void *ns_tag; /* Namespace tag of the mount (or NULL) */ + unsigned long magic; /* File system specific magic number */ + + /* The following are set/used by kernfs_mount() */ + bool new_sb_created; /* Set to T if we allocated a new sb */ +}; + #ifdef CONFIG_KERNFS static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) @@ -359,11 +372,9 @@ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, void kernfs_notify(struct kernfs_node *kn); const void *kernfs_super_ns(struct super_block *sb); -struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, unsigned long magic, - bool *new_sb_created, const void *ns); +int kernfs_get_tree(struct fs_context *fc); +void kernfs_free_fs_context(struct fs_context *fc); void kernfs_kill_sb(struct super_block *sb); -struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns); void kernfs_init(void); @@ -465,11 +476,10 @@ static inline void kernfs_notify(struct kernfs_node *kn) { } static inline const void *kernfs_super_ns(struct super_block *sb) { return NULL; } -static inline struct dentry * -kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, unsigned long magic, - bool *new_sb_created, const void *ns) -{ return ERR_PTR(-ENOSYS); } +static inline int kernfs_get_tree(struct fs_context *fc) +{ return -ENOSYS; } + +static inline void kernfs_free_fs_context(struct fs_context *fc) { } static inline void kernfs_kill_sb(struct super_block *sb) { } @@ -552,13 +562,4 @@ static inline int kernfs_rename(struct kernfs_node *kn, return kernfs_rename_ns(kn, new_parent, new_name, NULL); } -static inline struct dentry * -kernfs_mount(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, unsigned long magic, - bool *new_sb_created) -{ - return kernfs_mount_ns(fs_type, flags, root, - magic, new_sb_created, NULL); -} - #endif /* __LINUX_KERNFS_H */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h @@ -76,6 +76,22 @@ * changes on the process such as clearing out non-inheritable signal * state. This is called immediately after commit_creds(). * + * Security hooks for mount using fs_context. + * [See also Documentation/filesystems/mounting.txt] + * + * @fs_context_dup: + * Allocate and attach a security structure to sc->security. This pointer + * is initialised to NULL by the caller. + * @fc indicates the new filesystem context. + * @src_fc indicates the original filesystem context. + * @fs_context_parse_param: + * Userspace provided a parameter to configure a superblock. The LSM may + * reject it with an error and may use it for itself, in which case it + * should return 0; otherwise it should return -ENOPARAM to pass it on to + * the filesystem. + * @fc indicates the filesystem context. + * @param The parameter + * * Security hooks for filesystem operations. * * @sb_alloc_security: @@ -1460,6 +1476,9 @@ union security_list_options { void (*bprm_committing_creds)(struct linux_binprm *bprm); void (*bprm_committed_creds)(struct linux_binprm *bprm); + int (*fs_context_dup)(struct fs_context *fc, struct fs_context *src_sc); + int (*fs_context_parse_param)(struct fs_context *fc, struct fs_parameter *param); + int (*sb_alloc_security)(struct super_block *sb); void (*sb_free_security)(struct super_block *sb); void (*sb_free_mnt_opts)(void *mnt_opts); @@ -1800,6 +1819,8 @@ struct security_hook_heads { struct hlist_head bprm_check_security; struct hlist_head bprm_committing_creds; struct hlist_head bprm_committed_creds; + struct hlist_head fs_context_dup; + struct hlist_head fs_context_parse_param; struct hlist_head sb_alloc_security; struct hlist_head sb_free_security; struct hlist_head sb_free_mnt_opts; diff --git a/include/linux/mount.h b/include/linux/mount.h @@ -21,6 +21,7 @@ struct super_block; struct vfsmount; struct dentry; struct mnt_namespace; +struct fs_context; #define MNT_NOSUID 0x01 #define MNT_NODEV 0x02 @@ -88,6 +89,8 @@ struct path; extern struct vfsmount *clone_private_mount(const struct path *path); struct file_system_type; +extern struct vfsmount *fc_mount(struct fs_context *fc); +extern struct vfsmount *vfs_create_mount(struct fs_context *fc); extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); diff --git a/include/linux/security.h b/include/linux/security.h @@ -53,6 +53,9 @@ struct msg_msg; struct xattr; struct xfrm_sec_ctx; struct mm_struct; +struct fs_context; +struct fs_parameter; +enum fs_value_type; /* Default (no) options for the capable function */ #define CAP_OPT_NONE 0x0 @@ -61,7 +64,7 @@ struct mm_struct; /* If capable is being called by a setid function */ #define CAP_OPT_INSETID BIT(2) -/* LSM Agnostic defines for sb_set_mnt_opts */ +/* LSM Agnostic defines for fs_context::lsm_flags */ #define SECURITY_LSM_NATIVE_LABELS 1 struct ctl_table; @@ -223,6 +226,8 @@ int security_bprm_set_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); void security_bprm_committing_creds(struct linux_binprm *bprm); void security_bprm_committed_creds(struct linux_binprm *bprm); +int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc); +int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param); int security_sb_alloc(struct super_block *sb); void security_sb_free(struct super_block *sb); void security_free_mnt_opts(void **mnt_opts); @@ -519,6 +524,17 @@ static inline void security_bprm_committed_creds(struct linux_binprm *bprm) { } +static inline int security_fs_context_dup(struct fs_context *fc, + struct fs_context *src_fc) +{ + return 0; +} +static inline int security_fs_context_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + return -ENOPARAM; +} + static inline int security_sb_alloc(struct super_block *sb) { return 0; diff --git a/ipc/mqueue.c b/ipc/mqueue.c @@ -18,6 +18,7 @@ #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include <linux/namei.h> #include <linux/sysctl.h> #include <linux/poll.h> @@ -42,6 +43,10 @@ #include <net/sock.h> #include "util.h" +struct mqueue_fs_context { + struct ipc_namespace *ipc_ns; +}; + #define MQUEUE_MAGIC 0x19800202 #define DIRENT_SIZE 20 #define FILENT_SIZE 80 @@ -87,9 +92,11 @@ struct mqueue_inode_info { unsigned long qsize; /* size of queue in memory (sum of all msgs) */ }; +static struct file_system_type mqueue_fs_type; static const struct inode_operations mqueue_dir_inode_operations; static const struct file_operations mqueue_file_operations; static const struct super_operations mqueue_super_ops; +static const struct fs_context_operations mqueue_fs_context_ops; static void remove_notification(struct mqueue_inode_info *info); static struct kmem_cache *mqueue_inode_cachep; @@ -322,7 +329,7 @@ err: return ERR_PTR(ret); } -static int mqueue_fill_super(struct super_block *sb, void *data, int silent) +static int mqueue_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct ipc_namespace *ns = sb->s_fs_info; @@ -343,18 +350,56 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct dentry *mqueue_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int mqueue_get_tree(struct fs_context *fc) { - struct ipc_namespace *ns; - if (flags & SB_KERNMOUNT) { - ns = data; - data = NULL; - } else { - ns = current->nsproxy->ipc_ns; - } - return mount_ns(fs_type, flags, data, ns, ns->user_ns, mqueue_fill_super); + struct mqueue_fs_context *ctx = fc->fs_private; + + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns); + fc->s_fs_info = ctx->ipc_ns; + return vfs_get_super(fc, vfs_get_keyed_super, mqueue_fill_super); +} + +static void mqueue_fs_context_free(struct fs_context *fc) +{ + struct mqueue_fs_context *ctx = fc->fs_private; + + if (ctx->ipc_ns) + put_ipc_ns(ctx->ipc_ns); + kfree(ctx); +} + +static int mqueue_init_fs_context(struct fs_context *fc) +{ + struct mqueue_fs_context *ctx; + + ctx = kzalloc(sizeof(struct mqueue_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns); + fc->fs_private = ctx; + fc->ops = &mqueue_fs_context_ops; + return 0; +} + +static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) +{ + struct mqueue_fs_context *ctx; + struct fs_context *fc; + struct vfsmount *mnt; + + fc = fs_context_for_mount(&mqueue_fs_type, SB_KERNMOUNT); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + ctx = fc->fs_private; + put_ipc_ns(ctx->ipc_ns); + ctx->ipc_ns = get_ipc_ns(ns); + + mnt = fc_mount(fc); + put_fs_context(fc); + return mnt; } static void init_once(void *foo) @@ -1522,15 +1567,22 @@ static const struct super_operations mqueue_super_ops = { .statfs = simple_statfs, }; +static const struct fs_context_operations mqueue_fs_context_ops = { + .free = mqueue_fs_context_free, + .get_tree = mqueue_get_tree, +}; + static struct file_system_type mqueue_fs_type = { - .name = "mqueue", - .mount = mqueue_mount, - .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, + .name = "mqueue", + .init_fs_context = mqueue_init_fs_context, + .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, }; int mq_init_ns(struct ipc_namespace *ns) { + struct vfsmount *m; + ns->mq_queues_count = 0; ns->mq_queues_max = DFLT_QUEUESMAX; ns->mq_msg_max = DFLT_MSGMAX; @@ -1538,12 +1590,10 @@ int mq_init_ns(struct ipc_namespace *ns) ns->mq_msg_default = DFLT_MSG; ns->mq_msgsize_default = DFLT_MSGSIZE; - ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns); - if (IS_ERR(ns->mq_mnt)) { - int err = PTR_ERR(ns->mq_mnt); - ns->mq_mnt = NULL; - return err; - } + m = mq_create_mount(ns); + if (IS_ERR(m)) + return PTR_ERR(m); + ns->mq_mnt = m; return 0; } diff --git a/ipc/namespace.c b/ipc/namespace.c @@ -42,7 +42,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL); + ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL); if (ns == NULL) goto fail_dec; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h @@ -7,6 +7,7 @@ #include <linux/workqueue.h> #include <linux/list.h> #include <linux/refcount.h> +#include <linux/fs_context.h> #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; @@ -37,6 +38,31 @@ extern void __init enable_debug_cgroup(void); } while (0) /* + * The cgroup filesystem superblock creation/mount context. + */ +struct cgroup_fs_context { + struct kernfs_fs_context kfc; + struct cgroup_root *root; + struct cgroup_namespace *ns; + unsigned int flags; /* CGRP_ROOT_* flags */ + + /* cgroup1 bits */ + bool cpuset_clone_children; + bool none; /* User explicitly requested empty subsystem */ + bool all_ss; /* Seen 'all' option */ + u16 subsys_mask; /* Selected subsystems */ + char *name; /* Hierarchy name */ + char *release_agent; /* Path for release notifications */ +}; + +static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + return container_of(kfc, struct cgroup_fs_context, kfc); +} + +/* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other * direction, a css_set is naturally associated with multiple cgroups. @@ -117,16 +143,6 @@ struct cgroup_mgctx { #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; @@ -197,12 +213,10 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); void cgroup_free_root(struct cgroup_root *root); -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); +void init_cgroup_root(struct cgroup_fs_context *ctx); int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace *ns); +int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); @@ -246,14 +260,15 @@ extern const struct proc_ns_operations cgroupns_operations; */ extern struct cftype cgroup1_base_files[]; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; +extern const struct fs_parameter_description cgroup1_fs_parameters; int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); void cgroup1_release_agent(struct work_struct *work); void cgroup1_check_for_release(struct cgroup *cgrp); -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns); +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param); +int cgroup1_get_tree(struct fs_context *fc); +int cgroup1_reconfigure(struct fs_context *ctx); #endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c @@ -13,9 +13,12 @@ #include <linux/delayacct.h> #include <linux/pid_namespace.h> #include <linux/cgroupstats.h> +#include <linux/fs_parser.h> #include <trace/events/cgroup.h> +#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__) + /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -906,172 +909,195 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo return 0; } -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); +enum cgroup1_param { + Opt_all, + Opt_clone_children, + Opt_cpuset_v2_mode, + Opt_name, + Opt_none, + Opt_noprefix, + Opt_release_agent, + Opt_xattr, +}; - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; +static const struct fs_parameter_spec cgroup1_param_specs[] = { + fsparam_flag ("all", Opt_all), + fsparam_flag ("clone_children", Opt_clone_children), + fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), + fsparam_string("name", Opt_name), + fsparam_flag ("none", Opt_none), + fsparam_flag ("noprefix", Opt_noprefix), + fsparam_string("release_agent", Opt_release_agent), + fsparam_flag ("xattr", Opt_xattr), + {} +}; - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "cpuset_v2_mode")) { - opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - - /* blocked by boot param? */ - if (cgroup_no_v1_named) - return -ENOENT; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; +const struct fs_parameter_description cgroup1_fs_parameters = { + .name = "cgroup1", + .specs = cgroup1_param_specs, +}; - continue; +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct cgroup_subsys *ss; + struct fs_parse_result result; + int opt, i; + + opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result); + if (opt == -ENOPARAM) { + if (strcmp(param->key, "source") == 0) { + fc->source = param->string; + param->string = NULL; + return 0; } - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) + if (strcmp(param->key, ss->legacy_name)) continue; - if (!cgroup_ssid_enabled(i)) + ctx->subsys_mask |= (1 << i); + return 0; + } + return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key); + } + if (opt < 0) + return opt; + + switch (opt) { + case Opt_none: + /* Explicitly have no subsystems */ + ctx->none = true; + break; + case Opt_all: + ctx->all_ss = true; + break; + case Opt_noprefix: + ctx->flags |= CGRP_ROOT_NOPREFIX; + break; + case Opt_clone_children: + ctx->cpuset_clone_children = true; + break; + case Opt_cpuset_v2_mode: + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; + break; + case Opt_xattr: + ctx->flags |= CGRP_ROOT_XATTR; + break; + case Opt_release_agent: + /* Specifying two release agents is forbidden */ + if (ctx->release_agent) + return cg_invalf(fc, "cgroup1: release_agent respecified"); + ctx->release_agent = param->string; + param->string = NULL; + break; + case Opt_name: + /* blocked by boot param? */ + if (cgroup_no_v1_named) + return -ENOENT; + /* Can't specify an empty name */ + if (!param->size) + return cg_invalf(fc, "cgroup1: Empty name"); + if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) + return cg_invalf(fc, "cgroup1: Name too long"); + /* Must match [\w.-]+ */ + for (i = 0; i < param->size; i++) { + char c = param->string[i]; + if (isalnum(c)) continue; - if (cgroup1_ssid_disabled(i)) + if ((c == '.') || (c == '-') || (c == '_')) continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; + return cg_invalf(fc, "cgroup1: Invalid name"); } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; + /* Specifying two names is forbidden */ + if (ctx->name) + return cg_invalf(fc, "cgroup1: name respecified"); + ctx->name = param->string; + param->string = NULL; + break; } + return 0; +} + +static int check_cgroupfs_options(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + u16 mask = U16_MAX; + u16 enabled = 0; + struct cgroup_subsys *ss; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + enabled |= 1 << i; + + ctx->subsys_mask &= enabled; /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' + * In absense of 'none', 'name=' or subsystem name options, + * let's default to 'all'. */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) - opts->subsys_mask |= (1 << i); + if (!ctx->subsys_mask && !ctx->none && !ctx->name) + ctx->all_ss = true; + + if (ctx->all_ss) { + /* Mutually exclusive option 'all' + subsystem name */ + if (ctx->subsys_mask) + return cg_invalf(fc, "cgroup1: subsys name conflicts with all"); + /* 'all' => select all the subsystems */ + ctx->subsys_mask = enabled; + } /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; + if (!ctx->subsys_mask && !ctx->name) + return cg_invalf(fc, "cgroup1: Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; + if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) + return cg_invalf(fc, "cgroup1: noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; + if (ctx->subsys_mask && ctx->none) + return cg_invalf(fc, "cgroup1: none used incorrectly"); return 0; } -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) +int cgroup1_reconfigure(struct fs_context *fc) { - int ret = 0; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_sb_opts opts; + int ret = 0; u16 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); + ret = check_cgroupfs_options(fc); if (ret) goto out_unlock; - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; + added_mask = ctx->subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~ctx->subsys_mask; /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); + if ((ctx->flags ^ root->flags) || + (ctx->name && strcmp(ctx->name, root->name))) { + cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", + ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } @@ -1088,17 +1114,15 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - if (opts.release_agent) { + if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); + strcpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: - kfree(opts.release_agent); - kfree(opts.name); mutex_unlock(&cgroup_mutex); return ret; } @@ -1106,28 +1130,30 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { .rename = cgroup1_rename, .show_options = cgroup1_show_options, - .remount_fs = cgroup1_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns) +/* + * The guts of cgroup1 mount - find or create cgroup_root to use. + * Called with cgroup_mutex held; returns 0 on success, -E... on + * error and positive - in case when the candidate is busy dying. + * On success it stashes a reference to cgroup_root into given + * cgroup_fs_context; that reference is *NOT* counting towards the + * cgroup_root refcount. + */ +static int cgroup1_root_to_use(struct fs_context *fc) { - struct cgroup_sb_opts opts; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_root *root; struct cgroup_subsys *ss; - struct dentry *dentry; int i, ret; - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); + ret = check_cgroupfs_options(fc); if (ret) - goto out_unlock; + return ret; /* * Destruction of cgroup root is asynchronous, so subsystems may @@ -1137,16 +1163,12 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * starting. Testing ref liveliness is good enough. */ for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || + if (!(ctx->subsys_mask & (1 << i)) || ss->root == &cgrp_dfl_root) continue; - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) + return 1; /* restart */ cgroup_put(&ss->root->cgrp); } @@ -1161,8 +1183,8 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * name matches but sybsys_mask doesn't, we should fail. * Remember whether name matched. */ - if (opts.name) { - if (strcmp(opts.name, root->name)) + if (ctx->name) { + if (strcmp(ctx->name, root->name)) continue; name_match = true; } @@ -1171,19 +1193,18 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * If we asked for subsystems (or explicitly for no * subsystems) then they must match. */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { + if ((ctx->subsys_mask || ctx->none) && + (ctx->subsys_mask != root->subsys_mask)) { if (!name_match) continue; - ret = -EBUSY; - goto out_unlock; + return -EBUSY; } - if (root->flags ^ opts.flags) + if (root->flags ^ ctx->flags) pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - ret = 0; - goto out_unlock; + ctx->root = root; + return 0; } /* @@ -1191,55 +1212,58 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * specification is allowed for already existing hierarchies but we * can't create new one without subsys specification. */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } + if (!ctx->subsys_mask && !ctx->none) + return cg_invalf(fc, "cgroup1: No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. */ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } + if (ctx->ns != &init_cgroup_ns) + return -EPERM; root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } + if (!root) + return -ENOMEM; - init_cgroup_root(root, &opts); + ctx->root = root; + init_cgroup_root(ctx); - ret = cgroup_setup_root(root, opts.subsys_mask); + ret = cgroup_setup_root(root, ctx->subsys_mask); if (ret) cgroup_free_root(root); + return ret; +} -out_unlock: - if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); +int cgroup1_get_tree(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + int ret; - if (ret) - return ERR_PTR(ret); + /* Check if the caller has permission to mount. */ + if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + ret = cgroup1_root_to_use(fc); + if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) + ret = 1; /* restart */ - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, - CGROUP_SUPER_MAGIC, ns); + mutex_unlock(&cgroup_mutex); - if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) { - struct super_block *sb = dentry->d_sb; - dput(dentry); + if (!ret) + ret = cgroup_do_get_tree(fc); + + if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { + struct super_block *sb = fc->root->d_sb; + dput(fc->root); deactivate_locked_super(sb); + ret = 1; + } + + if (unlikely(ret > 0)) { msleep(10); - dentry = ERR_PTR(restart_syscall()); + return restart_syscall(); } - return dentry; + return ret; } static int __init cgroup1_wq_init(void) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c @@ -54,6 +54,7 @@ #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> +#include <linux/fs_parser.h> #include <linux/sched/cputime.h> #include <linux/psi.h> #include <net/sock.h> @@ -1772,26 +1773,37 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) -{ - char *token; +enum cgroup2_param { + Opt_nsdelegate, + nr__cgroup2_params +}; - *root_flags = 0; +static const struct fs_parameter_spec cgroup2_param_specs[] = { + fsparam_flag ("nsdelegate", Opt_nsdelegate), + {} +}; - if (!data || *data == '\0') - return 0; +static const struct fs_parameter_description cgroup2_fs_parameters = { + .name = "cgroup2", + .specs = cgroup2_param_specs, +}; - while ((token = strsep(&data, ",")) != NULL) { - if (!strcmp(token, "nsdelegate")) { - *root_flags |= CGRP_ROOT_NS_DELEGATE; - continue; - } +static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct fs_parse_result result; + int opt; - pr_err("cgroup2: unknown option \"%s\"\n", token); - return -EINVAL; - } + opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result); + if (opt < 0) + return opt; - return 0; + switch (opt) { + case Opt_nsdelegate: + ctx->flags |= CGRP_ROOT_NS_DELEGATE; + return 0; + } + return -EINVAL; } static void apply_cgroup_root_flags(unsigned int root_flags) @@ -1811,16 +1823,11 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root return 0; } -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) +static int cgroup_reconfigure(struct fs_context *fc) { - unsigned int root_flags; - int ret; - - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) - return ret; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); - apply_cgroup_root_flags(root_flags); + apply_cgroup_root_flags(ctx->flags); return 0; } @@ -1908,8 +1915,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) +void init_cgroup_root(struct cgroup_fs_context *ctx) { + struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD(&root->root_list); @@ -1918,12 +1926,12 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); - root->flags = opts->flags; - if (opts->release_agent) - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); - if (opts->name) - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); - if (opts->cpuset_clone_children) + root->flags = ctx->flags; + if (ctx->release_agent) + strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); + if (ctx->name) + strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); + if (ctx->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } @@ -2028,60 +2036,104 @@ out: return ret; } -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace *ns) +int cgroup_do_get_tree(struct fs_context *fc) { - struct dentry *dentry; - bool new_sb = false; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + int ret; - dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); + ctx->kfc.root = ctx->root->kf_root; + if (fc->fs_type == &cgroup2_fs_type) + ctx->kfc.magic = CGROUP2_SUPER_MAGIC; + else + ctx->kfc.magic = CGROUP_SUPER_MAGIC; + ret = kernfs_get_tree(fc); /* * In non-init cgroup namespace, instead of root cgroup's dentry, * we return the dentry corresponding to the cgroupns->root_cgrp. */ - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + if (!ret && ctx->ns != &init_cgroup_ns) { struct dentry *nsdentry; - struct super_block *sb = dentry->d_sb; + struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); - cgrp = cset_cgroup_from_root(ns->root_cset, root); + cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); nsdentry = kernfs_node_dentry(cgrp->kn, sb); - dput(dentry); - if (IS_ERR(nsdentry)) + dput(fc->root); + fc->root = nsdentry; + if (IS_ERR(nsdentry)) { + ret = PTR_ERR(nsdentry); deactivate_locked_super(sb); - dentry = nsdentry; + } } - if (!new_sb) - cgroup_put(&root->cgrp); + if (!ctx->kfc.new_sb_created) + cgroup_put(&ctx->root->cgrp); - return dentry; + return ret; } -static struct dentry *cgroup_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +/* + * Destroy a cgroup filesystem context. + */ +static void cgroup_fs_context_free(struct fs_context *fc) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct dentry *dentry; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + + kfree(ctx->name); + kfree(ctx->release_agent); + put_cgroup_ns(ctx->ns); + kernfs_free_fs_context(fc); + kfree(ctx); +} + +static int cgroup_get_tree(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; - get_cgroup_ns(ns); + cgrp_dfl_visible = true; + cgroup_get_live(&cgrp_dfl_root.cgrp); + ctx->root = &cgrp_dfl_root; - /* Check if the caller has permission to mount. */ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { - put_cgroup_ns(ns); - return ERR_PTR(-EPERM); - } + ret = cgroup_do_get_tree(fc); + if (!ret) + apply_cgroup_root_flags(ctx->flags); + return ret; +} + +static const struct fs_context_operations cgroup_fs_context_ops = { + .free = cgroup_fs_context_free, + .parse_param = cgroup2_parse_param, + .get_tree = cgroup_get_tree, + .reconfigure = cgroup_reconfigure, +}; + +static const struct fs_context_operations cgroup1_fs_context_ops = { + .free = cgroup_fs_context_free, + .parse_param = cgroup1_parse_param, + .get_tree = cgroup1_get_tree, + .reconfigure = cgroup1_reconfigure, +}; + +/* + * Initialise the cgroup filesystem creation/reconfiguration context. Notably, + * we select the namespace we're going to use. + */ +static int cgroup_init_fs_context(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx; + + ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; /* * The first time anyone tries to mount a cgroup, enable the list @@ -2090,29 +2142,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - if (fs_type == &cgroup2_fs_type) { - unsigned int root_flags; - - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) { - put_cgroup_ns(ns); - return ERR_PTR(ret); - } - - cgrp_dfl_visible = true; - cgroup_get_live(&cgrp_dfl_root.cgrp); - - dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, - CGROUP2_SUPER_MAGIC, ns); - if (!IS_ERR(dentry)) - apply_cgroup_root_flags(root_flags); - } else { - dentry = cgroup1_mount(&cgroup_fs_type, flags, data, - CGROUP_SUPER_MAGIC, ns); - } - - put_cgroup_ns(ns); - return dentry; + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); + fc->fs_private = &ctx->kfc; + if (fc->fs_type == &cgroup2_fs_type) + fc->ops = &cgroup_fs_context_ops; + else + fc->ops = &cgroup1_fs_context_ops; + if (fc->user_ns) + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->ns->user_ns); + fc->global = true; + return 0; } static void cgroup_kill_sb(struct super_block *sb) @@ -2135,17 +2176,19 @@ static void cgroup_kill_sb(struct super_block *sb) } struct file_system_type cgroup_fs_type = { - .name = "cgroup", - .mount = cgroup_mount, - .kill_sb = cgroup_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "cgroup", + .init_fs_context = cgroup_init_fs_context, + .parameters = &cgroup1_fs_parameters, + .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { - .name = "cgroup2", - .mount = cgroup_mount, - .kill_sb = cgroup_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "cgroup2", + .init_fs_context = cgroup_init_fs_context, + .parameters = &cgroup2_fs_parameters, + .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, @@ -5280,7 +5323,6 @@ int cgroup_rmdir(struct kernfs_node *kn) static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, @@ -5347,11 +5389,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) */ int __init cgroup_init_early(void) { - static struct cgroup_sb_opts __initdata opts; + static struct cgroup_fs_context __initdata ctx; struct cgroup_subsys *ss; int i; - init_cgroup_root(&cgrp_dfl_root, &opts); + ctx.root = &cgrp_dfl_root; + init_cgroup_root(&ctx); cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; RCU_INIT_POINTER(init_task.cgroups, &init_css_set); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c @@ -39,6 +39,7 @@ #include <linux/memory.h> #include <linux/export.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> @@ -359,25 +360,52 @@ static inline bool is_in_v2_mode(void) * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ -static struct dentry *cpuset_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, void *data) -{ - struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - struct dentry *ret = ERR_PTR(-ENODEV); - if (cgroup_fs) { - char mountopts[] = - "cpuset,noprefix," - "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->mount(cgroup_fs, flags, - unused_dev_name, mountopts); - put_filesystem(cgroup_fs); +static int cpuset_get_tree(struct fs_context *fc) +{ + struct file_system_type *cgroup_fs; + struct fs_context *new_fc; + int ret; + + cgroup_fs = get_fs_type("cgroup"); + if (!cgroup_fs) + return -ENODEV; + + new_fc = fs_context_for_mount(cgroup_fs, fc->sb_flags); + if (IS_ERR(new_fc)) { + ret = PTR_ERR(new_fc); + } else { + static const char agent_path[] = "/sbin/cpuset_release_agent"; + ret = vfs_parse_fs_string(new_fc, "cpuset", NULL, 0); + if (!ret) + ret = vfs_parse_fs_string(new_fc, "noprefix", NULL, 0); + if (!ret) + ret = vfs_parse_fs_string(new_fc, "release_agent", + agent_path, sizeof(agent_path) - 1); + if (!ret) + ret = vfs_get_tree(new_fc); + if (!ret) { /* steal the result */ + fc->root = new_fc->root; + new_fc->root = NULL; + } + put_fs_context(new_fc); } + put_filesystem(cgroup_fs); return ret; } +static const struct fs_context_operations cpuset_fs_context_ops = { + .get_tree = cpuset_get_tree, +}; + +static int cpuset_init_fs_context(struct fs_context *fc) +{ + fc->ops = &cpuset_fs_context_ops; + return 0; +} + static struct file_system_type cpuset_fs_type = { - .name = "cpuset", - .mount = cpuset_mount, + .name = "cpuset", + .init_fs_context = cpuset_init_fs_context, }; /* diff --git a/security/security.c b/security/security.c @@ -764,6 +764,16 @@ void security_bprm_committed_creds(struct linux_binprm *bprm) call_void_hook(bprm_committed_creds, bprm); } +int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) +{ + return call_int_hook(fs_context_dup, 0, fc, src_fc); +} + +int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + return call_int_hook(fs_context_parse_param, -ENOPARAM, fc, param); +} + int security_sb_alloc(struct super_block *sb) { return call_int_hook(sb_alloc_security, 0, sb); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c @@ -48,6 +48,8 @@ #include <linux/fdtable.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/tty.h> @@ -410,11 +412,11 @@ static inline int inode_doinit(struct inode *inode) enum { Opt_error = -1, - Opt_context = 1, + Opt_context = 0, + Opt_defcontext = 1, Opt_fscontext = 2, - Opt_defcontext = 3, - Opt_rootcontext = 4, - Opt_seclabel = 5, + Opt_rootcontext = 3, + Opt_seclabel = 4, }; #define A(s, has_arg) {#s, sizeof(#s) - 1, Opt_##s, has_arg} @@ -1067,6 +1069,7 @@ static int show_sid(struct seq_file *m, u32 sid) if (!rc) { bool has_comma = context && strchr(context, ','); + seq_putc(m, '='); if (has_comma) seq_putc(m, '\"'); seq_escape(m, context, "\"\n\\"); @@ -1120,7 +1123,7 @@ static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb) } if (sbsec->flags & SBLABEL_MNT) { seq_putc(m, ','); - seq_puts(m, LABELSUPP_STR); + seq_puts(m, SECLABEL_STR); } return 0; } @@ -2739,6 +2742,76 @@ static int selinux_umount(struct vfsmount *mnt, int flags) FILESYSTEM__UNMOUNT, NULL); } +static int selinux_fs_context_dup(struct fs_context *fc, + struct fs_context *src_fc) +{ + const struct selinux_mnt_opts *src = src_fc->security; + struct selinux_mnt_opts *opts; + + if (!src) + return 0; + + fc->security = kzalloc(sizeof(struct selinux_mnt_opts), GFP_KERNEL); + if (!fc->security) + return -ENOMEM; + + opts = fc->security; + + if (src->fscontext) { + opts->fscontext = kstrdup(src->fscontext, GFP_KERNEL); + if (!opts->fscontext) + return -ENOMEM; + } + if (src->context) { + opts->context = kstrdup(src->context, GFP_KERNEL); + if (!opts->context) + return -ENOMEM; + } + if (src->rootcontext) { + opts->rootcontext = kstrdup(src->rootcontext, GFP_KERNEL); + if (!opts->rootcontext) + return -ENOMEM; + } + if (src->defcontext) { + opts->defcontext = kstrdup(src->defcontext, GFP_KERNEL); + if (!opts->defcontext) + return -ENOMEM; + } + return 0; +} + +static const struct fs_parameter_spec selinux_param_specs[] = { + fsparam_string(CONTEXT_STR, Opt_context), + fsparam_string(DEFCONTEXT_STR, Opt_defcontext), + fsparam_string(FSCONTEXT_STR, Opt_fscontext), + fsparam_string(ROOTCONTEXT_STR, Opt_rootcontext), + fsparam_flag (SECLABEL_STR, Opt_seclabel), + {} +}; + +static const struct fs_parameter_description selinux_fs_parameters = { + .name = "SELinux", + .specs = selinux_param_specs, +}; + +static int selinux_fs_context_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct fs_parse_result result; + int opt, rc; + + opt = fs_parse(fc, &selinux_fs_parameters, param, &result); + if (opt < 0) + return opt; + + rc = selinux_add_opt(opt, param->string, &fc->security); + if (!rc) { + param->string = NULL; + rc = 1; + } + return rc; +} + /* inode security operations */ static int selinux_inode_alloc_security(struct inode *inode) @@ -6592,6 +6665,9 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds), + LSM_HOOK_INIT(fs_context_dup, selinux_fs_context_dup), + LSM_HOOK_INIT(fs_context_parse_param, selinux_fs_context_parse_param), + LSM_HOOK_INIT(sb_alloc_security, selinux_sb_alloc_security), LSM_HOOK_INIT(sb_free_security, selinux_sb_free_security), LSM_HOOK_INIT(sb_eat_lsm_opts, selinux_sb_eat_lsm_opts), @@ -6837,6 +6913,8 @@ static __init int selinux_init(void) else pr_debug("SELinux: Starting in permissive mode\n"); + fs_validate_description(&selinux_fs_parameters); + return 0; } diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h @@ -59,11 +59,11 @@ #define SE_SBPROC 0x0200 #define SE_SBGENFS 0x0400 -#define CONTEXT_STR "context=" -#define FSCONTEXT_STR "fscontext=" -#define ROOTCONTEXT_STR "rootcontext=" -#define DEFCONTEXT_STR "defcontext=" -#define LABELSUPP_STR "seclabel" +#define CONTEXT_STR "context" +#define FSCONTEXT_STR "fscontext" +#define ROOTCONTEXT_STR "rootcontext" +#define DEFCONTEXT_STR "defcontext" +#define SECLABEL_STR "seclabel" struct netlbl_lsm_secattr; diff --git a/security/smack/smack.h b/security/smack/smack.h @@ -196,22 +196,13 @@ struct smack_known_list_elem { enum { Opt_error = -1, - Opt_fsdefault = 1, - Opt_fsfloor = 2, - Opt_fshat = 3, - Opt_fsroot = 4, - Opt_fstransmute = 5, + Opt_fsdefault = 0, + Opt_fsfloor = 1, + Opt_fshat = 2, + Opt_fsroot = 3, + Opt_fstransmute = 4, }; -/* - * Mount options - */ -#define SMK_FSDEFAULT "smackfsdef=" -#define SMK_FSFLOOR "smackfsfloor=" -#define SMK_FSHAT "smackfshat=" -#define SMK_FSROOT "smackfsroot=" -#define SMK_FSTRANS "smackfstransmute=" - #define SMACK_DELETE_OPTION "-DELETE" #define SMACK_CIPSO_OPTION "-CIPSO" diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c @@ -43,6 +43,8 @@ #include <linux/shm.h> #include <linux/binfmts.h> #include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include "smack.h" #define TRANS_TRUE "TRUE" @@ -526,7 +528,6 @@ static int smack_syslog(int typefrom_file) return rc; } - /* * Superblock Hooks. */ @@ -631,6 +632,92 @@ out_opt_err: return -EINVAL; } +/** + * smack_fs_context_dup - Duplicate the security data on fs_context duplication + * @fc: The new filesystem context. + * @src_fc: The source filesystem context being duplicated. + * + * Returns 0 on success or -ENOMEM on error. + */ +static int smack_fs_context_dup(struct fs_context *fc, + struct fs_context *src_fc) +{ + struct smack_mnt_opts *dst, *src = src_fc->security; + + if (!src) + return 0; + + fc->security = kzalloc(sizeof(struct smack_mnt_opts), GFP_KERNEL); + if (!fc->security) + return -ENOMEM; + dst = fc->security; + + if (src->fsdefault) { + dst->fsdefault = kstrdup(src->fsdefault, GFP_KERNEL); + if (!dst->fsdefault) + return -ENOMEM; + } + if (src->fsfloor) { + dst->fsfloor = kstrdup(src->fsfloor, GFP_KERNEL); + if (!dst->fsfloor) + return -ENOMEM; + } + if (src->fshat) { + dst->fshat = kstrdup(src->fshat, GFP_KERNEL); + if (!dst->fshat) + return -ENOMEM; + } + if (src->fsroot) { + dst->fsroot = kstrdup(src->fsroot, GFP_KERNEL); + if (!dst->fsroot) + return -ENOMEM; + } + if (src->fstransmute) { + dst->fstransmute = kstrdup(src->fstransmute, GFP_KERNEL); + if (!dst->fstransmute) + return -ENOMEM; + } + return 0; +} + +static const struct fs_parameter_spec smack_param_specs[] = { + fsparam_string("fsdefault", Opt_fsdefault), + fsparam_string("fsfloor", Opt_fsfloor), + fsparam_string("fshat", Opt_fshat), + fsparam_string("fsroot", Opt_fsroot), + fsparam_string("fstransmute", Opt_fstransmute), + {} +}; + +static const struct fs_parameter_description smack_fs_parameters = { + .name = "smack", + .specs = smack_param_specs, +}; + +/** + * smack_fs_context_parse_param - Parse a single mount parameter + * @fc: The new filesystem context being constructed. + * @param: The parameter. + * + * Returns 0 on success, -ENOPARAM to pass the parameter on or anything else on + * error. + */ +static int smack_fs_context_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct fs_parse_result result; + int opt, rc; + + opt = fs_parse(fc, &smack_fs_parameters, param, &result); + if (opt < 0) + return opt; + + rc = smack_add_opt(opt, param->string, &fc->security); + if (!rc) + param->string = NULL; + return rc; +} + static int smack_sb_eat_lsm_opts(char *options, void **mnt_opts) { char *from = options, *to = options; @@ -4495,6 +4582,9 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme), LSM_HOOK_INIT(syslog, smack_syslog), + LSM_HOOK_INIT(fs_context_dup, smack_fs_context_dup), + LSM_HOOK_INIT(fs_context_parse_param, smack_fs_context_parse_param), + LSM_HOOK_INIT(sb_alloc_security, smack_sb_alloc_security), LSM_HOOK_INIT(sb_free_security, smack_sb_free_security), LSM_HOOK_INIT(sb_free_mnt_opts, smack_free_mnt_opts),