return rc;
}
- rc = posix_acl_valid(acl);
+ rc = posix_acl_valid(&init_user_ns, acl);
if (rc) {
CERROR("validate acl: %d\n", rc);
posix_acl_release(acl);
if (IS_ERR(acl))
return PTR_ERR(acl);
else if (acl) {
- retval = posix_acl_valid(acl);
+ retval = posix_acl_valid(inode->i_sb->s_user_ns, acl);
if (retval)
goto err_out;
}
if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
return 0;
+ /*
+ * Verify that uid/gid changes are valid in the target
+ * namespace of the superblock.
+ */
+ if (ia_valid & ATTR_UID &&
+ !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
+ return -EOVERFLOW;
+ if (ia_valid & ATTR_GID &&
+ !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
+ return -EOVERFLOW;
+
+ /* Don't allow modifications of files with invalid uids or
+ * gids unless those uids & gids are being made valid.
+ */
+ if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid))
+ return -EOVERFLOW;
+ if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid))
+ return -EOVERFLOW;
+
error = security_inode_setattr(dentry, attr);
if (error)
return error;
if (!S_ISBLK(inode->i_mode))
goto fail;
error = -EACCES;
- if (path.mnt->mnt_flags & MNT_NODEV)
+ if (!may_open_dev(&path))
goto fail;
error = -ENOMEM;
bdev = bd_acquire(inode);
{
struct inode *inode;
+ s->s_iflags &= ~SB_I_NODEV;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = DEVPTS_SUPER_MAGIC;
.name = "devpts",
.mount = devpts_mount,
.kill_sb = devpts_kill_sb,
- .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT,
};
/*
bprm->cred->euid = current_euid();
bprm->cred->egid = current_egid();
- if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+ if (!mnt_may_suid(bprm->file->f_path.mnt))
return;
if (task_no_new_privs(current))
if (inode->i_flags & S_NOATIME)
return false;
+
+ /* Atime updates will likely cause i_uid and i_gid to be written
+ * back improprely if their true value is unknown to the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return false;
+
if (IS_NOATIME(inode))
return false;
if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
struct dentry *root;
info->sb = sb;
+ /* Userspace would break if executables or devices appear on sysfs */
+ sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = magic;
info->root = root;
info->ns = ns;
- sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
+ sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
+ &init_user_ns, info);
if (IS_ERR(sb) || sb->s_fs_info != info)
kfree(info);
if (IS_ERR(sb))
#include <linux/posix_acl.h>
#include <linux/hash.h>
#include <linux/bitops.h>
+#include <linux/init_task.h>
#include <asm/uaccess.h>
#include "internal.h"
*/
if (IS_IMMUTABLE(inode))
return -EACCES;
+
+ /*
+ * Updating mtime will likely cause i_uid and i_gid to be
+ * written back improperly if their true value is unknown
+ * to the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return -EACCES;
}
retval = do_inode_permission(inode, mask);
{
const struct inode *inode;
const struct inode *parent;
+ kuid_t puid;
if (!sysctl_protected_symlinks)
return 0;
return 0;
/* Allowed if parent directory and link owner match. */
- if (uid_eq(parent->i_uid, inode->i_uid))
+ puid = parent->i_uid;
+ if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
return 0;
if (nd->flags & LOOKUP_RCU)
bool *need_mntput)
{
struct vfsmount *mnt;
+ const struct cred *old_cred;
int err;
if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
path->dentry->d_inode)
return -EISDIR;
+ if (path->dentry->d_sb->s_user_ns != &init_user_ns)
+ return -EACCES;
+
nd->total_link_count++;
if (nd->total_link_count >= 40)
return -ELOOP;
+ old_cred = override_creds(&init_cred);
mnt = path->dentry->d_op->d_automount(path);
+ revert_creds(old_cred);
if (IS_ERR(mnt)) {
/*
* The filesystem is allowed to return -EISDIR here to indicate
* c. have CAP_FOWNER capability
* 6. If the victim is append-only or immutable we can't do antyhing with
* links pointing to it.
- * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
- * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
- * 9. We can't remove a root or mountpoint.
- * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ * 7. If the victim has an unknown uid or gid we can't change the inode.
+ * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 10. We can't remove a root or mountpoint.
+ * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
* nfs_async_unlink().
*/
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
return -EPERM;
if (check_sticky(dir, inode) || IS_APPEND(inode) ||
- IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
+ IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
return -EPERM;
if (isdir) {
if (!d_is_dir(victim))
* 1. We can't do it if child already exists (open has special treatment for
* this case, but since we are inlined it's OK)
* 2. We can't do it if dir is read-only (done in permission())
- * 3. We should have write and exec permissions on dir
- * 4. We can't do it if dir is immutable (done in permission())
+ * 3. We can't do it if the fs can't represent the fsuid or fsgid.
+ * 4. We should have write and exec permissions on dir
+ * 5. We can't do it if dir is immutable (done in permission())
*/
static inline int may_create(struct inode *dir, struct dentry *child)
{
+ struct user_namespace *s_user_ns;
audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
if (child->d_inode)
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
+ s_user_ns = dir->i_sb->s_user_ns;
+ if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+ !kgid_has_mapping(s_user_ns, current_fsgid()))
+ return -EOVERFLOW;
return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}
}
EXPORT_SYMBOL(vfs_create);
+bool may_open_dev(const struct path *path)
+{
+ return !(path->mnt->mnt_flags & MNT_NODEV) &&
+ !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
+}
+
static int may_open(struct path *path, int acc_mode, int flag)
{
struct dentry *dentry = path->dentry;
break;
case S_IFBLK:
case S_IFCHR:
- if (path->mnt->mnt_flags & MNT_NODEV)
+ if (!may_open_dev(path))
return -EACCES;
/*FALLTHRU*/
case S_IFIFO:
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return -EPERM;
+ /*
+ * Updating the link count will likely cause i_uid and i_gid to
+ * be writen back improperly if their true value is unknown to
+ * the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return -EPERM;
if (!dir->i_op->link)
return -EPERM;
if (S_ISDIR(inode->i_mode))
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
!(mnt_flags & MNT_NODEV)) {
- /* Was the nodev implicitly added in mount? */
- if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
- !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
- mnt_flags |= MNT_NODEV;
- } else {
- return -EPERM;
- }
+ return -EPERM;
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
!(mnt_flags & MNT_NOSUID)) {
return err;
}
-static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags);
+static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
/*
* create a new mount for userspace and request it to be added into the
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
- struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
struct vfsmount *mnt;
int err;
if (!type)
return -ENODEV;
- if (user_ns != &init_user_ns) {
- if (!(type->fs_flags & FS_USERNS_MOUNT)) {
- put_filesystem(type);
- return -EPERM;
- }
- /* Only in special cases allow devices from mounts
- * created outside the initial user namespace.
- */
- if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
- flags |= MS_NODEV;
- mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
- }
- if (type->fs_flags & FS_USERNS_VISIBLE) {
- if (!fs_fully_visible(type, &mnt_flags)) {
- put_filesystem(type);
- return -EPERM;
- }
- }
- }
-
mnt = vfs_kern_mount(type, flags, name, data);
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
!mnt->mnt_sb->s_subtype)
if (IS_ERR(mnt))
return PTR_ERR(mnt);
+ if (mount_too_revealing(mnt, &mnt_flags)) {
+ mntput(mnt);
+ return -EPERM;
+ }
+
err = do_add_mount(real_mount(mnt), path, mnt_flags);
if (err)
mntput(mnt);
return chrooted;
}
-static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
+static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
+ int *new_mnt_flags)
{
- struct mnt_namespace *ns = current->nsproxy->mnt_ns;
int new_flags = *new_mnt_flags;
struct mount *mnt;
bool visible = false;
- if (unlikely(!ns))
- return false;
-
down_read(&namespace_sem);
list_for_each_entry(mnt, &ns->list, mnt_list) {
struct mount *child;
int mnt_flags;
- if (mnt->mnt.mnt_sb->s_type != type)
+ if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
continue;
/* This mount is not fully visible if it's root directory
if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
continue;
- /* Read the mount flags and filter out flags that
- * may safely be ignored.
- */
+ /* A local view of the mount flags */
mnt_flags = mnt->mnt.mnt_flags;
- if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
- mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC);
/* Don't miss readonly hidden in the superblock flags */
if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY)
if ((mnt_flags & MNT_LOCK_READONLY) &&
!(new_flags & MNT_READONLY))
continue;
- if ((mnt_flags & MNT_LOCK_NODEV) &&
- !(new_flags & MNT_NODEV))
- continue;
- if ((mnt_flags & MNT_LOCK_NOSUID) &&
- !(new_flags & MNT_NOSUID))
- continue;
- if ((mnt_flags & MNT_LOCK_NOEXEC) &&
- !(new_flags & MNT_NOEXEC))
- continue;
if ((mnt_flags & MNT_LOCK_ATIME) &&
((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
continue;
}
/* Preserve the locked attributes */
*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
- MNT_LOCK_NODEV | \
- MNT_LOCK_NOSUID | \
- MNT_LOCK_NOEXEC | \
MNT_LOCK_ATIME);
visible = true;
goto found;
return visible;
}
+static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
+{
+ const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ unsigned long s_iflags;
+
+ if (ns->user_ns == &init_user_ns)
+ return false;
+
+ /* Can this filesystem be too revealing? */
+ s_iflags = mnt->mnt_sb->s_iflags;
+ if (!(s_iflags & SB_I_USERNS_VISIBLE))
+ return false;
+
+ if ((s_iflags & required_iflags) != required_iflags) {
+ WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
+ required_iflags);
+ return true;
+ }
+
+ return !mnt_already_visible(ns, mnt, new_mnt_flags);
+}
+
+bool mnt_may_suid(struct vfsmount *mnt)
+{
+ /*
+ * Foreign mounts (accessed via fchdir or through /proc
+ * symlinks) are always treated as if they are nosuid. This
+ * prevents namespaces from trusting potentially unsafe
+ * suid/sgid bits, file caps, or security labels that originate
+ * in other namespaces.
+ */
+ return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
+ current_in_userns(mnt->mnt_sb->s_user_ns);
+}
+
static struct ns_common *mntns_get(struct task_struct *task)
{
struct ns_common *ns = NULL;
#endif
/* last one */ {""}
};
- struct net *net = data;
- int ret;
-
- ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
- if (ret)
- return ret;
- sb->s_fs_info = get_net(net);
- return 0;
+ get_net(sb->s_fs_info);
+ return simple_fill_super(sb, 0x6e667364, nfsd_files);
}
static struct dentry *nfsd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
+ struct net *net = current->nsproxy->net_ns;
+ return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super);
}
static void nfsd_umount(struct super_block *sb)
* Check if an acl is valid. Returns 0 if it is, or -E... otherwise.
*/
int
-posix_acl_valid(const struct posix_acl *acl)
+posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl)
{
const struct posix_acl_entry *pa, *pe;
int state = ACL_USER_OBJ;
case ACL_USER:
if (state != ACL_USER)
return -EINVAL;
- if (!uid_valid(pa->e_uid))
+ if (!kuid_has_mapping(user_ns, pa->e_uid))
return -EINVAL;
needs_mask = 1;
break;
case ACL_GROUP:
if (state != ACL_GROUP)
return -EINVAL;
- if (!gid_valid(pa->e_gid))
+ if (!kgid_has_mapping(user_ns, pa->e_gid))
return -EINVAL;
needs_mask = 1;
break;
return -EPERM;
if (acl) {
- int ret = posix_acl_valid(acl);
+ int ret = posix_acl_valid(inode->i_sb->s_user_ns, acl);
if (ret)
return ret;
}
return inode;
}
-int proc_fill_super(struct super_block *s)
+int proc_fill_super(struct super_block *s, void *data, int silent)
{
+ struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
struct inode *root_inode;
int ret;
+ if (!proc_parse_options(data, ns))
+ return -EINVAL;
+
+ /* User space would break if executables or devices appear on proc */
+ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = PROC_SUPER_MAGIC;
s->s_op = &proc_sops;
s->s_time_gran = 1;
+
+ /*
+ * procfs isn't actually a stacking filesystem; however, there is
+ * too much magic going on inside it to permit stacking things on
+ * top of it
+ */
+ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
extern void proc_init_inodecache(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-extern int proc_fill_super(struct super_block *);
+extern int proc_fill_super(struct super_block *, void *data, int flags);
extern void proc_entry_rundown(struct proc_dir_entry *);
/*
* root.c
*/
extern struct proc_dir_entry proc_root;
+extern int proc_parse_options(char *options, struct pid_namespace *pid);
extern void proc_self_init(void);
extern int proc_remount(struct super_block *, int *, char *);
#include "internal.h"
-static int proc_test_super(struct super_block *sb, void *data)
-{
- return sb->s_fs_info == data;
-}
-
-static int proc_set_super(struct super_block *sb, void *data)
-{
- int err = set_anon_super(sb, NULL);
- if (!err) {
- struct pid_namespace *ns = (struct pid_namespace *)data;
- sb->s_fs_info = get_pid_ns(ns);
- }
- return err;
-}
-
enum {
Opt_gid, Opt_hidepid, Opt_err,
};
{Opt_err, NULL},
};
-static int proc_parse_options(char *options, struct pid_namespace *pid)
+int proc_parse_options(char *options, struct pid_namespace *pid)
{
char *p;
substring_t args[MAX_OPT_ARGS];
static struct dentry *proc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- int err;
- struct super_block *sb;
struct pid_namespace *ns;
- char *options;
if (flags & MS_KERNMOUNT) {
- ns = (struct pid_namespace *)data;
- options = NULL;
+ ns = data;
+ data = NULL;
} else {
ns = task_active_pid_ns(current);
- options = data;
-
- /* Does the mounter have privilege over the pid namespace? */
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
- return ERR_PTR(-EPERM);
- }
-
- sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
- if (IS_ERR(sb))
- return ERR_CAST(sb);
-
- /*
- * procfs isn't actually a stacking filesystem; however, there is
- * too much magic going on inside it to permit stacking things on
- * top of it
- */
- sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
-
- if (!proc_parse_options(options, ns)) {
- deactivate_locked_super(sb);
- return ERR_PTR(-EINVAL);
- }
-
- if (!sb->s_root) {
- err = proc_fill_super(sb);
- if (err) {
- deactivate_locked_super(sb);
- return ERR_PTR(err);
- }
-
- sb->s_flags |= MS_ACTIVE;
- /* User space would break if executables appear on proc */
- sb->s_iflags |= SB_I_NOEXEC;
}
- return dget(sb->s_root);
+ return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
}
static void proc_kill_sb(struct super_block *sb)
.name = "proc",
.mount = proc_mount,
.kill_sb = proc_kill_sb,
- .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT,
};
void __init proc_root_init(void)
unsigned int hashent = hashfn(sb, qid);
struct dquot *dquot, *empty = NULL;
+ if (!qid_has_mapping(sb->s_user_ns, qid))
+ return ERR_PTR(-EINVAL);
+
if (!sb_has_quota_active(sb, qid.type))
return ERR_PTR(-ESRCH);
we_slept:
error = -EINVAL;
goto out_fmt;
}
+ /* Filesystems outside of init_user_ns not yet supported */
+ if (sb->s_user_ns != &init_user_ns) {
+ error = -EINVAL;
+ goto out_fmt;
+ }
/* Usage always has to be set... */
if (!(flags & DQUOT_USAGE_ENABLED)) {
error = -EINVAL;
if (!sb->s_qcop->get_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
if (ret)
if (!sb->s_qcop->get_nextdqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq);
if (ret)
if (!sb->s_qcop->set_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
copy_from_if_dqblk(&fdq, &idq);
return sb->s_qcop->set_dqblk(sb, qid, &fdq);
if (!sb->s_qcop->set_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
/* Are we actually setting timer / warning limits for all users? */
- if (from_kqid(&init_user_ns, qid) == 0 &&
+ if (from_kqid(sb->s_user_ns, qid) == 0 &&
fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) {
struct qc_info qinfo;
int ret;
if (!sb->s_qcop->get_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_dqblk(sb, qid, &qdq);
if (ret)
if (!sb->s_qcop->get_nextdqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq);
if (ret)
#include <linux/cleancache.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
+#include <linux/user_namespace.h>
#include "internal.h"
list_lru_destroy(&s->s_inode_lru);
security_sb_free(s);
WARN_ON(!list_empty(&s->s_mounts));
+ put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
kfree(s->s_options);
call_rcu(&s->rcu, destroy_super_rcu);
* alloc_super - create new superblock
* @type: filesystem type superblock should belong to
* @flags: the mount flags
+ * @user_ns: User namespace for the super_block
*
* Allocates and initializes a new &struct super_block. alloc_super()
* returns a pointer new superblock or %NULL if allocation had failed.
*/
-static struct super_block *alloc_super(struct file_system_type *type, int flags)
+static struct super_block *alloc_super(struct file_system_type *type, int flags,
+ struct user_namespace *user_ns)
{
struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
static const struct super_operations default_op;
return NULL;
INIT_LIST_HEAD(&s->s_mounts);
+ s->s_user_ns = get_user_ns(user_ns);
if (security_sb_alloc(s))
goto fail;
init_waitqueue_head(&s->s_writers.wait_unfrozen);
s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
+ if (s->s_user_ns != &init_user_ns)
+ s->s_iflags |= SB_I_NODEV;
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
mutex_init(&s->s_sync_lock);
EXPORT_SYMBOL(generic_shutdown_super);
/**
- * sget - find or create a superblock
+ * sget_userns - find or create a superblock
* @type: filesystem type superblock should belong to
* @test: comparison callback
* @set: setup callback
* @flags: mount flags
+ * @user_ns: User namespace for the super_block
* @data: argument to each of them
*/
-struct super_block *sget(struct file_system_type *type,
+struct super_block *sget_userns(struct file_system_type *type,
int (*test)(struct super_block *,void *),
int (*set)(struct super_block *,void *),
- int flags,
+ int flags, struct user_namespace *user_ns,
void *data)
{
struct super_block *s = NULL;
struct super_block *old;
int err;
+ if (!(flags & MS_KERNMOUNT) &&
+ !(type->fs_flags & FS_USERNS_MOUNT) &&
+ !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
retry:
spin_lock(&sb_lock);
if (test) {
hlist_for_each_entry(old, &type->fs_supers, s_instances) {
if (!test(old, data))
continue;
+ if (user_ns != old->s_user_ns) {
+ spin_unlock(&sb_lock);
+ if (s) {
+ up_write(&s->s_umount);
+ destroy_super(s);
+ }
+ return ERR_PTR(-EBUSY);
+ }
if (!grab_super(old))
goto retry;
if (s) {
}
if (!s) {
spin_unlock(&sb_lock);
- s = alloc_super(type, flags);
+ s = alloc_super(type, flags, user_ns);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
return s;
}
+EXPORT_SYMBOL(sget_userns);
+
+/**
+ * sget - find or create a superblock
+ * @type: filesystem type superblock should belong to
+ * @test: comparison callback
+ * @set: setup callback
+ * @flags: mount flags
+ * @data: argument to each of them
+ */
+struct super_block *sget(struct file_system_type *type,
+ int (*test)(struct super_block *,void *),
+ int (*set)(struct super_block *,void *),
+ int flags,
+ void *data)
+{
+ struct user_namespace *user_ns = current_user_ns();
+
+ /* Ensure the requestor has permissions over the target filesystem */
+ if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ return sget_userns(type, test, set, flags, user_ns, data);
+}
+
EXPORT_SYMBOL(sget);
void drop_super(struct super_block *sb)
return set_anon_super(sb, NULL);
}
-struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
- void *data, int (*fill_super)(struct super_block *, void *, int))
+struct dentry *mount_ns(struct file_system_type *fs_type,
+ int flags, void *data, void *ns, struct user_namespace *user_ns,
+ int (*fill_super)(struct super_block *, void *, int))
{
struct super_block *sb;
- sb = sget(fs_type, ns_test_super, ns_set_super, flags, data);
+ /* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+ * over the namespace.
+ */
+ if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
+ user_ns, ns);
if (IS_ERR(sb))
return ERR_CAST(sb);
if (IS_ERR(root) || !new_sb)
kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
else if (new_sb)
- /* Userspace would break if executables appear on sysfs */
- root->d_sb->s_iflags |= SB_I_NOEXEC;
+ root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
return root;
}
.name = "sysfs",
.mount = sysfs_mount,
.kill_sb = sysfs_kill_sb,
- .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT,
};
int __init sysfs_init(void)
if (mask & MAY_WRITE) {
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
+ /*
+ * Updating an xattr will likely cause i_uid and i_gid
+ * to be writen back improperly if their true value is
+ * unknown to the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return -EPERM;
}
/*
#endif
}
-/* Helper functions so that in most cases filesystems will
- * not need to deal directly with kuid_t and kgid_t and can
- * instead deal with the raw numeric values that are stored
- * in the filesystem.
- */
-static inline uid_t i_uid_read(const struct inode *inode)
-{
- return from_kuid(&init_user_ns, inode->i_uid);
-}
-
-static inline gid_t i_gid_read(const struct inode *inode)
-{
- return from_kgid(&init_user_ns, inode->i_gid);
-}
-
-static inline void i_uid_write(struct inode *inode, uid_t uid)
-{
- inode->i_uid = make_kuid(&init_user_ns, uid);
-}
-
-static inline void i_gid_write(struct inode *inode, gid_t gid)
-{
- inode->i_gid = make_kgid(&init_user_ns, gid);
-}
-
static inline unsigned iminor(const struct inode *inode)
{
return MINOR(inode->i_rdev);
/* sb->s_iflags */
#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */
+#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */
+
+/* sb->s_iflags to limit user namespace mounts */
+#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */
/* Possible states of 'frozen' field */
enum {
struct workqueue_struct *s_dio_done_wq;
struct hlist_head s_pins;
+ /*
+ * Owning user namespace and default context in which to
+ * interpret filesystem uids, gids, quotas, device nodes,
+ * xattrs and security labels.
+ */
+ struct user_namespace *s_user_ns;
+
/*
* Keep the lru lists last in the structure so they always sit on their
* own individual cachelines.
struct list_head s_inodes_wb; /* writeback inodes */
};
+/* Helper functions so that in most cases filesystems will
+ * not need to deal directly with kuid_t and kgid_t and can
+ * instead deal with the raw numeric values that are stored
+ * in the filesystem.
+ */
+static inline uid_t i_uid_read(const struct inode *inode)
+{
+ return from_kuid(inode->i_sb->s_user_ns, inode->i_uid);
+}
+
+static inline gid_t i_gid_read(const struct inode *inode)
+{
+ return from_kgid(inode->i_sb->s_user_ns, inode->i_gid);
+}
+
+static inline void i_uid_write(struct inode *inode, uid_t uid)
+{
+ inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid);
+}
+
+static inline void i_gid_write(struct inode *inode, gid_t gid)
+{
+ inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
+}
+
extern struct timespec current_fs_time(struct super_block *sb);
/*
*/
extern void inode_init_owner(struct inode *inode, const struct inode *dir,
umode_t mode);
+extern bool may_open_dev(const struct path *path);
/*
* VFS FS_IOC_FIEMAP helper definitions.
*/
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
+static inline bool HAS_UNMAPPED_ID(struct inode *inode)
+{
+ return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid);
+}
+
/*
* Inode state bits. Protected by inode->i_lock
*
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
-#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */
-#define FS_USERNS_VISIBLE 32 /* FS must already be visible */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
-extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
- void *data, int (*fill_super)(struct super_block *, void *, int));
+extern struct dentry *mount_ns(struct file_system_type *fs_type,
+ int flags, void *data, void *ns, struct user_namespace *user_ns,
+ int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int));
int set_anon_super(struct super_block *s, void *data);
int get_anon_bdev(dev_t *);
void free_anon_bdev(dev_t);
+struct super_block *sget_userns(struct file_system_type *type,
+ int (*test)(struct super_block *,void *),
+ int (*set)(struct super_block *,void *),
+ int flags, struct user_namespace *user_ns,
+ void *data);
struct super_block *sget(struct file_system_type *type,
int (*test)(struct super_block *,void *),
int (*set)(struct super_block *,void *),
extern struct vfsmount *mntget(struct vfsmount *mnt);
extern struct vfsmount *mnt_clone_internal(struct path *path);
extern int __mnt_is_readonly(struct vfsmount *mnt);
+extern bool mnt_may_suid(struct vfsmount *mnt);
struct path;
extern struct vfsmount *clone_private_mount(struct path *path);
extern void posix_acl_init(struct posix_acl *, int);
extern struct posix_acl *posix_acl_alloc(int, gfp_t);
-extern int posix_acl_valid(const struct posix_acl *);
+extern int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
extern int posix_acl_permission(struct inode *, const struct posix_acl *, int);
extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
return kqid;
}
+/**
+ * qid_has_mapping - Report if a qid maps into a user namespace.
+ * @ns: The user namespace to see if a value maps into.
+ * @qid: The kernel internal quota identifier to test.
+ */
+static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid)
+{
+ return from_kqid(ns, qid) != (qid_t) -1;
+}
+
extern spinlock_t dq_data_lock;
static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid)
{
- return true;
+ return uid_valid(uid);
}
static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
{
- return true;
+ return gid_valid(gid);
}
#endif /* CONFIG_USER_NS */
extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
+extern bool current_in_userns(const struct user_namespace *target_ns);
#else
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
return true;
}
+
+static inline bool current_in_userns(const struct user_namespace *target_ns)
+{
+ return true;
+}
#endif
#endif /* _LINUX_USER_H */
static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
- struct ipc_namespace *ns = data;
+ struct ipc_namespace *ns = sb->s_fs_info;
+ sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = MQUEUE_MAGIC;
int flags, const char *dev_name,
void *data)
{
- if (!(flags & MS_KERNMOUNT)) {
- struct ipc_namespace *ns = current->nsproxy->ipc_ns;
- /* Don't allow mounting unless the caller has CAP_SYS_ADMIN
- * over the ipc namespace.
- */
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
- return ERR_PTR(-EPERM);
-
- data = ns;
+ struct ipc_namespace *ns;
+ if (flags & MS_KERNMOUNT) {
+ ns = data;
+ data = NULL;
+ } else {
+ ns = current->nsproxy->ipc_ns;
}
- return mount_ns(fs_type, flags, data, mqueue_fill_super);
+ return mount_ns(fs_type, flags, data, ns, ns->user_ns, mqueue_fill_super);
}
static void init_once(void *foo)
ns->ns.ops = &ipcns_operations;
atomic_set(&ns->count, 1);
+ ns->user_ns = get_user_ns(user_ns);
+
err = mq_init_ns(ns);
if (err) {
+ put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
return ERR_PTR(err);
msg_init_ns(ns);
shm_init_ns(ns);
- ns->user_ns = get_user_ns(user_ns);
-
return ns;
}
*/
int set_create_files_as(struct cred *new, struct inode *inode)
{
+ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ return -EINVAL;
new->fsuid = inode->i_uid;
new->fsgid = inode->i_gid;
return security_kernel_create_files_as(new, inode);
return allowed;
}
+/*
+ * Returns true if @ns is the same namespace as or a descendant of
+ * @target_ns.
+ */
+bool current_in_userns(const struct user_namespace *target_ns)
+{
+ struct user_namespace *ns;
+ for (ns = current_user_ns(); ns; ns = ns->parent) {
+ if (ns == target_ns)
+ return true;
+ }
+ return false;
+}
+
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
return container_of(ns, struct user_namespace, ns);
{
struct inode *inode;
struct dentry *root, *gssd_dentry;
- struct net *net = data;
+ struct net *net = get_net(sb->s_fs_info);
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
int err;
sb);
if (err)
goto err_depopulate;
- sb->s_fs_info = get_net(net);
mutex_unlock(&sn->pipefs_sb_lock);
return 0;
rpc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_ns(fs_type, flags, current->nsproxy->net_ns, rpc_fill_super);
+ struct net *net = current->nsproxy->net_ns;
+ return mount_ns(fs_type, flags, data, net, net->user_ns, rpc_fill_super);
}
static void rpc_kill_sb(struct super_block *sb)
RPC_PIPEFS_UMOUNT,
sb);
mutex_unlock(&sn->pipefs_sb_lock);
- put_net(net);
out:
kill_litter_super(sb);
+ put_net(net);
}
static struct file_system_type rpc_pipe_fs_type = {
if (!file_caps_enabled)
return 0;
- if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+ if (!mnt_may_suid(bprm->file->f_path.mnt))
+ return 0;
+
+ /*
+ * This check is redundant with mnt_may_suid() but is kept to make
+ * explicit that capability bits are limited to s_user_ns and its
+ * descendants.
+ */
+ if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns))
return 0;
rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
memset(&hmac_misc, 0, sizeof(hmac_misc));
hmac_misc.ino = inode->i_ino;
hmac_misc.generation = inode->i_generation;
- hmac_misc.uid = from_kuid(&init_user_ns, inode->i_uid);
- hmac_misc.gid = from_kgid(&init_user_ns, inode->i_gid);
+ hmac_misc.uid = from_kuid(inode->i_sb->s_user_ns, inode->i_uid);
+ hmac_misc.gid = from_kgid(inode->i_sb->s_user_ns, inode->i_gid);
hmac_misc.mode = inode->i_mode;
crypto_shash_update(desc, (const u8 *)&hmac_misc, sizeof(hmac_misc));
if (evm_hmac_attrs & EVM_ATTR_FSUUID)
goto out;
}
}
+
+ /*
+ * If this is a user namespace mount, no contexts are allowed
+ * on the command line and security labels must be ignored.
+ */
+ if (sb->s_user_ns != &init_user_ns) {
+ if (context_sid || fscontext_sid || rootcontext_sid ||
+ defcontext_sid) {
+ rc = -EACCES;
+ goto out;
+ }
+ if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
+ sbsec->behavior = SECURITY_FS_USE_MNTPOINT;
+ rc = security_transition_sid(current_sid(), current_sid(),
+ SECCLASS_FILE, NULL,
+ &sbsec->mntpoint_sid);
+ if (rc)
+ goto out;
+ }
+ goto out_set_opts;
+ }
+
/* sets the context of the superblock for the fs being mounted. */
if (fscontext_sid) {
rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred);
sbsec->def_sid = defcontext_sid;
}
+out_set_opts:
rc = sb_finish_set_opts(sb);
out:
mutex_unlock(&sbsec->lock);
const struct task_security_struct *new_tsec)
{
int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS);
- int nosuid = (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID);
+ int nosuid = !mnt_may_suid(bprm->file->f_path.mnt);
int rc;
if (!nnp && !nosuid)
struct smack_known *smk_floor;
struct smack_known *smk_hat;
struct smack_known *smk_default;
- int smk_initialized;
+ int smk_flags;
};
+/*
+ * Superblock flags
+ */
+#define SMK_SB_INITIALIZED 0x01
+#define SMK_SB_UNTRUSTED 0x02
+
struct socket_smack {
struct smack_known *smk_out; /* outbound label */
struct smack_known *smk_in; /* inbound label */
sbsp->smk_floor = &smack_known_floor;
sbsp->smk_hat = &smack_known_hat;
/*
- * smk_initialized will be zero from kzalloc.
+ * SMK_SB_INITIALIZED will be zero from kzalloc.
*/
sb->s_security = sbsp;
int num_opts = opts->num_mnt_opts;
int transmute = 0;
- if (sp->smk_initialized)
+ if (sp->smk_flags & SMK_SB_INITIALIZED)
return 0;
- sp->smk_initialized = 1;
+ sp->smk_flags |= SMK_SB_INITIALIZED;
for (i = 0; i < num_opts; i++) {
switch (opts->mnt_opts_flags[i]) {
skp = smk_of_current();
sp->smk_root = skp;
sp->smk_default = skp;
+ /*
+ * For a handful of fs types with no user-controlled
+ * backing store it's okay to trust security labels
+ * in the filesystem. The rest are untrusted.
+ */
+ if (sb->s_user_ns != &init_user_ns &&
+ sb->s_magic != SYSFS_MAGIC && sb->s_magic != TMPFS_MAGIC &&
+ sb->s_magic != RAMFS_MAGIC) {
+ transmute = 1;
+ sp->smk_flags |= SMK_SB_UNTRUSTED;
+ }
}
/*
struct inode *inode = file_inode(bprm->file);
struct task_smack *bsp = bprm->cred->security;
struct inode_smack *isp;
+ struct superblock_smack *sbsp;
int rc;
if (bprm->cred_prepared)
if (isp->smk_task == NULL || isp->smk_task == bsp->smk_task)
return 0;
+ sbsp = inode->i_sb->s_security;
+ if ((sbsp->smk_flags & SMK_SB_UNTRUSTED) &&
+ isp->smk_task != sbsp->smk_root)
+ return 0;
+
if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
struct task_struct *tracer;
rc = 0;
*/
static int smack_inode_permission(struct inode *inode, int mask)
{
+ struct superblock_smack *sbsp = inode->i_sb->s_security;
struct smk_audit_info ad;
int no_block = mask & MAY_NOT_BLOCK;
int rc;
if (mask == 0)
return 0;
+ if (sbsp->smk_flags & SMK_SB_UNTRUSTED) {
+ if (smk_of_inode(inode) != sbsp->smk_root)
+ return -EACCES;
+ }
+
/* May be droppable after audit */
if (no_block)
return -ECHILD;
struct task_smack *tsp;
struct smack_known *okp;
struct inode_smack *isp;
+ struct superblock_smack *sbsp;
int may;
int mmay;
int tmay;
isp = file_inode(file)->i_security;
if (isp->smk_mmap == NULL)
return 0;
+ sbsp = file_inode(file)->i_sb->s_security;
+ if (sbsp->smk_flags & SMK_SB_UNTRUSTED &&
+ isp->smk_mmap != sbsp->smk_root)
+ return -EACCES;
mkp = isp->smk_mmap;
tsp = current_security();