2019-05-28 10:10:12 -07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
|
* linux/fs/namespace.c
|
|
|
|
|
*
|
|
|
|
|
* (C) Copyright Al Viro 2000, 2001
|
|
|
|
|
*
|
|
|
|
|
* Based on code from fs/super.c, copyright Linus Torvalds and others.
|
|
|
|
|
* Heavily rewritten.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include <linux/syscalls.h>
|
2011-12-07 13:06:11 -05:00
|
|
|
#include <linux/export.h>
|
2006-01-11 12:17:46 -08:00
|
|
|
#include <linux/capability.h>
|
2006-12-08 02:37:56 -08:00
|
|
|
#include <linux/mnt_namespace.h>
|
2012-07-26 21:08:32 -07:00
|
|
|
#include <linux/user_namespace.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/namei.h>
|
|
|
|
|
#include <linux/security.h>
|
2017-02-02 17:54:15 +01:00
|
|
|
#include <linux/cred.h>
|
2008-03-26 22:11:34 +01:00
|
|
|
#include <linux/idr.h>
|
2013-09-11 14:26:10 -07:00
|
|
|
#include <linux/init.h> /* init_rootfs */
|
2011-12-07 13:06:11 -05:00
|
|
|
#include <linux/fs_struct.h> /* get_fs_root et.al. */
|
|
|
|
|
#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
|
2018-11-05 17:40:30 +00:00
|
|
|
#include <linux/file.h>
|
2011-12-07 13:06:11 -05:00
|
|
|
#include <linux/uaccess.h>
|
2013-04-12 01:50:06 +01:00
|
|
|
#include <linux/proc_ns.h>
|
2013-05-01 17:51:54 -07:00
|
|
|
#include <linux/magic.h>
|
2018-10-30 15:09:49 -07:00
|
|
|
#include <linux/memblock.h>
|
2021-01-21 14:19:54 +01:00
|
|
|
#include <linux/proc_fs.h>
|
2014-08-08 13:08:20 -04:00
|
|
|
#include <linux/task_work.h>
|
2017-02-04 01:20:53 +01:00
|
|
|
#include <linux/sched/task.h>
|
2018-11-01 23:07:23 +00:00
|
|
|
#include <uapi/linux/mount.h>
|
2018-11-04 03:19:03 -05:00
|
|
|
#include <linux/fs_context.h>
|
2019-06-01 18:09:44 -04:00
|
|
|
#include <linux/shmem_fs.h>
|
2021-12-03 12:17:07 +01:00
|
|
|
#include <linux/mnt_idmapping.h>
|
2024-12-19 18:01:32 +01:00
|
|
|
#include <linux/pidfs.h>
|
2017-02-04 01:20:53 +01:00
|
|
|
|
2005-11-07 17:19:07 -05:00
|
|
|
#include "pnode.h"
|
2007-07-15 23:41:25 -07:00
|
|
|
#include "internal.h"
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2016-09-28 00:27:17 -05:00
|
|
|
/* Maximum number of mounts in a mount namespace */
|
2022-01-21 22:13:27 -08:00
|
|
|
static unsigned int sysctl_mount_max __read_mostly = 100000;
|
2016-09-28 00:27:17 -05:00
|
|
|
|
2023-10-11 19:55:00 +03:00
|
|
|
static unsigned int m_hash_mask __ro_after_init;
|
|
|
|
|
static unsigned int m_hash_shift __ro_after_init;
|
|
|
|
|
static unsigned int mp_hash_mask __ro_after_init;
|
|
|
|
|
static unsigned int mp_hash_shift __ro_after_init;
|
2014-02-28 13:46:44 -05:00
|
|
|
|
|
|
|
|
static __initdata unsigned long mhash_entries;
|
|
|
|
|
/*
 * Handler registered for the "mhash_entries=" setup string.
 *
 * Parses @str as an unsigned long (base 0, i.e. the prefix selects the
 * radix) and stores the result in mhash_entries.
 *
 * Returns 1 once a value has been stored, 0 if no argument string was
 * passed in.
 */
static int __init set_mhash_entries(char *str)
{
	if (str) {
		mhash_entries = simple_strtoul(str, &str, 0);
		return 1;
	}

	return 0;
}
|
|
|
|
|
__setup("mhash_entries=", set_mhash_entries);
|
|
|
|
|
|
|
|
|
|
static __initdata unsigned long mphash_entries;
|
|
|
|
|
/*
 * Handler registered for the "mphash_entries=" setup string.
 *
 * Parses @str as an unsigned long (base 0, i.e. the prefix selects the
 * radix) and stores the result in mphash_entries.
 *
 * Returns 1 once a value has been stored, 0 if no argument string was
 * passed in.
 */
static int __init set_mphash_entries(char *str)
{
	if (str) {
		mphash_entries = simple_strtoul(str, &str, 0);
		return 1;
	}

	return 0;
}
|
|
|
|
|
__setup("mphash_entries=", set_mphash_entries);
|
2008-02-06 01:37:57 -08:00
|
|
|
|
2014-02-27 14:40:10 -05:00
|
|
|
static u64 event;
|
2024-12-17 13:21:55 +01:00
|
|
|
static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
|
2008-03-27 13:06:23 +01:00
|
|
|
static DEFINE_IDA(mnt_group_ida);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2023-10-25 16:01:59 +02:00
|
|
|
/* Don't allow confusion with old 32bit mount ID */
|
2024-07-19 13:41:48 +02:00
|
|
|
#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
|
2024-12-17 13:21:55 +01:00
|
|
|
static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;
|
2023-10-25 16:01:59 +02:00
|
|
|
|
2023-10-11 19:55:00 +03:00
|
|
|
static struct hlist_head *mount_hashtable __ro_after_init;
|
|
|
|
|
static struct hlist_head *mountpoint_hashtable __ro_after_init;
|
|
|
|
|
static struct kmem_cache *mnt_cache __ro_after_init;
|
2013-09-16 21:34:53 -04:00
|
|
|
static DECLARE_RWSEM(namespace_sem);
|
2019-06-30 10:39:08 -04:00
|
|
|
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
|
|
|
|
|
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
|
2024-12-13 00:03:45 +01:00
|
|
|
static DEFINE_SEQLOCK(mnt_ns_tree_lock);
|
2024-12-13 00:03:42 +01:00
|
|
|
|
2025-01-29 17:58:01 +01:00
|
|
|
#ifdef CONFIG_FSNOTIFY
|
|
|
|
|
LIST_HEAD(notify_list); /* protected by namespace_sem */
|
|
|
|
|
#endif
|
2024-06-24 11:49:46 -04:00
|
|
|
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
|
2024-12-13 00:03:44 +01:00
|
|
|
static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2025-01-28 11:33:42 +01:00
|
|
|
/*
 * Kernel-internal flag bits carried in the kflags member of
 * struct mount_kattr.
 */
enum mount_kattr_flags_t {
	/* NOTE(review): presumably "apply to the whole mount tree" - confirm with users of kflags */
	MOUNT_KATTR_RECURSE		= 1 << 0,
	/*
	 * Introduced by "fs: allow changing idmappings", which makes it
	 * possible to create a new idmapped mount from an already idmapped
	 * mount and to clear idmappings.
	 */
	MOUNT_KATTR_IDMAP_REPLACE	= 1 << 1,
};
|
|
|
|
|
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
struct mount_kattr {
|
|
|
|
|
unsigned int attr_set;
|
|
|
|
|
unsigned int attr_clr;
|
|
|
|
|
unsigned int propagation;
|
|
|
|
|
unsigned int lookup_flags;
|
2025-01-28 11:33:42 +01:00
|
|
|
enum mount_kattr_flags_t kflags;
|
2021-01-21 14:19:54 +01:00
|
|
|
struct user_namespace *mnt_userns;
|
2022-10-26 12:51:27 +02:00
|
|
|
struct mnt_idmap *mnt_idmap;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
};
|
|
|
|
|
|
2006-01-16 22:14:23 -08:00
|
|
|
/* /sys/fs */
|
2023-10-11 19:55:00 +03:00
|
|
|
struct kobject *fs_kobj __ro_after_init;
|
2007-10-29 14:17:23 -06:00
|
|
|
EXPORT_SYMBOL_GPL(fs_kobj);
|
2006-01-16 22:14:23 -08:00
|
|
|
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability will not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
/*
|
|
|
|
|
* vfsmount lock may be taken for read to prevent changes to the
|
|
|
|
|
* vfsmount hash, ie. during mountpoint lookups or walking back
|
|
|
|
|
* up the tree.
|
|
|
|
|
*
|
|
|
|
|
* It should be taken for write in all cases where the vfsmount
|
|
|
|
|
* tree or hash is modified or when a vfsmount structure is modified.
|
|
|
|
|
*/
|
2013-09-29 22:06:07 -04:00
|
|
|
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability will not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
|
2024-06-24 11:49:46 -04:00
|
|
|
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
|
|
|
|
|
{
|
|
|
|
|
if (!node)
|
|
|
|
|
return NULL;
|
|
|
|
|
return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
|
|
|
|
|
}
|
|
|
|
|
|
2024-12-13 00:03:42 +01:00
|
|
|
static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
|
2024-06-24 11:49:46 -04:00
|
|
|
{
|
|
|
|
|
struct mnt_namespace *ns_a = node_to_mnt_ns(a);
|
|
|
|
|
struct mnt_namespace *ns_b = node_to_mnt_ns(b);
|
|
|
|
|
u64 seq_a = ns_a->seq;
|
2024-12-13 00:03:42 +01:00
|
|
|
u64 seq_b = ns_b->seq;
|
2024-06-24 11:49:46 -04:00
|
|
|
|
2024-12-13 00:03:42 +01:00
|
|
|
if (seq_a < seq_b)
|
|
|
|
|
return -1;
|
|
|
|
|
if (seq_a > seq_b)
|
|
|
|
|
return 1;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline void mnt_ns_tree_write_lock(void)
|
|
|
|
|
{
|
2024-12-13 00:03:45 +01:00
|
|
|
write_seqlock(&mnt_ns_tree_lock);
|
2024-12-13 00:03:42 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline void mnt_ns_tree_write_unlock(void)
|
|
|
|
|
{
|
2024-12-13 00:03:45 +01:00
|
|
|
write_sequnlock(&mnt_ns_tree_lock);
|
2024-06-24 11:49:46 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void mnt_ns_tree_add(struct mnt_namespace *ns)
|
|
|
|
|
{
|
2024-12-13 00:03:44 +01:00
|
|
|
struct rb_node *node, *prev;
|
2024-12-13 00:03:42 +01:00
|
|
|
|
|
|
|
|
mnt_ns_tree_write_lock();
|
|
|
|
|
node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
|
2024-12-13 00:03:44 +01:00
|
|
|
/*
|
|
|
|
|
* If there's no previous entry simply add it after the
|
|
|
|
|
* head and if there is add it after the previous entry.
|
|
|
|
|
*/
|
|
|
|
|
prev = rb_prev(&ns->mnt_ns_tree_node);
|
|
|
|
|
if (!prev)
|
|
|
|
|
list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
|
|
|
|
|
else
|
|
|
|
|
list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
|
2024-12-13 00:03:42 +01:00
|
|
|
mnt_ns_tree_write_unlock();
|
|
|
|
|
|
|
|
|
|
WARN_ON_ONCE(node);
|
2024-06-24 11:49:46 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void mnt_ns_release(struct mnt_namespace *ns)
|
|
|
|
|
{
|
|
|
|
|
/* keep alive for {list,stat}mount() */
|
|
|
|
|
if (refcount_dec_and_test(&ns->passive)) {
|
2025-01-29 17:58:01 +01:00
|
|
|
fsnotify_mntns_delete(ns);
|
2024-06-24 11:49:46 -04:00
|
|
|
put_user_ns(ns->user_ns);
|
|
|
|
|
kfree(ns);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
|
|
|
|
|
|
2024-12-13 00:03:42 +01:00
|
|
|
static void mnt_ns_release_rcu(struct rcu_head *rcu)
|
|
|
|
|
{
|
|
|
|
|
mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-24 11:49:46 -04:00
|
|
|
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
|
|
|
|
|
{
|
|
|
|
|
/* remove from global mount namespace list */
|
|
|
|
|
if (!is_anon_ns(ns)) {
|
2024-12-13 00:03:42 +01:00
|
|
|
mnt_ns_tree_write_lock();
|
2024-06-24 11:49:46 -04:00
|
|
|
rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
|
2024-12-13 00:03:44 +01:00
|
|
|
list_bidir_del_rcu(&ns->mnt_ns_list);
|
2024-12-13 00:03:42 +01:00
|
|
|
mnt_ns_tree_write_unlock();
|
2024-06-24 11:49:46 -04:00
|
|
|
}
|
|
|
|
|
|
2024-12-13 00:03:42 +01:00
|
|
|
call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
|
2024-06-24 11:49:46 -04:00
|
|
|
}
|
|
|
|
|
|
2024-12-13 00:03:42 +01:00
|
|
|
static int mnt_ns_find(const void *key, const struct rb_node *node)
|
2024-06-24 11:49:46 -04:00
|
|
|
{
|
2024-12-13 00:03:42 +01:00
|
|
|
const u64 mnt_ns_id = *(u64 *)key;
|
|
|
|
|
const struct mnt_namespace *ns = node_to_mnt_ns(node);
|
2024-06-24 11:49:46 -04:00
|
|
|
|
2024-12-13 00:03:42 +01:00
|
|
|
if (mnt_ns_id < ns->seq)
|
|
|
|
|
return -1;
|
|
|
|
|
if (mnt_ns_id > ns->seq)
|
|
|
|
|
return 1;
|
|
|
|
|
return 0;
|
2024-06-24 11:49:46 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Lookup a mount namespace by id and take a passive reference count. Taking a
|
|
|
|
|
* passive reference means the mount namespace can be emptied if e.g., the last
|
|
|
|
|
* task holding an active reference exits. To access the mounts of the
|
|
|
|
|
* namespace the @namespace_sem must first be acquired. If the namespace has
|
|
|
|
|
* already shut down before acquiring @namespace_sem, {list,stat}mount() will
|
|
|
|
|
* see that the mount rbtree of the namespace is empty.
|
2024-12-13 00:03:42 +01:00
|
|
|
*
|
|
|
|
|
* Note the lookup is lockless protected by a sequence counter. We only
|
|
|
|
|
* need to guard against false negatives as false positives aren't
|
|
|
|
|
* possible. So if we didn't find a mount namespace and the sequence
|
|
|
|
|
* counter has changed we need to retry. If the sequence counter is
|
|
|
|
|
* still the same we know the search actually failed.
|
2024-06-24 11:49:46 -04:00
|
|
|
*/
|
|
|
|
|
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
|
|
|
|
|
{
|
2024-12-13 00:03:42 +01:00
|
|
|
struct mnt_namespace *ns;
|
|
|
|
|
struct rb_node *node;
|
|
|
|
|
unsigned int seq;
|
|
|
|
|
|
|
|
|
|
guard(rcu)();
|
|
|
|
|
do {
|
2024-12-13 00:03:45 +01:00
|
|
|
seq = read_seqbegin(&mnt_ns_tree_lock);
|
2024-12-13 00:03:42 +01:00
|
|
|
node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
|
|
|
|
|
if (node)
|
|
|
|
|
break;
|
2024-12-13 00:03:45 +01:00
|
|
|
} while (read_seqretry(&mnt_ns_tree_lock, seq));
|
2024-06-24 11:49:46 -04:00
|
|
|
|
2024-12-13 00:03:42 +01:00
|
|
|
if (!node)
|
|
|
|
|
return NULL;
|
2024-06-24 11:49:46 -04:00
|
|
|
|
2024-12-13 00:03:42 +01:00
|
|
|
/*
|
|
|
|
|
* The last reference count is put with RCU delay so we can
|
|
|
|
|
* unconditonally acquire a reference here.
|
|
|
|
|
*/
|
|
|
|
|
ns = node_to_mnt_ns(node);
|
|
|
|
|
refcount_inc(&ns->passive);
|
|
|
|
|
return ns;
|
2024-06-24 11:49:46 -04:00
|
|
|
}
|
|
|
|
|
|
2021-01-21 14:19:49 +01:00
|
|
|
static inline void lock_mount_hash(void)
|
|
|
|
|
{
|
|
|
|
|
write_seqlock(&mount_lock);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline void unlock_mount_hash(void)
|
|
|
|
|
{
|
|
|
|
|
write_sequnlock(&mount_lock);
|
|
|
|
|
}
|
|
|
|
|
|
2014-03-20 21:10:51 -04:00
|
|
|
static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2005-11-07 17:16:09 -05:00
|
|
|
unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
|
|
|
|
|
tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
|
2014-02-28 13:46:44 -05:00
|
|
|
tmp = tmp + (tmp >> m_hash_shift);
|
|
|
|
|
return &mount_hashtable[tmp & m_hash_mask];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline struct hlist_head *mp_hash(struct dentry *dentry)
|
|
|
|
|
{
|
|
|
|
|
unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
|
|
|
|
|
tmp = tmp + (tmp >> mp_hash_shift);
|
|
|
|
|
return &mountpoint_hashtable[tmp & mp_hash_mask];
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2011-11-24 20:38:33 -05:00
|
|
|
static int mnt_alloc_id(struct mount *mnt)
|
2008-03-26 22:11:34 +01:00
|
|
|
{
|
2024-12-17 13:21:55 +01:00
|
|
|
int res;
|
2018-06-11 12:31:36 -04:00
|
|
|
|
2024-12-17 13:21:55 +01:00
|
|
|
xa_lock(&mnt_id_xa);
|
|
|
|
|
res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
|
|
|
|
|
if (!res)
|
|
|
|
|
mnt->mnt_id_unique = ++mnt_id_ctr;
|
|
|
|
|
xa_unlock(&mnt_id_xa);
|
|
|
|
|
return res;
|
2008-03-26 22:11:34 +01:00
|
|
|
}
|
|
|
|
|
|
2011-11-24 20:38:33 -05:00
|
|
|
static void mnt_free_id(struct mount *mnt)
|
2008-03-26 22:11:34 +01:00
|
|
|
{
|
2024-12-17 13:21:55 +01:00
|
|
|
xa_erase(&mnt_id_xa, mnt->mnt_id);
|
2008-03-26 22:11:34 +01:00
|
|
|
}
|
|
|
|
|
|
2008-03-27 13:06:23 +01:00
|
|
|
/*
|
|
|
|
|
* Allocate a new peer group ID
|
|
|
|
|
*/
|
2011-11-24 19:54:23 -05:00
|
|
|
static int mnt_alloc_group_id(struct mount *mnt)
|
2008-03-27 13:06:23 +01:00
|
|
|
{
|
2018-06-11 12:31:36 -04:00
|
|
|
int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
|
2009-06-24 03:12:00 -04:00
|
|
|
|
2018-06-11 12:31:36 -04:00
|
|
|
if (res < 0)
|
|
|
|
|
return res;
|
|
|
|
|
mnt->mnt_group_id = res;
|
|
|
|
|
return 0;
|
2008-03-27 13:06:23 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Release a peer group ID
|
|
|
|
|
*/
|
2011-11-24 19:54:23 -05:00
|
|
|
void mnt_release_group_id(struct mount *mnt)
|
2008-03-27 13:06:23 +01:00
|
|
|
{
|
2018-06-11 12:31:36 -04:00
|
|
|
ida_free(&mnt_group_ida, mnt->mnt_group_id);
|
2011-11-25 00:50:41 -05:00
|
|
|
mnt->mnt_group_id = 0;
|
2008-03-27 13:06:23 +01:00
|
|
|
}
|
|
|
|
|
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often goes to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
/*
|
|
|
|
|
* vfsmount lock must be held for read
|
|
|
|
|
*/
|
2011-11-24 22:37:54 -05:00
|
|
|
static inline void mnt_add_count(struct mount *mnt, int n)
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
{
|
|
|
|
|
#ifdef CONFIG_SMP
|
2011-11-24 22:53:09 -05:00
|
|
|
this_cpu_add(mnt->mnt_pcp->mnt_count, n);
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
#else
|
|
|
|
|
preempt_disable();
|
2011-11-24 22:53:09 -05:00
|
|
|
mnt->mnt_count += n;
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
preempt_enable();
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* vfsmount lock must be held for write
|
|
|
|
|
*/
|
2020-10-31 21:40:21 -07:00
|
|
|
int mnt_get_count(struct mount *mnt)
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
{
|
|
|
|
|
#ifdef CONFIG_SMP
|
2020-10-31 21:40:21 -07:00
|
|
|
int count = 0;
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
int cpu;
|
|
|
|
|
|
|
|
|
|
for_each_possible_cpu(cpu) {
|
2011-11-24 22:53:09 -05:00
|
|
|
count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return count;
|
|
|
|
|
#else
|
2011-11-24 22:53:09 -05:00
|
|
|
return mnt->mnt_count;
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
2011-11-24 20:38:33 -05:00
|
|
|
static struct mount *alloc_vfsmnt(const char *name)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-11-25 02:35:16 -05:00
|
|
|
struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
|
|
|
|
|
if (mnt) {
|
2008-03-26 22:11:34 +01:00
|
|
|
int err;
|
|
|
|
|
|
2011-11-25 02:35:16 -05:00
|
|
|
err = mnt_alloc_id(mnt);
|
2008-07-21 18:06:36 +08:00
|
|
|
if (err)
|
|
|
|
|
goto out_free_cache;
|
|
|
|
|
|
2025-04-21 04:35:09 +01:00
|
|
|
if (name)
|
memcg: enable accounting for mnt_cache entries
Patch series "memcg accounting from OpenVZ", v7.
OpenVZ uses memory accounting 20+ years since v2.2.x linux kernels.
Initially we used our own accounting subsystem, then partially committed
it to upstream, and a few years ago switched to cgroups v1. Now we're
rebasing again, revising our old patches and trying to push them upstream.
We try to protect the host system from any misuse of kernel memory
allocation triggered by untrusted users inside the containers.
Patch-set is addressed mostly to cgroups maintainers and cgroups@ mailing
list, though I would be very grateful for any comments from maintainersi
of affected subsystems or other people added in cc:
Compared to the upstream, we additionally account the following kernel objects:
- network devices and its Tx/Rx queues
- ipv4/v6 addresses and routing-related objects
- inet_bind_bucket cache objects
- VLAN group arrays
- ipv6/sit: ip_tunnel_prl
- scm_fp_list objects used by SCM_RIGHTS messages of Unix sockets
- nsproxy and namespace objects itself
- IPC objects: semaphores, message queues and share memory segments
- mounts
- pollfd and select bits arrays
- signals and posix timers
- file lock
- fasync_struct used by the file lease code and driver's fasync queues
- tty objects
- per-mm LDT
We have an incorrect/incomplete/obsoleted accounting for few other kernel
objects: sk_filter, af_packets, netlink and xt_counters for iptables.
They require rework and probably will be dropped at all.
Also we're going to add an accounting for nft, however it is not ready
yet.
We have not tested performance on upstream, however, our performance team
compares our current RHEL7-based production kernel and reports that they
are at least not worse as the according original RHEL7 kernel.
This patch (of 10):
The kernel allocates ~400 bytes of 'struct mount' for any new mount.
Creating a new mount namespace clones most of the parent mounts, and this
can be repeated many times. Additionally, each mount allocates up to
PATH_MAX=4096 bytes for mnt->mnt_devname.
It makes sense to account for these allocations to restrict the host's
memory consumption from inside the memcg-limited container.
Link: https://lkml.kernel.org/r/045db11f-4a45-7c9b-2664-5b32c2b44943@virtuozzo.com
Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Yutian Yang <nglaive@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jiri Slaby <jirislaby@kernel.org>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Borislav Petkov <bp@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 14:55:10 -07:00
|
|
|
mnt->mnt_devname = kstrdup_const(name,
|
|
|
|
|
GFP_KERNEL_ACCOUNT);
|
2025-04-21 04:35:09 +01:00
|
|
|
else
|
|
|
|
|
mnt->mnt_devname = "none";
|
|
|
|
|
if (!mnt->mnt_devname)
|
|
|
|
|
goto out_free_id;
|
2008-03-26 22:11:34 +01:00
|
|
|
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
#ifdef CONFIG_SMP
|
2011-11-25 02:35:16 -05:00
|
|
|
mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
|
|
|
|
|
if (!mnt->mnt_pcp)
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
goto out_free_devname;
|
|
|
|
|
|
2011-11-25 02:35:16 -05:00
|
|
|
this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
#else
|
2011-11-25 02:35:16 -05:00
|
|
|
mnt->mnt_count = 1;
|
|
|
|
|
mnt->mnt_writers = 0;
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
#endif
|
|
|
|
|
|
2014-03-20 21:10:51 -04:00
|
|
|
INIT_HLIST_NODE(&mnt->mnt_hash);
|
2011-11-25 02:35:16 -05:00
|
|
|
INIT_LIST_HEAD(&mnt->mnt_child);
|
|
|
|
|
INIT_LIST_HEAD(&mnt->mnt_mounts);
|
|
|
|
|
INIT_LIST_HEAD(&mnt->mnt_list);
|
|
|
|
|
INIT_LIST_HEAD(&mnt->mnt_expire);
|
|
|
|
|
INIT_LIST_HEAD(&mnt->mnt_share);
|
|
|
|
|
INIT_LIST_HEAD(&mnt->mnt_slave_list);
|
|
|
|
|
INIT_LIST_HEAD(&mnt->mnt_slave);
|
2013-09-22 19:37:01 -07:00
|
|
|
INIT_HLIST_NODE(&mnt->mnt_mp_list);
|
2019-07-04 16:57:51 -04:00
|
|
|
INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
|
2024-12-15 21:17:05 +01:00
|
|
|
RB_CLEAR_NODE(&mnt->mnt_node);
|
2022-10-26 12:51:27 +02:00
|
|
|
mnt->mnt.mnt_idmap = &nop_mnt_idmap;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2011-11-25 02:35:16 -05:00
|
|
|
return mnt;
|
2008-07-21 18:06:36 +08:00
|
|
|
|
2009-04-26 20:25:54 +10:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
out_free_devname:
|
2015-02-13 14:36:41 -08:00
|
|
|
kfree_const(mnt->mnt_devname);
|
2009-04-26 20:25:54 +10:00
|
|
|
#endif
|
2008-07-21 18:06:36 +08:00
|
|
|
out_free_id:
|
2011-11-25 02:35:16 -05:00
|
|
|
mnt_free_id(mnt);
|
2008-07-21 18:06:36 +08:00
|
|
|
out_free_cache:
|
2011-11-25 02:35:16 -05:00
|
|
|
kmem_cache_free(mnt_cache, mnt);
|
2008-07-21 18:06:36 +08:00
|
|
|
return NULL;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scaling
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long as that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
different cpus.
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degradation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
/*
|
|
|
|
|
* Most r/o checks on a fs are for operations that take
|
|
|
|
|
* discrete amounts of time, like a write() or unlink().
|
|
|
|
|
* We must keep track of when those operations start
|
|
|
|
|
* (for permission checks) and when they end, so that
|
|
|
|
|
* we can determine when writes are able to occur to
|
|
|
|
|
* a filesystem.
|
|
|
|
|
*/
|
|
|
|
|
/*
|
|
|
|
|
* __mnt_is_readonly: check whether a mount is read-only
|
|
|
|
|
* @mnt: the mount to check for its write status
|
|
|
|
|
*
|
|
|
|
|
* This shouldn't be used directly ouside of the VFS.
|
|
|
|
|
* It does not guarantee that the filesystem will stay
|
|
|
|
|
* r/w, just that it is right *now*. This can not and
|
|
|
|
|
* should not be used in place of IS_RDONLY(inode).
|
|
|
|
|
* mnt_want/drop_write() will _keep_ the filesystem
|
|
|
|
|
* r/w.
|
|
|
|
|
*/
|
2018-11-01 23:07:25 +00:00
|
|
|
bool __mnt_is_readonly(struct vfsmount *mnt)
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
{
|
2018-11-01 23:07:25 +00:00
|
|
|
return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
|
|
|
|
|
|
2011-11-24 22:37:54 -05:00
|
|
|
static inline void mnt_inc_writers(struct mount *mnt)
|
2009-04-26 20:25:54 +10:00
|
|
|
{
|
|
|
|
|
#ifdef CONFIG_SMP
|
2011-11-24 22:53:09 -05:00
|
|
|
this_cpu_inc(mnt->mnt_pcp->mnt_writers);
|
2009-04-26 20:25:54 +10:00
|
|
|
#else
|
2011-11-24 22:53:09 -05:00
|
|
|
mnt->mnt_writers++;
|
2009-04-26 20:25:54 +10:00
|
|
|
#endif
|
|
|
|
|
}
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scaling
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long as that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degradation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
|
2011-11-24 22:37:54 -05:00
|
|
|
static inline void mnt_dec_writers(struct mount *mnt)
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
{
|
2009-04-26 20:25:54 +10:00
|
|
|
#ifdef CONFIG_SMP
|
2011-11-24 22:53:09 -05:00
|
|
|
this_cpu_dec(mnt->mnt_pcp->mnt_writers);
|
2009-04-26 20:25:54 +10:00
|
|
|
#else
|
2011-11-24 22:53:09 -05:00
|
|
|
mnt->mnt_writers--;
|
2009-04-26 20:25:54 +10:00
|
|
|
#endif
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
}
|
|
|
|
|
|
2011-11-24 22:37:54 -05:00
|
|
|
static unsigned int mnt_get_writers(struct mount *mnt)
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
{
|
2009-04-26 20:25:54 +10:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
unsigned int count = 0;
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
int cpu;
|
|
|
|
|
|
|
|
|
|
for_each_possible_cpu(cpu) {
|
2011-11-24 22:53:09 -05:00
|
|
|
count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
}
|
|
|
|
|
|
2009-04-26 20:25:54 +10:00
|
|
|
return count;
|
|
|
|
|
#else
|
|
|
|
|
return mnt->mnt_writers;
|
|
|
|
|
#endif
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
}
|
|
|
|
|
|
2011-11-21 12:11:31 +01:00
|
|
|
static int mnt_is_readonly(struct vfsmount *mnt)
|
|
|
|
|
{
|
2023-06-20 13:28:32 +02:00
|
|
|
if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
|
2011-11-21 12:11:31 +01:00
|
|
|
return 1;
|
2023-06-20 13:28:32 +02:00
|
|
|
/*
|
|
|
|
|
* The barrier pairs with the barrier in sb_start_ro_state_change()
|
|
|
|
|
* making sure if we don't see s_readonly_remount set yet, we also will
|
|
|
|
|
* not see any superblock / mount flag changes done by remount.
|
|
|
|
|
* It also pairs with the barrier in sb_end_ro_state_change()
|
|
|
|
|
* assuring that if we see s_readonly_remount already cleared, we will
|
|
|
|
|
* see the values of superblock / mount flags updated by remount.
|
|
|
|
|
*/
|
2011-11-21 12:11:31 +01:00
|
|
|
smp_rmb();
|
|
|
|
|
return __mnt_is_readonly(mnt);
|
|
|
|
|
}
|
|
|
|
|
|
2008-02-15 14:37:30 -08:00
|
|
|
/*
|
2012-06-12 16:20:35 +02:00
|
|
|
* Most r/o & frozen checks on a fs are for operations that take discrete
|
|
|
|
|
* amounts of time, like a write() or unlink(). We must keep track of when
|
|
|
|
|
* those operations start (for permission checks) and when they end, so that we
|
|
|
|
|
* can determine when writes are able to occur to a filesystem.
|
2008-02-15 14:37:30 -08:00
|
|
|
*/
|
|
|
|
|
/**
|
2023-09-08 16:28:59 +03:00
|
|
|
* mnt_get_write_access - get write access to a mount without freeze protection
|
2011-11-24 22:37:54 -05:00
|
|
|
* @m: the mount on which to take a write
|
2008-02-15 14:37:30 -08:00
|
|
|
*
|
2012-06-12 16:20:35 +02:00
|
|
|
* This tells the low-level filesystem that a write is about to be performed to
|
|
|
|
|
* it, and makes sure that writes are allowed (mnt it read-write) before
|
|
|
|
|
* returning success. This operation does not protect against filesystem being
|
2023-09-08 16:28:59 +03:00
|
|
|
* frozen. When the write operation is finished, mnt_put_write_access() must be
|
2012-06-12 16:20:35 +02:00
|
|
|
* called. This is effectively a refcount.
|
2008-02-15 14:37:30 -08:00
|
|
|
*/
|
2023-09-08 16:28:59 +03:00
|
|
|
int mnt_get_write_access(struct vfsmount *m)
|
2008-02-15 14:37:30 -08:00
|
|
|
{
|
2011-11-24 22:37:54 -05:00
|
|
|
struct mount *mnt = real_mount(m);
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
int ret = 0;
|
|
|
|
|
|
2009-04-26 20:25:54 +10:00
|
|
|
preempt_disable();
|
2011-01-07 17:50:10 +11:00
|
|
|
mnt_inc_writers(mnt);
|
2009-04-26 20:25:54 +10:00
|
|
|
/*
|
2011-01-07 17:50:10 +11:00
|
|
|
* The store to mnt_inc_writers must be visible before we pass
|
2009-04-26 20:25:54 +10:00
|
|
|
* MNT_WRITE_HOLD loop below, so that the slowpath can see our
|
|
|
|
|
* incremented count after it has set MNT_WRITE_HOLD.
|
|
|
|
|
*/
|
|
|
|
|
smp_mb();
|
2021-11-25 13:07:11 +01:00
|
|
|
might_lock(&mount_lock.lock);
|
|
|
|
|
while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
|
|
|
|
|
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
|
|
|
|
|
cpu_relax();
|
|
|
|
|
} else {
|
|
|
|
|
/*
|
|
|
|
|
* This prevents priority inversion, if the task
|
|
|
|
|
* setting MNT_WRITE_HOLD got preempted on a remote
|
|
|
|
|
* CPU, and it prevents life lock if the task setting
|
|
|
|
|
* MNT_WRITE_HOLD has a lower priority and is bound to
|
|
|
|
|
* the same CPU as the task that is spinning here.
|
|
|
|
|
*/
|
|
|
|
|
preempt_enable();
|
|
|
|
|
lock_mount_hash();
|
|
|
|
|
unlock_mount_hash();
|
|
|
|
|
preempt_disable();
|
|
|
|
|
}
|
|
|
|
|
}
|
2009-04-26 20:25:54 +10:00
|
|
|
/*
|
2023-06-20 13:28:32 +02:00
|
|
|
* The barrier pairs with the barrier sb_start_ro_state_change() making
|
|
|
|
|
* sure that if we see MNT_WRITE_HOLD cleared, we will also see
|
|
|
|
|
* s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
|
|
|
|
|
* mnt_is_readonly() and bail in case we are racing with remount
|
|
|
|
|
* read-only.
|
2009-04-26 20:25:54 +10:00
|
|
|
*/
|
|
|
|
|
smp_rmb();
|
2011-11-21 12:11:31 +01:00
|
|
|
if (mnt_is_readonly(m)) {
|
2011-01-07 17:50:10 +11:00
|
|
|
mnt_dec_writers(mnt);
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
ret = -EROFS;
|
|
|
|
|
}
|
2009-04-26 20:25:54 +10:00
|
|
|
preempt_enable();
|
2012-06-12 16:20:35 +02:00
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
2023-09-08 16:29:00 +03:00
|
|
|
EXPORT_SYMBOL_GPL(mnt_get_write_access);
|
2012-06-12 16:20:35 +02:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* mnt_want_write - get write access to a mount
|
|
|
|
|
* @m: the mount on which to take a write
|
|
|
|
|
*
|
|
|
|
|
* This tells the low-level filesystem that a write is about to be performed to
|
|
|
|
|
* it, and makes sure that writes are allowed (mount is read-write, filesystem
|
|
|
|
|
* is not frozen) before returning success. When the write operation is
|
|
|
|
|
* finished, mnt_drop_write() must be called. This is effectively a refcount.
|
|
|
|
|
*/
|
|
|
|
|
int mnt_want_write(struct vfsmount *m)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
sb_start_write(m->mnt_sb);
|
2023-09-08 16:28:59 +03:00
|
|
|
ret = mnt_get_write_access(m);
|
2012-06-12 16:20:35 +02:00
|
|
|
if (ret)
|
|
|
|
|
sb_end_write(m->mnt_sb);
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
return ret;
|
2008-02-15 14:37:30 -08:00
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL_GPL(mnt_want_write);
|
|
|
|
|
|
2009-04-26 20:25:55 +10:00
|
|
|
/**
|
2023-09-08 16:28:59 +03:00
|
|
|
* mnt_get_write_access_file - get write access to a file's mount
|
2009-04-26 20:25:55 +10:00
|
|
|
* @file: the file who's mount on which to take a write
|
|
|
|
|
*
|
2023-09-08 16:28:59 +03:00
|
|
|
* This is like mnt_get_write_access, but if @file is already open for write it
|
2020-09-22 09:44:18 -07:00
|
|
|
* skips incrementing mnt_writers (since the open file already has a reference)
|
|
|
|
|
* and instead only does the check for emergency r/o remounts. This must be
|
2023-09-08 16:28:59 +03:00
|
|
|
* paired with mnt_put_write_access_file.
|
2009-04-26 20:25:55 +10:00
|
|
|
*/
|
2023-09-08 16:28:59 +03:00
|
|
|
int mnt_get_write_access_file(struct file *file)
|
2009-04-26 20:25:55 +10:00
|
|
|
{
|
2020-09-22 09:44:18 -07:00
|
|
|
if (file->f_mode & FMODE_WRITER) {
|
|
|
|
|
/*
|
|
|
|
|
* Superblock may have become readonly while there are still
|
|
|
|
|
* writable fd's, e.g. due to a fs error with errors=remount-ro
|
|
|
|
|
*/
|
|
|
|
|
if (__mnt_is_readonly(file->f_path.mnt))
|
|
|
|
|
return -EROFS;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
2023-09-08 16:28:59 +03:00
|
|
|
return mnt_get_write_access(file->f_path.mnt);
|
2009-04-26 20:25:55 +10:00
|
|
|
}
|
2012-06-12 16:20:35 +02:00
|
|
|
|
2017-09-05 12:53:12 +02:00
|
|
|
/**
|
|
|
|
|
* mnt_want_write_file - get write access to a file's mount
|
|
|
|
|
* @file: the file who's mount on which to take a write
|
|
|
|
|
*
|
2020-09-22 09:44:18 -07:00
|
|
|
* This is like mnt_want_write, but if the file is already open for writing it
|
|
|
|
|
* skips incrementing mnt_writers (since the open file already has a reference)
|
|
|
|
|
* and instead only does the freeze protection and the check for emergency r/o
|
|
|
|
|
* remounts. This must be paired with mnt_drop_write_file.
|
2017-09-05 12:53:12 +02:00
|
|
|
*/
|
|
|
|
|
int mnt_want_write_file(struct file *file)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
|
2018-07-18 15:44:43 +02:00
|
|
|
sb_start_write(file_inode(file)->i_sb);
|
2023-09-08 16:28:59 +03:00
|
|
|
ret = mnt_get_write_access_file(file);
|
2012-06-12 16:20:35 +02:00
|
|
|
if (ret)
|
2018-07-18 15:44:43 +02:00
|
|
|
sb_end_write(file_inode(file)->i_sb);
|
2017-09-05 12:53:12 +02:00
|
|
|
return ret;
|
|
|
|
|
}
|
2009-04-26 20:25:55 +10:00
|
|
|
EXPORT_SYMBOL_GPL(mnt_want_write_file);
|
|
|
|
|
|
2008-02-15 14:37:30 -08:00
|
|
|
/**
 * mnt_put_write_access - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Informs the low-level filesystem that we have finished writing to it.
 * Each call must balance an earlier mnt_get_write_access() call.
 */
void mnt_put_write_access(struct vfsmount *mnt)
{
	struct mount *m = real_mount(mnt);

	preempt_disable();
	mnt_dec_writers(m);
	preempt_enable();
}
|
2023-09-08 16:29:00 +03:00
|
|
|
EXPORT_SYMBOL_GPL(mnt_put_write_access);
|
2012-06-12 16:20:35 +02:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* mnt_drop_write - give up write access to a mount
|
|
|
|
|
* @mnt: the mount on which to give up write access
|
|
|
|
|
*
|
|
|
|
|
* Tells the low-level filesystem that we are done performing writes to it and
|
|
|
|
|
* also allows filesystem to be frozen again. Must be matched with
|
|
|
|
|
* mnt_want_write() call above.
|
|
|
|
|
*/
|
|
|
|
|
void mnt_drop_write(struct vfsmount *mnt)
|
|
|
|
|
{
|
2023-09-08 16:28:59 +03:00
|
|
|
mnt_put_write_access(mnt);
|
2012-06-12 16:20:35 +02:00
|
|
|
sb_end_write(mnt->mnt_sb);
|
|
|
|
|
}
|
2008-02-15 14:37:30 -08:00
|
|
|
EXPORT_SYMBOL_GPL(mnt_drop_write);
|
|
|
|
|
|
2023-09-08 16:28:59 +03:00
|
|
|
void mnt_put_write_access_file(struct file *file)
|
2012-06-12 16:20:35 +02:00
|
|
|
{
|
2020-09-22 09:44:18 -07:00
|
|
|
if (!(file->f_mode & FMODE_WRITER))
|
2023-09-08 16:28:59 +03:00
|
|
|
mnt_put_write_access(file->f_path.mnt);
|
2012-06-12 16:20:35 +02:00
|
|
|
}
|
|
|
|
|
|
2017-09-05 12:53:12 +02:00
|
|
|
void mnt_drop_write_file(struct file *file)
|
|
|
|
|
{
|
2023-09-08 16:28:59 +03:00
|
|
|
mnt_put_write_access_file(file);
|
2017-09-05 12:53:12 +02:00
|
|
|
sb_end_write(file_inode(file)->i_sb);
|
|
|
|
|
}
|
2011-12-09 08:06:57 -05:00
|
|
|
EXPORT_SYMBOL(mnt_drop_write_file);
|
|
|
|
|
|
2022-02-03 14:14:07 +01:00
|
|
|
/**
|
|
|
|
|
* mnt_hold_writers - prevent write access to the given mount
|
|
|
|
|
* @mnt: mnt to prevent write access to
|
|
|
|
|
*
|
|
|
|
|
* Prevents write access to @mnt if there are no active writers for @mnt.
|
|
|
|
|
* This function needs to be called and return successfully before changing
|
|
|
|
|
* properties of @mnt that need to remain stable for callers with write access
|
|
|
|
|
* to @mnt.
|
|
|
|
|
*
|
|
|
|
|
* After this functions has been called successfully callers must pair it with
|
|
|
|
|
* a call to mnt_unhold_writers() in order to stop preventing write access to
|
|
|
|
|
* @mnt.
|
|
|
|
|
*
|
|
|
|
|
* Context: This function expects lock_mount_hash() to be held serializing
|
|
|
|
|
* setting MNT_WRITE_HOLD.
|
|
|
|
|
* Return: On success 0 is returned.
|
|
|
|
|
* On error, -EBUSY is returned.
|
|
|
|
|
*/
|
2021-01-21 14:19:51 +01:00
|
|
|
static inline int mnt_hold_writers(struct mount *mnt)
|
2008-02-15 14:37:30 -08:00
|
|
|
{
|
2011-11-24 22:37:54 -05:00
|
|
|
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
/*
|
2009-04-26 20:25:54 +10:00
|
|
|
* After storing MNT_WRITE_HOLD, we'll read the counters. This store
|
|
|
|
|
* should be visible before we do.
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
*/
|
2009-04-26 20:25:54 +10:00
|
|
|
smp_mb();
|
|
|
|
|
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
/*
|
2009-04-26 20:25:54 +10:00
|
|
|
* With writers on hold, if this value is zero, then there are
|
|
|
|
|
* definitely no active writers (although held writers may subsequently
|
|
|
|
|
* increment the count, they'll have to wait, and decrement it after
|
|
|
|
|
* seeing MNT_READONLY).
|
|
|
|
|
*
|
|
|
|
|
* It is OK to have counter incremented on one CPU and decremented on
|
|
|
|
|
* another: the sum will add up correctly. The danger would be when we
|
|
|
|
|
* sum up each counter, if we read a counter before it is incremented,
|
|
|
|
|
* but then read another CPU's count which it has been subsequently
|
|
|
|
|
* decremented from -- we would see more decrements than we should.
|
|
|
|
|
* MNT_WRITE_HOLD protects against this scenario, because
|
|
|
|
|
* mnt_want_write first increments count, then smp_mb, then spins on
|
|
|
|
|
* MNT_WRITE_HOLD, so it can't be decremented by another CPU while
|
|
|
|
|
* we're counting up here.
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
*/
|
2011-01-07 17:50:10 +11:00
|
|
|
if (mnt_get_writers(mnt) > 0)
|
2021-01-21 14:19:51 +01:00
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2022-02-03 14:14:07 +01:00
|
|
|
/**
|
|
|
|
|
* mnt_unhold_writers - stop preventing write access to the given mount
|
|
|
|
|
* @mnt: mnt to stop preventing write access to
|
|
|
|
|
*
|
|
|
|
|
* Stop preventing write access to @mnt allowing callers to gain write access
|
|
|
|
|
* to @mnt again.
|
|
|
|
|
*
|
|
|
|
|
* This function can only be called after a successful call to
|
|
|
|
|
* mnt_hold_writers().
|
|
|
|
|
*
|
|
|
|
|
* Context: This function expects lock_mount_hash() to be held.
|
|
|
|
|
*/
|
2021-01-21 14:19:51 +01:00
|
|
|
static inline void mnt_unhold_writers(struct mount *mnt)
|
|
|
|
|
{
|
2009-04-26 20:25:54 +10:00
|
|
|
/*
|
|
|
|
|
* MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
|
|
|
|
|
* that become unheld will see MNT_READONLY.
|
|
|
|
|
*/
|
|
|
|
|
smp_wmb();
|
2011-11-24 22:37:54 -05:00
|
|
|
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
|
2021-01-21 14:19:51 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int mnt_make_readonly(struct mount *mnt)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
ret = mnt_hold_writers(mnt);
|
|
|
|
|
if (!ret)
|
|
|
|
|
mnt->mnt.mnt_flags |= MNT_READONLY;
|
|
|
|
|
mnt_unhold_writers(mnt);
|
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degredation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2008-02-15 14:37:59 -08:00
|
|
|
return ret;
|
2008-02-15 14:37:30 -08:00
|
|
|
}
|
|
|
|
|
|
2011-11-21 12:11:31 +01:00
|
|
|
int sb_prepare_remount_readonly(struct super_block *sb)
|
|
|
|
|
{
|
|
|
|
|
struct mount *mnt;
|
|
|
|
|
int err = 0;
|
|
|
|
|
|
2011-11-21 12:11:33 +01:00
|
|
|
/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
|
|
|
|
|
if (atomic_long_read(&sb->s_remove_count))
|
|
|
|
|
return -EBUSY;
|
|
|
|
|
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
2011-11-21 12:11:31 +01:00
|
|
|
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
|
|
|
|
|
if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
|
2022-02-03 14:14:10 +01:00
|
|
|
err = mnt_hold_writers(mnt);
|
|
|
|
|
if (err)
|
2011-11-21 12:11:31 +01:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
2011-11-21 12:11:33 +01:00
|
|
|
if (!err && atomic_long_read(&sb->s_remove_count))
|
|
|
|
|
err = -EBUSY;
|
|
|
|
|
|
2023-06-20 13:28:32 +02:00
|
|
|
if (!err)
|
|
|
|
|
sb_start_ro_state_change(sb);
|
2011-11-21 12:11:31 +01:00
|
|
|
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
|
|
|
|
|
if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
|
|
|
|
|
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
|
|
|
|
|
}
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2011-11-21 12:11:31 +01:00
|
|
|
|
|
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
2011-11-24 20:38:33 -05:00
|
|
|
static void free_vfsmnt(struct mount *mnt)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2022-10-26 12:51:27 +02:00
|
|
|
mnt_idmap_put(mnt_idmap(&mnt->mnt));
|
2015-02-13 14:36:41 -08:00
|
|
|
kfree_const(mnt->mnt_devname);
|
2009-04-26 20:25:54 +10:00
|
|
|
#ifdef CONFIG_SMP
|
2011-11-24 22:53:09 -05:00
|
|
|
free_percpu(mnt->mnt_pcp);
|
2009-04-26 20:25:54 +10:00
|
|
|
#endif
|
2011-11-24 20:38:33 -05:00
|
|
|
kmem_cache_free(mnt_cache, mnt);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2014-01-24 12:17:54 +00:00
|
|
|
static void delayed_free_vfsmnt(struct rcu_head *head)
|
|
|
|
|
{
|
|
|
|
|
free_vfsmnt(container_of(head, struct mount, mnt_rcu));
|
|
|
|
|
}
|
|
|
|
|
|
2013-09-29 22:06:07 -04:00
|
|
|
/* call under rcu_read_lock */
|
2015-05-08 11:43:53 -04:00
|
|
|
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
|
2013-09-29 22:06:07 -04:00
|
|
|
{
|
|
|
|
|
struct mount *mnt;
|
|
|
|
|
if (read_seqretry(&mount_lock, seq))
|
2015-05-08 11:43:53 -04:00
|
|
|
return 1;
|
2013-09-29 22:06:07 -04:00
|
|
|
if (bastard == NULL)
|
2015-05-08 11:43:53 -04:00
|
|
|
return 0;
|
2013-09-29 22:06:07 -04:00
|
|
|
mnt = real_mount(bastard);
|
|
|
|
|
mnt_add_count(mnt, 1);
|
2025-04-28 23:56:14 -04:00
|
|
|
smp_mb(); // see mntput_no_expire() and do_umount()
|
2013-09-29 22:06:07 -04:00
|
|
|
if (likely(!read_seqretry(&mount_lock, seq)))
|
2015-05-08 11:43:53 -04:00
|
|
|
return 0;
|
2018-08-09 17:51:32 -04:00
|
|
|
lock_mount_hash();
|
2025-04-27 15:41:51 -04:00
|
|
|
if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) {
|
2018-08-09 17:51:32 -04:00
|
|
|
mnt_add_count(mnt, -1);
|
|
|
|
|
unlock_mount_hash();
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
unlock_mount_hash();
|
|
|
|
|
/* caller will mntput() */
|
2015-05-08 11:43:53 -04:00
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* call under rcu_read_lock */
|
2022-07-05 12:22:46 -04:00
|
|
|
static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
|
2015-05-08 11:43:53 -04:00
|
|
|
{
|
|
|
|
|
int res = __legitimize_mnt(bastard, seq);
|
|
|
|
|
if (likely(!res))
|
|
|
|
|
return true;
|
|
|
|
|
if (unlikely(res < 0)) {
|
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
mntput(bastard);
|
|
|
|
|
rcu_read_lock();
|
2013-09-29 22:06:07 -04:00
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2023-05-03 13:18:40 +02:00
|
|
|
/**
|
|
|
|
|
* __lookup_mnt - find first child mount
|
|
|
|
|
* @mnt: parent mount
|
|
|
|
|
* @dentry: mountpoint
|
|
|
|
|
*
|
|
|
|
|
* If @mnt has a child mount @c mounted @dentry find and return it.
|
|
|
|
|
*
|
|
|
|
|
* Note that the child mount @c need not be unique. There are cases
|
|
|
|
|
* where shadow mounts are created. For example, during mount
|
|
|
|
|
* propagation when a source mount @mnt whose root got overmounted by a
|
|
|
|
|
* mount @o after path lookup but before @namespace_sem could be
|
|
|
|
|
* acquired gets copied and propagated. So @mnt gets copied including
|
|
|
|
|
* @o. When @mnt is propagated to a destination mount @d that already
|
|
|
|
|
* has another mount @n mounted at the same mountpoint then the source
|
|
|
|
|
* mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
|
|
|
|
|
* @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
|
|
|
|
|
* on @dentry.
|
|
|
|
|
*
|
|
|
|
|
* Return: The first child of @mnt mounted @dentry or NULL.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2013-10-01 16:11:26 -04:00
|
|
|
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2014-03-20 21:10:51 -04:00
|
|
|
struct hlist_head *head = m_hash(mnt, dentry);
|
2013-10-01 16:11:26 -04:00
|
|
|
struct mount *p;
|
|
|
|
|
|
2014-03-20 21:10:51 -04:00
|
|
|
hlist_for_each_entry_rcu(p, head, mnt_hash)
|
2013-10-01 16:11:26 -04:00
|
|
|
if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
|
|
|
|
|
return p;
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2005-11-07 17:20:17 -05:00
|
|
|
/*
|
2012-06-25 12:55:28 +01:00
|
|
|
* lookup_mnt - Return the first child mount mounted at path
|
|
|
|
|
*
|
|
|
|
|
* "First" means first mounted chronologically. If you create the
|
|
|
|
|
* following mounts:
|
|
|
|
|
*
|
|
|
|
|
* mount /dev/sda1 /mnt
|
|
|
|
|
* mount /dev/sda2 /mnt
|
|
|
|
|
* mount /dev/sda3 /mnt
|
|
|
|
|
*
|
|
|
|
|
* Then lookup_mnt() on the base /mnt dentry in the root mount will
|
|
|
|
|
* return successively the root dentry and vfsmount of /dev/sda1, then
|
|
|
|
|
* /dev/sda2, then /dev/sda3, then NULL.
|
|
|
|
|
*
|
|
|
|
|
* lookup_mnt takes a reference to the found vfsmount.
|
2005-11-07 17:20:17 -05:00
|
|
|
*/
|
2016-11-20 19:45:28 -05:00
|
|
|
struct vfsmount *lookup_mnt(const struct path *path)
|
2005-11-07 17:20:17 -05:00
|
|
|
{
|
2011-11-24 18:22:03 -05:00
|
|
|
struct mount *child_mnt;
|
2013-09-29 22:06:07 -04:00
|
|
|
struct vfsmount *m;
|
|
|
|
|
unsigned seq;
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability is not not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
|
2013-09-29 22:06:07 -04:00
|
|
|
rcu_read_lock();
|
|
|
|
|
do {
|
|
|
|
|
seq = read_seqbegin(&mount_lock);
|
|
|
|
|
child_mnt = __lookup_mnt(path->mnt, path->dentry);
|
|
|
|
|
m = child_mnt ? &child_mnt->mnt : NULL;
|
|
|
|
|
} while (!legitimize_mnt(m, seq));
|
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
return m;
|
2005-11-07 17:20:17 -05:00
|
|
|
}
|
|
|
|
|
|
2013-10-04 19:15:13 -07:00
|
|
|
/*
|
|
|
|
|
* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
|
|
|
|
|
* current mount namespace.
|
|
|
|
|
*
|
|
|
|
|
* The common case is dentries are not mountpoints at all and that
|
|
|
|
|
* test is handled inline. For the slow case when we are actually
|
|
|
|
|
* dealing with a mountpoint of some kind, walk through all of the
|
|
|
|
|
* mounts in the current mount namespace and test to see if the dentry
|
|
|
|
|
* is a mountpoint.
|
|
|
|
|
*
|
|
|
|
|
* The mount_hashtable is not usable in the context because we
|
|
|
|
|
* need to identify all mounts that may be in the current mount
|
|
|
|
|
* namespace not just a mount that happens to have some specified
|
|
|
|
|
* parent mount.
|
|
|
|
|
*/
|
2025-06-09 22:03:17 -04:00
|
|
|
bool __is_local_mountpoint(const struct dentry *dentry)
|
2013-10-04 19:15:13 -07:00
|
|
|
{
|
|
|
|
|
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
|
2023-10-25 16:02:00 +02:00
|
|
|
struct mount *mnt, *n;
|
2013-10-04 19:15:13 -07:00
|
|
|
bool is_covered = false;
|
|
|
|
|
|
|
|
|
|
down_read(&namespace_sem);
|
2023-10-25 16:02:00 +02:00
|
|
|
rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
|
2013-10-04 19:15:13 -07:00
|
|
|
is_covered = (mnt->mnt_mountpoint == dentry);
|
|
|
|
|
if (is_covered)
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
up_read(&namespace_sem);
|
2020-03-04 18:12:45 +02:00
|
|
|
|
2013-10-04 19:15:13 -07:00
|
|
|
return is_covered;
|
|
|
|
|
}
|
|
|
|
|
|
2014-02-24 17:32:34 -08:00
|
|
|
static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
|
2013-03-15 10:53:28 -04:00
|
|
|
{
|
2014-02-28 13:46:44 -05:00
|
|
|
struct hlist_head *chain = mp_hash(dentry);
|
2013-03-15 10:53:28 -04:00
|
|
|
struct mountpoint *mp;
|
|
|
|
|
|
2014-02-28 13:46:44 -05:00
|
|
|
hlist_for_each_entry(mp, chain, m_hash) {
|
2013-03-15 10:53:28 -04:00
|
|
|
if (mp->m_dentry == dentry) {
|
|
|
|
|
mp->m_count++;
|
|
|
|
|
return mp;
|
|
|
|
|
}
|
|
|
|
|
}
|
2014-02-24 17:32:34 -08:00
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2017-01-03 14:18:43 +13:00
|
|
|
static struct mountpoint *get_mountpoint(struct dentry *dentry)
|
2014-02-24 17:32:34 -08:00
|
|
|
{
|
2017-01-03 14:18:43 +13:00
|
|
|
struct mountpoint *mp, *new = NULL;
|
2014-02-24 17:32:34 -08:00
|
|
|
int ret;
|
2013-03-15 10:53:28 -04:00
|
|
|
|
2017-01-03 14:18:43 +13:00
|
|
|
if (d_mountpoint(dentry)) {
|
2018-10-03 10:18:33 -04:00
|
|
|
/* might be worth a WARN_ON() */
|
|
|
|
|
if (d_unlinked(dentry))
|
|
|
|
|
return ERR_PTR(-ENOENT);
|
2017-01-03 14:18:43 +13:00
|
|
|
mountpoint:
|
|
|
|
|
read_seqlock_excl(&mount_lock);
|
|
|
|
|
mp = lookup_mountpoint(dentry);
|
|
|
|
|
read_sequnlock_excl(&mount_lock);
|
|
|
|
|
if (mp)
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!new)
|
|
|
|
|
new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
|
|
|
|
|
if (!new)
|
2013-03-15 10:53:28 -04:00
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
2017-01-03 14:18:43 +13:00
|
|
|
|
|
|
|
|
/* Exactly one processes may set d_mounted */
|
2013-09-05 14:39:11 +02:00
|
|
|
ret = d_set_mounted(dentry);
|
|
|
|
|
|
2017-01-03 14:18:43 +13:00
|
|
|
/* Someone else set d_mounted? */
|
|
|
|
|
if (ret == -EBUSY)
|
|
|
|
|
goto mountpoint;
|
|
|
|
|
|
|
|
|
|
/* The dentry is not available as a mountpoint? */
|
|
|
|
|
mp = ERR_PTR(ret);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
|
|
/* Add the new mountpoint to the hash table */
|
|
|
|
|
read_seqlock_excl(&mount_lock);
|
2019-06-30 10:39:08 -04:00
|
|
|
new->m_dentry = dget(dentry);
|
2017-01-03 14:18:43 +13:00
|
|
|
new->m_count = 1;
|
|
|
|
|
hlist_add_head(&new->m_hash, mp_hash(dentry));
|
|
|
|
|
INIT_HLIST_HEAD(&new->m_list);
|
|
|
|
|
read_sequnlock_excl(&mount_lock);
|
|
|
|
|
|
|
|
|
|
mp = new;
|
|
|
|
|
new = NULL;
|
|
|
|
|
done:
|
|
|
|
|
kfree(new);
|
2013-03-15 10:53:28 -04:00
|
|
|
return mp;
|
|
|
|
|
}
|
|
|
|
|
|
2019-06-30 10:39:08 -04:00
|
|
|
/*
|
|
|
|
|
* vfsmount lock must be held. Additionally, the caller is responsible
|
|
|
|
|
* for serializing calls for given disposal list.
|
|
|
|
|
*/
|
|
|
|
|
static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
|
2013-03-15 10:53:28 -04:00
|
|
|
{
|
|
|
|
|
if (!--mp->m_count) {
|
|
|
|
|
struct dentry *dentry = mp->m_dentry;
|
2013-09-22 19:37:01 -07:00
|
|
|
BUG_ON(!hlist_empty(&mp->m_list));
|
2013-03-15 10:53:28 -04:00
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
|
dentry->d_flags &= ~DCACHE_MOUNTED;
|
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2019-06-30 10:39:08 -04:00
|
|
|
dput_to_list(dentry, list);
|
2014-02-28 13:46:44 -05:00
|
|
|
hlist_del(&mp->m_hash);
|
2013-03-15 10:53:28 -04:00
|
|
|
kfree(mp);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-06-30 10:39:08 -04:00
|
|
|
/* called with namespace_lock and vfsmount lock */
|
|
|
|
|
static void put_mountpoint(struct mountpoint *mp)
|
|
|
|
|
{
|
|
|
|
|
__put_mountpoint(mp, &ex_mountpoints);
|
|
|
|
|
}
|
|
|
|
|
|
2011-11-25 00:46:35 -05:00
|
|
|
static inline int check_mnt(struct mount *mnt)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-12-08 02:37:56 -08:00
|
|
|
return mnt->mnt_ns == current->nsproxy->mnt_ns;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2025-02-21 14:13:05 +01:00
|
|
|
static inline bool check_anonymous_mnt(struct mount *mnt)
|
|
|
|
|
{
|
fs: allow creating detached mounts from fsmount() file descriptors
The previous patch series only enabled the creation of detached mounts
from detached mounts that were created via open_tree(). In such cases we
know that the origin sequence number for the newly created anonymous
mount namespace will be set to the sequence number of the mount
namespace the source mount belonged to.
But fsmount() creates an anonymous mount namespace that does not have an
origin mount namespace as the anonymous mount namespace was derived from
a filesystem context created via fsopen().
Account for this case and allow the creation of detached mounts from
mounts created via fsmount(). Consequently, any such detached mount
created from an fsmount() mount will also have a zero origin sequence
number.
This allows to mount subdirectories without ever having to expose the
filesystem to a a non-anonymous mount namespace:
fd_context = sys_fsopen("tmpfs", 0);
sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
fd_tmpfs = sys_fsmount(fd_context, 0, 0);
mkdirat(fd_tmpfs, "subdir", 0755);
fd_tree = sys_open_tree(fd_tmpfs, "subdir", OPEN_TREE_CLONE);
sys_move_mount(fd_tree, "", -EBADF, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-02-25 10:31:34 +01:00
|
|
|
u64 seq;
|
|
|
|
|
|
|
|
|
|
if (!is_anon_ns(mnt->mnt_ns))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
seq = mnt->mnt_ns->seq_origin;
|
|
|
|
|
return !seq || (seq == current->nsproxy->mnt_ns->seq);
|
2025-02-21 14:13:05 +01:00
|
|
|
}
|
|
|
|
|
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability is not not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
/*
|
|
|
|
|
* vfsmount lock must be held for write
|
|
|
|
|
*/
|
2006-12-08 02:37:56 -08:00
|
|
|
static void touch_mnt_namespace(struct mnt_namespace *ns)
|
2005-11-07 17:15:49 -05:00
|
|
|
{
|
|
|
|
|
if (ns) {
|
|
|
|
|
ns->event = ++event;
|
|
|
|
|
wake_up_interruptible(&ns->poll);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability will not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
/*
|
|
|
|
|
* vfsmount lock must be held for write
|
|
|
|
|
*/
|
2006-12-08 02:37:56 -08:00
|
|
|
static void __touch_mnt_namespace(struct mnt_namespace *ns)
|
2005-11-07 17:15:49 -05:00
|
|
|
{
|
|
|
|
|
if (ns && ns->event != event) {
|
|
|
|
|
ns->event = event;
|
|
|
|
|
wake_up_interruptible(&ns->poll);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability will not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
/*
|
|
|
|
|
* vfsmount lock must be held for write
|
|
|
|
|
*/
|
2019-06-29 12:58:42 -04:00
|
|
|
static struct mountpoint *unhash_mnt(struct mount *mnt)
|
2011-11-24 19:41:16 -05:00
|
|
|
{
|
2019-06-29 12:58:42 -04:00
|
|
|
struct mountpoint *mp;
|
2025-06-20 22:46:55 -04:00
|
|
|
struct mount *parent = mnt->mnt_parent;
|
|
|
|
|
if (unlikely(parent->overmount == mnt))
|
|
|
|
|
parent->overmount = NULL;
|
2011-11-24 22:19:58 -05:00
|
|
|
mnt->mnt_parent = mnt;
|
2011-11-24 22:25:07 -05:00
|
|
|
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
|
2011-11-24 23:24:33 -05:00
|
|
|
list_del_init(&mnt->mnt_child);
|
2014-03-20 21:10:51 -04:00
|
|
|
hlist_del_init_rcu(&mnt->mnt_hash);
|
2013-09-22 19:37:01 -07:00
|
|
|
hlist_del_init(&mnt->mnt_mp_list);
|
2019-06-29 12:58:42 -04:00
|
|
|
mp = mnt->mnt_mp;
|
2013-03-15 10:53:28 -04:00
|
|
|
mnt->mnt_mp = NULL;
|
2019-06-29 12:58:42 -04:00
|
|
|
return mp;
|
2014-12-29 13:03:41 -06:00
|
|
|
}
|
|
|
|
|
|
2015-01-15 22:58:33 -06:00
|
|
|
/*
|
|
|
|
|
* vfsmount lock must be held for write
|
|
|
|
|
*/
|
|
|
|
|
/*
 * Unhash @mnt and release the mountpoint it was attached to.
 */
static void umount_mnt(struct mount *mnt)
{
	struct mountpoint *mp = unhash_mnt(mnt);

	put_mountpoint(mp);
}
|
|
|
|
|
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability will not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
/*
|
|
|
|
|
* vfsmount lock must be held for write
|
|
|
|
|
*/
|
2013-03-15 10:53:28 -04:00
|
|
|
void mnt_set_mountpoint(struct mount *mnt,
|
|
|
|
|
struct mountpoint *mp,
|
2011-11-24 21:28:22 -05:00
|
|
|
struct mount *child_mnt)
|
2005-11-07 17:19:50 -05:00
|
|
|
{
|
2013-03-15 10:53:28 -04:00
|
|
|
mp->m_count++;
|
2011-11-25 03:19:09 -05:00
|
|
|
mnt_add_count(mnt, 1); /* essentially, that's mntget */
|
2019-06-30 10:39:08 -04:00
|
|
|
child_mnt->mnt_mountpoint = mp->m_dentry;
|
2011-11-25 03:19:09 -05:00
|
|
|
child_mnt->mnt_parent = mnt;
|
2013-03-15 10:53:28 -04:00
|
|
|
child_mnt->mnt_mp = mp;
|
2013-09-22 19:37:01 -07:00
|
|
|
hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
|
2005-11-07 17:19:50 -05:00
|
|
|
}
|
|
|
|
|
|
2025-06-17 21:10:02 -04:00
|
|
|
static void make_visible(struct mount *mnt)
|
2017-01-20 18:28:35 +13:00
|
|
|
{
|
2025-06-17 21:10:02 -04:00
|
|
|
struct mount *parent = mnt->mnt_parent;
|
2025-06-20 22:46:55 -04:00
|
|
|
if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root))
|
|
|
|
|
parent->overmount = mnt;
|
2017-01-20 18:28:35 +13:00
|
|
|
hlist_add_head_rcu(&mnt->mnt_hash,
|
|
|
|
|
m_hash(&parent->mnt, mnt->mnt_mountpoint));
|
|
|
|
|
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
|
|
|
|
|
}
|
|
|
|
|
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
/**
|
|
|
|
|
* attach_mnt - mount a mount, attach to @mount_hashtable and parent's
|
|
|
|
|
* list of child mounts
|
|
|
|
|
* @parent: the parent
|
|
|
|
|
* @mnt: the new mount
|
|
|
|
|
* @mp: the new mountpoint
|
|
|
|
|
*
|
2025-04-25 12:40:28 -04:00
|
|
|
* Mount @mnt at @mp on @parent. Then attach @mnt
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
mount beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
* to @parent's child mount list and to @mount_hashtable.
|
|
|
|
|
*
|
2025-06-17 21:10:02 -04:00
|
|
|
* Note, when make_visible() is called @mnt->mnt_parent already points
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
mount beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
* to the correct parent.
|
|
|
|
|
*
|
|
|
|
|
* Context: This function expects namespace_lock() and lock_mount_hash()
|
|
|
|
|
* to have been acquired in that order.
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability will not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
*/
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to an xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @mnt_to is moved to a different mountpoint while acquiring
@namespace_sem to lock @mnt_to.
(3) if @mnt_to is unmounted while acquiring @namespace_sem to lock
@mnt_to.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
/*
 * Attach @mnt under @parent at @mp: record the new parent/mountpoint with
 * mnt_set_mountpoint() and then call make_visible() on @mnt.
 */
static void attach_mnt(struct mount *mnt, struct mount *parent,
		       struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	make_visible(mnt);
}
|
|
|
|
|
|
2017-01-20 18:28:35 +13:00
|
|
|
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
|
2014-08-10 03:44:55 -04:00
|
|
|
{
|
2017-01-20 18:28:35 +13:00
|
|
|
struct mountpoint *old_mp = mnt->mnt_mp;
|
|
|
|
|
struct mount *old_parent = mnt->mnt_parent;
|
|
|
|
|
|
|
|
|
|
list_del_init(&mnt->mnt_child);
|
|
|
|
|
hlist_del_init(&mnt->mnt_mp_list);
|
|
|
|
|
hlist_del_init_rcu(&mnt->mnt_hash);
|
|
|
|
|
|
2025-04-25 12:40:28 -04:00
|
|
|
attach_mnt(mnt, parent, mp);
|
2017-01-20 18:28:35 +13:00
|
|
|
|
|
|
|
|
put_mountpoint(old_mp);
|
|
|
|
|
mnt_add_count(old_parent, -1);
|
2014-08-10 03:44:55 -04:00
|
|
|
}
|
|
|
|
|
|
2023-10-25 16:02:00 +02:00
|
|
|
static inline struct mount *node_to_mount(struct rb_node *node)
|
|
|
|
|
{
|
2023-10-25 16:02:03 +02:00
|
|
|
return node ? rb_entry(node, struct mount, mnt_node) : NULL;
|
2023-10-25 16:02:00 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
|
|
|
|
|
{
|
|
|
|
|
struct rb_node **link = &ns->mounts.rb_node;
|
|
|
|
|
struct rb_node *parent = NULL;
|
2024-12-15 21:17:06 +01:00
|
|
|
bool mnt_first_node = true, mnt_last_node = true;
|
2023-10-25 16:02:00 +02:00
|
|
|
|
2024-12-15 21:17:05 +01:00
|
|
|
WARN_ON(mnt_ns_attached(mnt));
|
2023-10-25 16:02:00 +02:00
|
|
|
mnt->mnt_ns = ns;
|
|
|
|
|
while (*link) {
|
|
|
|
|
parent = *link;
|
2024-12-15 21:17:06 +01:00
|
|
|
if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
|
2023-10-25 16:02:00 +02:00
|
|
|
link = &parent->rb_left;
|
2024-12-15 21:17:06 +01:00
|
|
|
mnt_last_node = false;
|
|
|
|
|
} else {
|
2023-10-25 16:02:00 +02:00
|
|
|
link = &parent->rb_right;
|
2024-12-15 21:17:06 +01:00
|
|
|
mnt_first_node = false;
|
|
|
|
|
}
|
2023-10-25 16:02:00 +02:00
|
|
|
}
|
2024-12-15 21:17:06 +01:00
|
|
|
|
|
|
|
|
if (mnt_last_node)
|
|
|
|
|
ns->mnt_last_node = &mnt->mnt_node;
|
|
|
|
|
if (mnt_first_node)
|
|
|
|
|
ns->mnt_first_node = &mnt->mnt_node;
|
2023-10-25 16:02:00 +02:00
|
|
|
rb_link_node(&mnt->mnt_node, parent, link);
|
|
|
|
|
rb_insert_color(&mnt->mnt_node, &ns->mounts);
|
2025-01-29 17:58:01 +01:00
|
|
|
|
|
|
|
|
mnt_notify_add(mnt);
|
2023-10-25 16:02:00 +02:00
|
|
|
}
|
|
|
|
|
|
2005-11-07 17:19:50 -05:00
|
|
|
/*
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability will not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
* vfsmount lock must be held for write
|
2005-11-07 17:19:50 -05:00
|
|
|
*/
|
2017-01-20 18:28:35 +13:00
|
|
|
static void commit_tree(struct mount *mnt)
|
2005-11-07 17:19:50 -05:00
|
|
|
{
|
2011-11-24 22:19:58 -05:00
|
|
|
struct mount *parent = mnt->mnt_parent;
|
2011-11-24 22:37:54 -05:00
|
|
|
struct mount *m;
|
2005-11-07 17:19:50 -05:00
|
|
|
LIST_HEAD(head);
|
2011-11-25 00:46:35 -05:00
|
|
|
struct mnt_namespace *n = parent->mnt_ns;
|
2005-11-07 17:19:50 -05:00
|
|
|
|
2011-11-24 22:19:58 -05:00
|
|
|
BUG_ON(parent == mnt);
|
2005-11-07 17:19:50 -05:00
|
|
|
|
2025-04-25 22:34:33 -04:00
|
|
|
if (!mnt_ns_attached(mnt)) {
|
|
|
|
|
list_add_tail(&head, &mnt->mnt_list);
|
|
|
|
|
while (!list_empty(&head)) {
|
|
|
|
|
m = list_first_entry(&head, typeof(*m), mnt_list);
|
|
|
|
|
list_del(&m->mnt_list);
|
2011-01-14 22:30:21 -05:00
|
|
|
|
2025-04-25 22:34:33 -04:00
|
|
|
mnt_add_to_ns(n, m);
|
|
|
|
|
}
|
|
|
|
|
n->nr_mounts += n->pending_mounts;
|
|
|
|
|
n->pending_mounts = 0;
|
2023-10-25 16:02:00 +02:00
|
|
|
}
|
2016-09-28 00:27:17 -05:00
|
|
|
|
2025-06-17 21:10:02 -04:00
|
|
|
make_visible(mnt);
|
2006-12-08 02:37:56 -08:00
|
|
|
touch_mnt_namespace(n);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2011-11-25 03:06:56 -05:00
|
|
|
static struct mount *next_mnt(struct mount *p, struct mount *root)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-11-24 23:24:33 -05:00
|
|
|
struct list_head *next = p->mnt_mounts.next;
|
|
|
|
|
if (next == &p->mnt_mounts) {
|
2005-04-16 15:20:36 -07:00
|
|
|
while (1) {
|
2011-11-25 03:06:56 -05:00
|
|
|
if (p == root)
|
2005-04-16 15:20:36 -07:00
|
|
|
return NULL;
|
2011-11-24 23:24:33 -05:00
|
|
|
next = p->mnt_child.next;
|
|
|
|
|
if (next != &p->mnt_parent->mnt_mounts)
|
2005-04-16 15:20:36 -07:00
|
|
|
break;
|
2011-11-24 22:19:58 -05:00
|
|
|
p = p->mnt_parent;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
}
|
2011-11-24 23:24:33 -05:00
|
|
|
return list_entry(next, struct mount, mnt_child);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2011-11-24 18:57:30 -05:00
|
|
|
static struct mount *skip_mnt_tree(struct mount *p)
|
2005-11-07 17:21:20 -05:00
|
|
|
{
|
2011-11-24 23:24:33 -05:00
|
|
|
struct list_head *prev = p->mnt_mounts.prev;
|
|
|
|
|
while (prev != &p->mnt_mounts) {
|
|
|
|
|
p = list_entry(prev, struct mount, mnt_child);
|
|
|
|
|
prev = p->mnt_mounts.prev;
|
2005-11-07 17:21:20 -05:00
|
|
|
}
|
|
|
|
|
return p;
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-04 06:48:34 -05:00
|
|
|
/**
|
|
|
|
|
* vfs_create_mount - Create a mount for a configured superblock
|
|
|
|
|
* @fc: The configuration context with the superblock attached
|
|
|
|
|
*
|
|
|
|
|
* Create a mount to an already configured superblock. If necessary, the
|
|
|
|
|
* caller should invoke vfs_get_tree() before calling this.
|
|
|
|
|
*
|
|
|
|
|
* Note that this does not attach the mount to anything.
|
|
|
|
|
*/
|
|
|
|
|
struct vfsmount *vfs_create_mount(struct fs_context *fc)
|
2011-03-17 22:08:28 -04:00
|
|
|
{
|
2011-11-24 20:38:33 -05:00
|
|
|
struct mount *mnt;
|
2011-03-17 22:08:28 -04:00
|
|
|
|
2018-11-04 06:48:34 -05:00
|
|
|
if (!fc->root)
|
|
|
|
|
return ERR_PTR(-EINVAL);
|
2011-03-17 22:08:28 -04:00
|
|
|
|
2025-04-21 04:35:09 +01:00
|
|
|
mnt = alloc_vfsmnt(fc->source);
|
2011-03-17 22:08:28 -04:00
|
|
|
if (!mnt)
|
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
2018-11-04 06:48:34 -05:00
|
|
|
if (fc->sb_flags & SB_KERNMOUNT)
|
2011-11-24 20:38:33 -05:00
|
|
|
mnt->mnt.mnt_flags = MNT_INTERNAL;
|
2011-03-17 22:08:28 -04:00
|
|
|
|
2018-11-04 06:48:34 -05:00
|
|
|
atomic_inc(&fc->root->d_sb->s_active);
|
|
|
|
|
mnt->mnt.mnt_sb = fc->root->d_sb;
|
|
|
|
|
mnt->mnt.mnt_root = dget(fc->root);
|
|
|
|
|
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
|
|
|
|
|
mnt->mnt_parent = mnt;
|
2011-03-17 22:08:28 -04:00
|
|
|
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
2018-11-04 06:48:34 -05:00
|
|
|
list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2011-11-24 20:38:33 -05:00
|
|
|
return &mnt->mnt;
|
2011-03-17 22:08:28 -04:00
|
|
|
}
|
2018-11-04 06:48:34 -05:00
|
|
|
EXPORT_SYMBOL(vfs_create_mount);
|
|
|
|
|
|
|
|
|
|
struct vfsmount *fc_mount(struct fs_context *fc)
|
|
|
|
|
{
|
|
|
|
|
int err = vfs_get_tree(fc);
|
|
|
|
|
if (!err) {
|
|
|
|
|
up_write(&fc->root->d_sb->s_umount);
|
|
|
|
|
return vfs_create_mount(fc);
|
|
|
|
|
}
|
|
|
|
|
return ERR_PTR(err);
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(fc_mount);
|
|
|
|
|
|
sanitize handling of long-term internal mounts
Original rationale for those had been the reduced cost of mntput()
for the stuff that is mounted somewhere. Mount refcount increments and
decrements are frequent; what's worse, they tend to concentrate on the
same instances and cacheline pingpong is quite noticeable.
As the result, mount refcounts are per-cpu; that allows a very cheap
increment. Plain decrement would be just as easy, but decrement-and-test
is anything but (we need to add the components up, with exclusion against
possible increment-from-zero, etc.).
Fortunately, there is a very common case where we can tell that decrement
won't be the final one - if the thing we are dropping is currently
mounted somewhere. We have an RCU delay between the removal from mount
tree and dropping the reference that used to pin it there, so we can
just take rcu_read_lock() and check if the victim is mounted somewhere.
If it is, we can go ahead and decrement without any further checks -
the reference we are dropping is not the last one. If it isn't, we
get all the fun with locking, carefully adding up components, etc.,
but the majority of refcount decrements end up taking the fast path.
There is a major exception, though - pipes and sockets. Those live
on the internal filesystems that are not going to be mounted anywhere.
They are not going to be _un_mounted, of course, so having to take the
slow path every time a pipe or socket gets closed is really obnoxious.
Solution had been to mark them as long-lived ones - essentially faking
"they are mounted somewhere" indicator.
With minor modification that works even for ones that do eventually get
dropped - all it takes is making sure we have an RCU delay between
clearing the "mounted somewhere" indicator and dropping the reference.
There are some additional twists (if you want to drop a dozen of such
internal mounts, you'd be better off with clearing the indicator on
all of them, doing an RCU delay once, then dropping the references),
but in the basic form it had been
* use kern_mount() if you want your internal mount to be
a long-term one.
* use kern_unmount() to undo that.
Unfortunately, the things did rot a bit during the mount API reshuffling.
In several cases we have lost the "fake the indicator" part; kern_unmount()
on the unmount side remained (it doesn't warn if you use it on a mount
without the indicator), but all benefits regarding mntput() cost had been
lost.
To get rid of that bitrot, let's add a new helper that would work
with fs_context-based API: fc_mount_longterm(). It's a counterpart
of fc_mount() that does, on success, mark its result as long-term.
It must be paired with kern_unmount() or equivalents.
Converted:
1) mqueue (it used to use kern_mount_data() and the umount side
is still as it used to be)
2) hugetlbfs (used to use kern_mount_data(), internal mount is
never unmounted in this one)
3) i915 gemfs (used to be kern_mount() + manual remount to set
options, still uses kern_unmount() on umount side)
4) v3d gemfs (copied from i915)
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-05-02 21:32:01 -04:00
|
|
|
struct vfsmount *fc_mount_longterm(struct fs_context *fc)
|
|
|
|
|
{
|
|
|
|
|
struct vfsmount *mnt = fc_mount(fc);
|
|
|
|
|
if (!IS_ERR(mnt))
|
|
|
|
|
real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
|
|
|
|
|
return mnt;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(fc_mount_longterm);
|
|
|
|
|
|
2018-11-04 03:19:03 -05:00
|
|
|
struct vfsmount *vfs_kern_mount(struct file_system_type *type,
|
|
|
|
|
int flags, const char *name,
|
|
|
|
|
void *data)
|
2011-03-17 22:08:28 -04:00
|
|
|
{
|
2018-11-04 03:19:03 -05:00
|
|
|
struct fs_context *fc;
|
2018-11-04 06:48:34 -05:00
|
|
|
struct vfsmount *mnt;
|
2018-11-04 03:19:03 -05:00
|
|
|
int ret = 0;
|
2011-03-17 22:08:28 -04:00
|
|
|
|
|
|
|
|
if (!type)
|
2018-11-01 23:07:25 +00:00
|
|
|
return ERR_PTR(-EINVAL);
|
2011-03-17 22:08:28 -04:00
|
|
|
|
2018-11-04 03:19:03 -05:00
|
|
|
fc = fs_context_for_mount(type, flags);
|
|
|
|
|
if (IS_ERR(fc))
|
|
|
|
|
return ERR_CAST(fc);
|
|
|
|
|
|
2018-11-01 23:07:25 +00:00
|
|
|
if (name)
|
|
|
|
|
ret = vfs_parse_fs_string(fc, "source",
|
|
|
|
|
name, strlen(name));
|
2018-11-04 03:19:03 -05:00
|
|
|
if (!ret)
|
|
|
|
|
ret = parse_monolithic_mount_data(fc, data);
|
|
|
|
|
if (!ret)
|
2018-11-04 06:48:34 -05:00
|
|
|
mnt = fc_mount(fc);
|
|
|
|
|
else
|
|
|
|
|
mnt = ERR_PTR(ret);
|
2011-03-17 22:08:28 -04:00
|
|
|
|
2018-11-04 03:19:03 -05:00
|
|
|
put_fs_context(fc);
|
2018-11-04 06:48:34 -05:00
|
|
|
return mnt;
|
2011-03-17 22:08:28 -04:00
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL_GPL(vfs_kern_mount);
|
|
|
|
|
|
2011-11-24 21:24:27 -05:00
|
|
|
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
|
2005-11-07 17:17:22 -05:00
|
|
|
int flag)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-11-24 21:24:27 -05:00
|
|
|
struct super_block *sb = old->mnt.mnt_sb;
|
2012-06-25 12:55:18 +01:00
|
|
|
struct mount *mnt;
|
|
|
|
|
int err;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2012-06-25 12:55:18 +01:00
|
|
|
mnt = alloc_vfsmnt(old->mnt_devname);
|
|
|
|
|
if (!mnt)
|
|
|
|
|
return ERR_PTR(-ENOMEM);
|
2008-03-27 13:06:23 +01:00
|
|
|
|
2025-05-07 14:05:50 -04:00
|
|
|
mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) &
|
|
|
|
|
~MNT_INTERNAL_FLAGS;
|
|
|
|
|
|
2012-07-31 13:13:04 -07:00
|
|
|
if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
|
2012-06-25 12:55:18 +01:00
|
|
|
mnt->mnt_group_id = 0; /* not a peer of original */
|
|
|
|
|
else
|
|
|
|
|
mnt->mnt_group_id = old->mnt_group_id;
|
2005-11-07 17:19:50 -05:00
|
|
|
|
2012-06-25 12:55:18 +01:00
|
|
|
if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
|
|
|
|
|
err = mnt_alloc_group_id(mnt);
|
|
|
|
|
if (err)
|
|
|
|
|
goto out_free;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2012-06-25 12:55:18 +01:00
|
|
|
|
2025-05-07 14:05:50 -04:00
|
|
|
if (mnt->mnt_group_id)
|
|
|
|
|
set_mnt_shared(mnt);
|
2013-03-29 21:04:39 -07:00
|
|
|
|
2012-06-25 12:55:18 +01:00
|
|
|
atomic_inc(&sb->s_active);
|
2022-10-26 12:51:27 +02:00
|
|
|
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
|
|
|
|
|
|
2012-06-25 12:55:18 +01:00
|
|
|
mnt->mnt.mnt_sb = sb;
|
|
|
|
|
mnt->mnt.mnt_root = dget(root);
|
|
|
|
|
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
|
|
|
|
|
mnt->mnt_parent = mnt;
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
2012-06-25 12:55:18 +01:00
|
|
|
list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2012-06-25 12:55:18 +01:00
|
|
|
|
2025-05-07 14:05:50 -04:00
|
|
|
if (flag & CL_PRIVATE) // we are done with it
|
|
|
|
|
return mnt;
|
|
|
|
|
|
|
|
|
|
if (peers(mnt, old))
|
|
|
|
|
list_add(&mnt->mnt_share, &old->mnt_share);
|
|
|
|
|
|
2012-07-31 13:13:04 -07:00
|
|
|
if ((flag & CL_SLAVE) ||
|
|
|
|
|
((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
|
2012-06-25 12:55:18 +01:00
|
|
|
list_add(&mnt->mnt_slave, &old->mnt_slave_list);
|
|
|
|
|
mnt->mnt_master = old;
|
2025-05-07 14:05:50 -04:00
|
|
|
} else if (IS_MNT_SLAVE(old)) {
|
|
|
|
|
list_add(&mnt->mnt_slave, &old->mnt_slave);
|
2012-06-25 12:55:18 +01:00
|
|
|
mnt->mnt_master = old->mnt_master;
|
|
|
|
|
}
|
2011-11-24 20:55:08 -05:00
|
|
|
return mnt;
|
2008-03-27 13:06:23 +01:00
|
|
|
|
|
|
|
|
out_free:
|
2014-01-24 12:17:54 +00:00
|
|
|
mnt_free_id(mnt);
|
2008-03-27 13:06:23 +01:00
|
|
|
free_vfsmnt(mnt);
|
2012-06-25 12:55:18 +01:00
|
|
|
return ERR_PTR(err);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2014-08-08 13:08:20 -04:00
|
|
|
static void cleanup_mnt(struct mount *mnt)
|
|
|
|
|
{
|
2019-07-04 16:57:51 -04:00
|
|
|
struct hlist_node *p;
|
|
|
|
|
struct mount *m;
|
2014-08-08 13:08:20 -04:00
|
|
|
/*
|
2019-07-04 16:57:51 -04:00
|
|
|
* The warning here probably indicates that somebody messed
|
|
|
|
|
* up a mnt_want/drop_write() pair. If this happens, the
|
|
|
|
|
* filesystem was probably unable to make r/w->r/o transitions.
|
2014-08-08 13:08:20 -04:00
|
|
|
* The locking used to deal with mnt_count decrement provides barriers,
|
|
|
|
|
* so mnt_get_writers() below is safe.
|
|
|
|
|
*/
|
|
|
|
|
WARN_ON(mnt_get_writers(mnt));
|
|
|
|
|
if (unlikely(mnt->mnt_pins.first))
|
|
|
|
|
mnt_pin_kill(mnt);
|
2019-07-04 16:57:51 -04:00
|
|
|
hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
|
|
|
|
|
hlist_del(&m->mnt_umount);
|
|
|
|
|
mntput(&m->mnt);
|
|
|
|
|
}
|
2014-08-08 13:08:20 -04:00
|
|
|
fsnotify_vfsmount_delete(&mnt->mnt);
|
|
|
|
|
dput(mnt->mnt.mnt_root);
|
|
|
|
|
deactivate_super(mnt->mnt.mnt_sb);
|
|
|
|
|
mnt_free_id(mnt);
|
|
|
|
|
call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void __cleanup_mnt(struct rcu_head *head)
|
|
|
|
|
{
|
|
|
|
|
cleanup_mnt(container_of(head, struct mount, mnt_rcu));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static LLIST_HEAD(delayed_mntput_list);
|
|
|
|
|
static void delayed_mntput(struct work_struct *unused)
|
|
|
|
|
{
|
|
|
|
|
struct llist_node *node = llist_del_all(&delayed_mntput_list);
|
2017-08-07 17:44:45 +09:00
|
|
|
struct mount *m, *t;
|
2014-08-08 13:08:20 -04:00
|
|
|
|
2017-08-07 17:44:45 +09:00
|
|
|
llist_for_each_entry_safe(m, t, node, mnt_llist)
|
|
|
|
|
cleanup_mnt(m);
|
2014-08-08 13:08:20 -04:00
|
|
|
}
|
|
|
|
|
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
|
|
|
|
|
|
2011-11-25 00:33:11 -05:00
|
|
|
static void mntput_no_expire(struct mount *mnt)
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only take the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
{
|
2019-06-30 10:39:08 -04:00
|
|
|
LIST_HEAD(list);
|
2020-10-31 21:40:21 -07:00
|
|
|
int count;
|
2019-06-30 10:39:08 -04:00
|
|
|
|
2013-09-29 22:06:07 -04:00
|
|
|
rcu_read_lock();
|
fix mntput/mntput race
mntput_no_expire() does the calculation of total refcount under mount_lock;
unfortunately, the decrement (as well as all increments) are done outside
of it, leading to false positives in the "are we dropping the last reference"
test. Consider the following situation:
* mnt is a lazy-umounted mount, kept alive by two opened files. One
of those files gets closed. Total refcount of mnt is 2. On CPU 42
mntput(mnt) (called from __fput()) drops one reference, decrementing component #42.
* After it has looked at component #0, the process on CPU 0 does
mntget(), incrementing component #0, gets preempted and gets to run again -
on CPU 69. There it does mntput(), which drops the reference (component #69)
and proceeds to spin on mount_lock.
* On CPU 42 our first mntput() finishes counting. It observes the
decrement of component #69, but not the increment of component #0. As the
result, the total it gets is not 1 as it should've been - it's 0. At which
point we decide that vfsmount needs to be killed and proceed to free it and
shut the filesystem down. However, there's still another opened file
on that filesystem, with reference to (now freed) vfsmount, etc. and we are
screwed.
It's not a wide race, but it can be reproduced with artificial slowdown of
the mnt_get_count() loop, and it should be easier to hit on SMP KVM setups.
Fix consists of moving the refcount decrement under mount_lock; the tricky
part is that we want (and can) keep the fast case (i.e. mount that still
has non-NULL ->mnt_ns) entirely out of mount_lock. All places that zero
mnt->mnt_ns are dropping some reference to mnt and they call synchronize_rcu()
before that mntput(). IOW, if mntput() observes (under rcu_read_lock())
a non-NULL ->mnt_ns, it is guaranteed that there is another reference yet to
be dropped.
Reported-by: Jann Horn <jannh@google.com>
Tested-by: Jann Horn <jannh@google.com>
Fixes: 48a066e72d97 ("RCU'd vsfmounts")
Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-08-09 17:21:17 -04:00
|
|
|
if (likely(READ_ONCE(mnt->mnt_ns))) {
|
|
|
|
|
/*
|
|
|
|
|
* Since we don't do lock_mount_hash() here,
|
|
|
|
|
* ->mnt_ns can change under us. However, if it's
|
|
|
|
|
* non-NULL, then there's a reference that won't
|
|
|
|
|
* be dropped until after an RCU delay done after
|
|
|
|
|
* turning ->mnt_ns NULL. So if we observe it
|
|
|
|
|
* non-NULL under rcu_read_lock(), the reference
|
|
|
|
|
* we are dropping is not the final one.
|
|
|
|
|
*/
|
|
|
|
|
mnt_add_count(mnt, -1);
|
2013-09-29 22:06:07 -04:00
|
|
|
rcu_read_unlock();
|
2011-01-14 22:30:21 -05:00
|
|
|
return;
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
}
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
2018-08-09 17:51:32 -04:00
|
|
|
/*
|
|
|
|
|
* make sure that if __legitimize_mnt() has not seen us grab
|
|
|
|
|
* mount_lock, we'll see their refcount increment here.
|
|
|
|
|
*/
|
|
|
|
|
smp_mb();
|
fix mntput/mntput race
mntput_no_expire() does the calculation of total refcount under mount_lock;
unfortunately, the decrement (as well as all increments) are done outside
of it, leading to false positives in the "are we dropping the last reference"
test. Consider the following situation:
* mnt is a lazy-umounted mount, kept alive by two opened files. One
of those files gets closed. Total refcount of mnt is 2. On CPU 42
mntput(mnt) (called from __fput()) drops one reference, decrementing component #42.
* After it has looked at component #0, the process on CPU 0 does
mntget(), incrementing component #0, gets preempted and gets to run again -
on CPU 69. There it does mntput(), which drops the reference (component #69)
and proceeds to spin on mount_lock.
* On CPU 42 our first mntput() finishes counting. It observes the
decrement of component #69, but not the increment of component #0. As the
result, the total it gets is not 1 as it should've been - it's 0. At which
point we decide that vfsmount needs to be killed and proceed to free it and
shut the filesystem down. However, there's still another opened file
on that filesystem, with reference to (now freed) vfsmount, etc. and we are
screwed.
It's not a wide race, but it can be reproduced with artificial slowdown of
the mnt_get_count() loop, and it should be easier to hit on SMP KVM setups.
Fix consists of moving the refcount decrement under mount_lock; the tricky
part is that we want (and can) keep the fast case (i.e. mount that still
has non-NULL ->mnt_ns) entirely out of mount_lock. All places that zero
mnt->mnt_ns are dropping some reference to mnt and they call synchronize_rcu()
before that mntput(). IOW, if mntput() observes (under rcu_read_lock())
a non-NULL ->mnt_ns, it is guaranteed that there is another reference yet to
be dropped.
Reported-by: Jann Horn <jannh@google.com>
Tested-by: Jann Horn <jannh@google.com>
Fixes: 48a066e72d97 ("RCU'd vsfmounts")
Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-08-09 17:21:17 -04:00
|
|
|
mnt_add_count(mnt, -1);
|
2020-10-31 21:40:21 -07:00
|
|
|
count = mnt_get_count(mnt);
|
|
|
|
|
if (count != 0) {
|
|
|
|
|
WARN_ON(count < 0);
|
2013-09-29 22:06:07 -04:00
|
|
|
rcu_read_unlock();
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability will not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
return;
|
|
|
|
|
}
|
2013-09-29 22:06:07 -04:00
|
|
|
if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
|
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
unlock_mount_hash();
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
mnt->mnt.mnt_flags |= MNT_DOOMED;
|
|
|
|
|
rcu_read_unlock();
|
2012-05-08 13:32:02 +09:30
|
|
|
|
2011-11-21 12:11:30 +01:00
|
|
|
list_del(&mnt->mnt_instance);
|
2025-05-01 20:40:57 -04:00
|
|
|
if (unlikely(!list_empty(&mnt->mnt_expire)))
|
|
|
|
|
list_del(&mnt->mnt_expire);
|
2014-12-23 21:37:03 -06:00
|
|
|
|
|
|
|
|
if (unlikely(!list_empty(&mnt->mnt_mounts))) {
|
|
|
|
|
struct mount *p, *tmp;
|
|
|
|
|
list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
|
2019-06-30 10:39:08 -04:00
|
|
|
__put_mountpoint(unhash_mnt(p), &list);
|
2019-07-04 16:57:51 -04:00
|
|
|
hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
|
2014-12-23 21:37:03 -06:00
|
|
|
}
|
|
|
|
|
}
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2019-06-30 10:39:08 -04:00
|
|
|
shrink_dentry_list(&list);
|
2013-09-28 12:41:25 -04:00
|
|
|
|
2014-08-08 13:08:20 -04:00
|
|
|
if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
|
|
|
|
|
struct task_struct *task = current;
|
|
|
|
|
if (likely(!(task->flags & PF_KTHREAD))) {
|
|
|
|
|
init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
|
task_work: cleanup notification modes
A previous commit changed the notification mode from true/false to an
int, allowing notify-no, notify-yes, or signal-notify. This was
backwards compatible in the sense that any existing true/false user
would translate to either 0 (on notification sent) or 1, the latter
which mapped to TWA_RESUME. TWA_SIGNAL was assigned a value of 2.
Clean this up properly, and define a proper enum for the notification
mode. Now we have:
- TWA_NONE. This is 0, same as before the original change, meaning no
notification requested.
- TWA_RESUME. This is 1, same as before the original change, meaning
that we use TIF_NOTIFY_RESUME.
- TWA_SIGNAL. This uses TIF_SIGPENDING/JOBCTL_TASK_WORK for the
notification.
Clean up all the callers, switching their 0/1/false/true to using the
appropriate TWA_* mode for notifications.
Fixes: e91b48162332 ("task_work: teach task_work_add() to do signal_wake_up()")
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-10-16 09:02:26 -06:00
|
|
|
if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
|
2014-08-08 13:08:20 -04:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
|
|
|
|
|
schedule_delayed_work(&delayed_mntput_work, 1);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
cleanup_mnt(mnt);
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void mntput(struct vfsmount *mnt)
|
|
|
|
|
{
|
|
|
|
|
if (mnt) {
|
2011-11-25 00:57:42 -05:00
|
|
|
struct mount *m = real_mount(mnt);
|
2023-10-04 13:19:16 +02:00
|
|
|
/* avoid cacheline pingpong */
|
2011-11-25 00:57:42 -05:00
|
|
|
if (unlikely(m->mnt_expiry_mark))
|
2023-10-04 13:19:16 +02:00
|
|
|
WRITE_ONCE(m->mnt_expiry_mark, 0);
|
2011-11-25 00:57:42 -05:00
|
|
|
mntput_no_expire(m);
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(mntput);
|
|
|
|
|
|
|
|
|
|
struct vfsmount *mntget(struct vfsmount *mnt)
|
|
|
|
|
{
|
|
|
|
|
if (mnt)
|
2011-11-24 22:37:54 -05:00
|
|
|
mnt_add_count(real_mount(mnt), 1);
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
return mnt;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(mntget);
|
|
|
|
|
|
2023-01-27 13:46:51 -05:00
|
|
|
/*
|
|
|
|
|
* Make a mount point inaccessible to new lookups.
|
|
|
|
|
* Because there may still be current users, the caller MUST WAIT
|
|
|
|
|
* for an RCU grace period before destroying the mount point.
|
|
|
|
|
*/
|
|
|
|
|
void mnt_make_shortterm(struct vfsmount *mnt)
|
|
|
|
|
{
|
|
|
|
|
if (mnt)
|
|
|
|
|
real_mount(mnt)->mnt_ns = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2021-03-17 19:52:25 -07:00
|
|
|
/**
|
|
|
|
|
* path_is_mountpoint() - Check if path is a mount in the current namespace.
|
|
|
|
|
* @path: path to check
|
2016-11-24 08:03:41 +11:00
|
|
|
*
|
|
|
|
|
* d_mountpoint() can only be used reliably to establish if a dentry is
|
|
|
|
|
* not mounted in any namespace and that common case is handled inline.
|
|
|
|
|
* d_mountpoint() isn't aware of the possibility there may be multiple
|
|
|
|
|
* mounts using a given dentry in a different namespace. This function
|
|
|
|
|
* checks if the passed in path is a mountpoint rather than the dentry
|
|
|
|
|
* alone.
|
|
|
|
|
*/
|
|
|
|
|
bool path_is_mountpoint(const struct path *path)
|
|
|
|
|
{
|
|
|
|
|
unsigned seq;
|
|
|
|
|
bool res;
|
|
|
|
|
|
|
|
|
|
if (!d_mountpoint(path->dentry))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
|
do {
|
|
|
|
|
seq = read_seqbegin(&mount_lock);
|
|
|
|
|
res = __path_is_mountpoint(path);
|
|
|
|
|
} while (read_seqretry(&mount_lock, seq));
|
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(path_is_mountpoint);
|
|
|
|
|
|
2016-11-20 19:45:28 -05:00
|
|
|
struct vfsmount *mnt_clone_internal(const struct path *path)
|
2005-11-07 17:13:39 -05:00
|
|
|
{
|
2014-08-07 09:12:31 -04:00
|
|
|
struct mount *p;
|
|
|
|
|
p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
|
|
|
|
|
if (IS_ERR(p))
|
|
|
|
|
return ERR_CAST(p);
|
|
|
|
|
p->mnt.mnt_flags |= MNT_INTERNAL;
|
|
|
|
|
return &p->mnt;
|
2005-11-07 17:13:39 -05:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2023-10-25 16:02:00 +02:00
|
|
|
/*
|
|
|
|
|
* Returns the mount which either has the specified mnt_id, or has the next
|
|
|
|
|
* smallest id after the specified one.
|
|
|
|
|
*/
|
|
|
|
|
static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
{
	struct mount *best = NULL;
	struct rb_node *cur;

	/* walk down ns->mounts keeping the lowest candidate with id >= mnt_id */
	for (cur = ns->mounts.rb_node; cur; ) {
		struct mount *cand = node_to_mount(cur);

		if (cand->mnt_id_unique < mnt_id) {
			/* too small: anything suitable is to the right */
			cur = cur->rb_right;
			continue;
		}

		best = cand;
		if (cand->mnt_id_unique == mnt_id)
			break;		/* exact match */

		/* a closer candidate may still exist to the left */
		cur = cur->rb_left;
	}

	return best;
}
|
|
|
|
|
|
2024-06-07 16:55:37 +02:00
|
|
|
/*
|
|
|
|
|
* Returns the mount which either has the specified mnt_id, or has the next
|
|
|
|
|
* largest id before the specified one.
|
|
|
|
|
*/
|
|
|
|
|
static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id)
{
	struct mount *best = NULL;
	struct rb_node *cur;

	/* walk down ns->mounts keeping the highest candidate with id <= mnt_id */
	for (cur = ns->mounts.rb_node; cur; ) {
		struct mount *cand = node_to_mount(cur);

		if (cand->mnt_id_unique > mnt_id) {
			/* too large: anything suitable is to the left */
			cur = cur->rb_left;
			continue;
		}

		best = cand;
		if (cand->mnt_id_unique == mnt_id)
			break;		/* exact match */

		/* a closer candidate may still exist to the right */
		cur = cur->rb_right;
	}

	return best;
}
|
|
|
|
|
|
2023-10-25 16:02:00 +02:00
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
|
|
2011-12-06 12:21:54 -05:00
|
|
|
/* iterator; we want it to have access to namespace_sem, thus here... */
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * seq_file ->start: take namespace_sem for reading and position the
 * iterator on the mount found by mnt_find_id_at() for id *pos in the
 * namespace of this proc_mounts (NULL if there is none).
 */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_mounts *priv = m->private;
	struct mount *first;

	down_read(&namespace_sem);
	first = mnt_find_id_at(priv->ns, *pos);

	return first;
}
|
|
|
|
|
|
|
|
|
|
/*
 * seq_file ->next: step from mount @v to its rb-tree successor.
 *
 * *pos is set to the unique id of the mount returned; when there is no
 * successor it is simply incremented and NULL is returned.
 */
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct mount *cur = v;
	struct rb_node *succ = rb_next(&cur->mnt_node);
	struct mount *next;

	++*pos;
	if (!succ)
		return NULL;

	next = node_to_mount(succ);
	*pos = next->mnt_id_unique;

	return next;
}
|
|
|
|
|
|
|
|
|
|
static void m_stop(struct seq_file *m, void *v)
|
|
|
|
|
{
|
2005-11-07 17:17:51 -05:00
|
|
|
up_read(&namespace_sem);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2011-12-06 12:21:54 -05:00
|
|
|
static int m_show(struct seq_file *m, void *v)
|
2008-03-27 13:06:25 +01:00
|
|
|
{
|
fs: use seq_open_private() for proc_mounts
A patchset to remove support for passing pre-allocated struct seq_file to
seq_open(). Such feature is undocumented and prone to error.
In particular, if seq_release() is used in release handler, it will
kfree() a pointer which was not allocated by seq_open().
So this patchset drops support for pre-allocated struct seq_file: it's
only of use in proc_namespace.c and can be easily replaced by using
seq_open_private()/seq_release_private().
Additionally, it documents the use of file->private_data to hold pointer
to struct seq_file by seq_open().
This patch (of 3):
Since patch described below, from v2.6.15-rc1, seq_open() could use a
struct seq_file already allocated by the caller if the pointer to the
structure is stored in file->private_data before calling the function.
Commit 1abe77b0fc4b485927f1f798ae81a752677e1d05
Author: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon Nov 7 17:15:34 2005 -0500
[PATCH] allow callers of seq_open do allocation themselves
Allow caller of seq_open() to kmalloc() seq_file + whatever else they
want and set ->private_data to it. seq_open() will then abstain from
doing allocation itself.
Such behavior is only used by mounts_open_common().
In order to drop support for such uncommon feature, proc_mounts is
converted to use seq_open_private(), which take care of allocating the
proc_mounts structure, making it available through ->private in struct
seq_file.
Conversely, proc_mounts is converted to use seq_release_private(), in
order to release the private structure allocated by seq_open_private().
Then, ->private is used directly instead of proc_mounts() macro to access
to the proc_mounts structure.
Link: http://lkml.kernel.org/r/cover.1433193673.git.ydroneaud@opteya.com
Signed-off-by: Yann Droneaud <ydroneaud@opteya.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-06-30 14:57:30 -07:00
|
|
|
struct proc_mounts *p = m->private;
|
2020-05-14 16:44:24 +02:00
|
|
|
struct mount *r = v;
|
2011-12-06 12:21:54 -05:00
|
|
|
return p->show(m, &r->mnt);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2008-03-27 13:06:24 +01:00
|
|
|
const struct seq_operations mounts_op = {
|
2005-04-16 15:20:36 -07:00
|
|
|
.start = m_start,
|
|
|
|
|
.next = m_next,
|
|
|
|
|
.stop = m_stop,
|
2011-12-06 12:21:54 -05:00
|
|
|
.show = m_show,
|
2006-03-20 13:44:12 -05:00
|
|
|
};
|
2020-05-14 16:44:24 +02:00
|
|
|
|
2008-03-27 13:06:24 +01:00
|
|
|
#endif /* CONFIG_PROC_FS */
|
2006-03-20 13:44:12 -05:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/**
 * may_umount_tree - check if a mount tree is busy
 * @m: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 *
 * Return: 1 if the reference counts summed over the tree do not exceed
 * two per mount, 0 otherwise.
 */
int may_umount_tree(struct vfsmount *m)
{
	struct mount *root = real_mount(m);
	struct mount *cur;
	int refs_seen = 0;
	int refs_floor = 0;

	BUG_ON(!m);

	/* write lock needed for mnt_get_count */
	lock_mount_hash();
	cur = root;
	while (cur) {
		refs_seen += mnt_get_count(cur);
		refs_floor += 2;
		cur = next_mnt(cur, root);
	}
	unlock_mount_hash();

	return refs_seen <= refs_floor;
}
EXPORT_SYMBOL(may_umount_tree);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* may_umount - check if a mount point is busy
|
|
|
|
|
* @mnt: root of mount
|
|
|
|
|
*
|
|
|
|
|
* This is called to check if a mount point has any
|
|
|
|
|
* open files, pwds, chroots or sub mounts. If the
|
|
|
|
|
* mount has sub mounts this will return busy
|
|
|
|
|
* regardless of whether the sub mounts are busy.
|
|
|
|
|
*
|
|
|
|
|
* Doesn't take quota and stuff into account. IOW, in some cases it will
|
|
|
|
|
* give false negatives. The main reason why it's here is that we need
|
|
|
|
|
* a non-destructive way to look for easily umountable filesystems.
|
|
|
|
|
*/
|
|
|
|
|
int may_umount(struct vfsmount *mnt)
|
|
|
|
|
{
|
2006-03-27 01:14:51 -08:00
|
|
|
int ret = 1;
|
2010-01-16 12:56:08 -05:00
|
|
|
down_read(&namespace_sem);
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
2011-11-24 21:35:16 -05:00
|
|
|
if (propagate_mount_busy(real_mount(mnt), 2))
|
2006-03-27 01:14:51 -08:00
|
|
|
ret = 0;
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2010-01-16 12:56:08 -05:00
|
|
|
up_read(&namespace_sem);
|
2005-11-07 17:20:17 -05:00
|
|
|
return ret;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(may_umount);
|
|
|
|
|
|
2025-01-29 17:58:01 +01:00
|
|
|
#ifdef CONFIG_FSNOTIFY
|
|
|
|
|
static void mnt_notify(struct mount *p)
|
|
|
|
|
{
|
|
|
|
|
if (!p->prev_ns && p->mnt_ns) {
|
|
|
|
|
fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
|
|
|
|
|
} else if (p->prev_ns && !p->mnt_ns) {
|
|
|
|
|
fsnotify_mnt_detach(p->prev_ns, &p->mnt);
|
|
|
|
|
} else if (p->prev_ns == p->mnt_ns) {
|
|
|
|
|
fsnotify_mnt_move(p->mnt_ns, &p->mnt);
|
|
|
|
|
} else {
|
|
|
|
|
fsnotify_mnt_detach(p->prev_ns, &p->mnt);
|
|
|
|
|
fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
|
|
|
|
|
}
|
|
|
|
|
p->prev_ns = p->mnt_ns;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void notify_mnt_list(void)
|
|
|
|
|
{
|
|
|
|
|
struct mount *m, *tmp;
|
|
|
|
|
/*
|
|
|
|
|
* Notify about mounts that were added/reparented/detached/remain
|
|
|
|
|
* connected after unmount.
|
|
|
|
|
*/
|
|
|
|
|
list_for_each_entry_safe(m, tmp, ¬ify_list, to_notify) {
|
|
|
|
|
mnt_notify(m);
|
|
|
|
|
list_del_init(&m->to_notify);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static bool need_notify_mnt_list(void)
|
|
|
|
|
{
|
|
|
|
|
return !list_empty(¬ify_list);
|
|
|
|
|
}
|
|
|
|
|
#else
|
|
|
|
|
/* !CONFIG_FSNOTIFY: there is nothing to notify, so this is a no-op. */
static void notify_mnt_list(void)
{
}
|
|
|
|
|
|
|
|
|
|
/* !CONFIG_FSNOTIFY: notifications are never pending. */
static bool need_notify_mnt_list(void)
{
	return false;
}
|
|
|
|
|
#endif
|
|
|
|
|
|
2013-03-16 15:12:40 -04:00
|
|
|
static void namespace_unlock(void)
|
2005-11-07 17:17:04 -05:00
|
|
|
{
|
2015-04-02 20:33:53 -05:00
|
|
|
struct hlist_head head;
|
2019-07-04 16:57:51 -04:00
|
|
|
struct hlist_node *p;
|
|
|
|
|
struct mount *m;
|
2019-06-30 10:39:08 -04:00
|
|
|
LIST_HEAD(list);
|
2013-03-16 15:12:40 -04:00
|
|
|
|
2015-04-02 20:33:53 -05:00
|
|
|
hlist_move_list(&unmounted, &head);
|
2019-06-30 10:39:08 -04:00
|
|
|
list_splice_init(&ex_mountpoints, &list);
|
2013-03-16 15:12:40 -04:00
|
|
|
|
2025-01-29 17:58:01 +01:00
|
|
|
if (need_notify_mnt_list()) {
|
|
|
|
|
/*
|
|
|
|
|
* No point blocking out concurrent readers while notifications
|
|
|
|
|
* are sent. This will also allow statmount()/listmount() to run
|
|
|
|
|
* concurrently.
|
|
|
|
|
*/
|
|
|
|
|
downgrade_write(&namespace_sem);
|
|
|
|
|
notify_mnt_list();
|
|
|
|
|
up_read(&namespace_sem);
|
|
|
|
|
} else {
|
|
|
|
|
up_write(&namespace_sem);
|
|
|
|
|
}
|
2013-03-16 15:12:40 -04:00
|
|
|
|
2019-06-30 10:39:08 -04:00
|
|
|
shrink_dentry_list(&list);
|
|
|
|
|
|
2015-04-02 20:33:53 -05:00
|
|
|
if (likely(hlist_empty(&head)))
|
|
|
|
|
return;
|
|
|
|
|
|
2018-11-30 10:33:18 +11:00
|
|
|
synchronize_rcu_expedited();
|
2013-09-29 22:06:07 -04:00
|
|
|
|
2019-07-04 16:57:51 -04:00
|
|
|
hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
|
|
|
|
|
hlist_del(&m->mnt_umount);
|
|
|
|
|
mntput(&m->mnt);
|
|
|
|
|
}
|
2005-11-07 17:17:04 -05:00
|
|
|
}
|
|
|
|
|
|
2013-03-16 15:12:40 -04:00
|
|
|
static inline void namespace_lock(void)
|
2013-03-16 14:35:16 -04:00
|
|
|
{
|
2013-03-16 15:12:40 -04:00
|
|
|
down_write(&namespace_sem);
|
2013-03-16 14:35:16 -04:00
|
|
|
}
|
|
|
|
|
|
2025-04-10 17:05:42 +02:00
|
|
|
/*
 * Scope-based guard around the pair above: acquiring it runs
 * namespace_lock(), releasing it runs namespace_unlock(). Neither
 * expression uses the struct rw_semaphore * the guard type carries.
 * NOTE(review): guard semantics are those of DEFINE_GUARD() in
 * <linux/cleanup.h> - confirm there.
 */
DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock())
|
|
|
|
|
|
2014-12-24 07:20:01 -06:00
|
|
|
/*
 * Flags for umount_tree(); they are tested as a bit mask.
 *
 * UMOUNT_SYNC      - umount_tree() marks every mount MNT_SYNC_UMOUNT and
 *                    disconnect_mount() always disconnects.
 * UMOUNT_PROPAGATE - umount_tree() calls propagate_mount_unlock() and
 *                    propagate_umount() for the tree.
 * UMOUNT_CONNECTED - disconnect_mount() leaves a mount attached to a
 *                    parent that is itself marked MNT_UMOUNT.
 */
enum umount_tree_flags {
	UMOUNT_SYNC = 1,
	UMOUNT_PROPAGATE = 2,
	UMOUNT_CONNECTED = 4,
};
|
2015-07-17 14:15:30 -05:00
|
|
|
|
|
|
|
|
static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
|
|
|
|
|
{
|
|
|
|
|
/* Leaving mounts connected is only valid for lazy umounts */
|
|
|
|
|
if (how & UMOUNT_SYNC)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
/* A mount without a parent has nothing to be connected to */
|
|
|
|
|
if (!mnt_has_parent(mnt))
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
/* Because the reference counting rules change when mounts are
|
|
|
|
|
* unmounted and connected, umounted mounts may not be
|
|
|
|
|
* connected to mounted mounts.
|
|
|
|
|
*/
|
|
|
|
|
if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
/* Has it been requested that the mount remain connected? */
|
|
|
|
|
if (how & UMOUNT_CONNECTED)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* Is the mount locked such that it needs to remain connected? */
|
|
|
|
|
if (IS_MNT_LOCKED(mnt))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* By default disconnect the mount */
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability will not be
much improved in common cases yet, due to other locks (i.e. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
/*
|
2013-09-29 22:06:07 -04:00
|
|
|
* mount_lock must be held
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability is not not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
* namespace_sem must be held for write
|
|
|
|
|
*/
|
2014-12-24 07:20:01 -06:00
|
|
|
static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2014-12-18 13:10:48 -06:00
|
|
|
LIST_HEAD(tmp_list);
|
2011-11-24 18:57:30 -05:00
|
|
|
struct mount *p;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2015-01-03 05:39:35 -06:00
|
|
|
if (how & UMOUNT_PROPAGATE)
|
|
|
|
|
propagate_mount_unlock(mnt);
|
|
|
|
|
|
2014-12-18 13:10:48 -06:00
|
|
|
/* Gather the mounts to umount */
|
2014-12-22 18:30:08 -06:00
|
|
|
for (p = mnt; p; p = next_mnt(p, mnt)) {
|
|
|
|
|
p->mnt.mnt_flags |= MNT_UMOUNT;
|
2024-12-15 21:17:05 +01:00
|
|
|
if (mnt_ns_attached(p))
|
2023-10-25 16:02:00 +02:00
|
|
|
move_from_ns(p, &tmp_list);
|
|
|
|
|
else
|
|
|
|
|
list_move(&p->mnt_list, &tmp_list);
|
2014-12-22 18:30:08 -06:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2014-12-22 19:12:07 -06:00
|
|
|
/* Hide the mounts from mnt_mounts */
|
2014-12-18 13:10:48 -06:00
|
|
|
list_for_each_entry(p, &tmp_list, mnt_list) {
|
2014-08-18 15:09:26 -04:00
|
|
|
list_del_init(&p->mnt_child);
|
2014-12-18 13:10:48 -06:00
|
|
|
}
|
2014-08-18 15:09:26 -04:00
|
|
|
|
2024-08-06 11:47:10 +08:00
|
|
|
/* Add propagated mounts to the tmp_list */
|
2014-12-24 07:20:01 -06:00
|
|
|
if (how & UMOUNT_PROPAGATE)
|
2011-01-15 20:08:44 -05:00
|
|
|
propagate_umount(&tmp_list);
|
2005-11-07 17:20:17 -05:00
|
|
|
|
2014-12-18 13:10:48 -06:00
|
|
|
while (!list_empty(&tmp_list)) {
|
2016-09-28 00:27:17 -05:00
|
|
|
struct mnt_namespace *ns;
|
2014-12-23 21:37:03 -06:00
|
|
|
bool disconnect;
|
2014-12-18 13:10:48 -06:00
|
|
|
p = list_first_entry(&tmp_list, struct mount, mnt_list);
|
2011-11-25 00:22:05 -05:00
|
|
|
list_del_init(&p->mnt_expire);
|
2011-11-25 02:19:55 -05:00
|
|
|
list_del_init(&p->mnt_list);
|
2016-09-28 00:27:17 -05:00
|
|
|
ns = p->mnt_ns;
|
|
|
|
|
if (ns) {
|
2023-10-25 16:02:00 +02:00
|
|
|
ns->nr_mounts--;
|
2016-09-28 00:27:17 -05:00
|
|
|
__touch_mnt_namespace(ns);
|
|
|
|
|
}
|
2011-11-25 00:46:35 -05:00
|
|
|
p->mnt_ns = NULL;
|
2014-12-24 07:20:01 -06:00
|
|
|
if (how & UMOUNT_SYNC)
|
2013-09-29 22:06:07 -04:00
|
|
|
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
|
2015-01-10 19:01:08 -05:00
|
|
|
|
2015-07-17 14:15:30 -05:00
|
|
|
disconnect = disconnect_mount(p, how);
|
2011-11-24 21:47:05 -05:00
|
|
|
if (mnt_has_parent(p)) {
|
2014-08-30 18:32:05 -04:00
|
|
|
mnt_add_count(p->mnt_parent, -1);
|
2014-12-23 21:37:03 -06:00
|
|
|
if (!disconnect) {
|
|
|
|
|
/* Don't forget about p */
|
|
|
|
|
list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
|
|
|
|
|
} else {
|
|
|
|
|
umount_mnt(p);
|
|
|
|
|
}
|
2008-03-21 23:59:49 -04:00
|
|
|
}
|
2011-11-24 20:43:10 -05:00
|
|
|
change_mnt_propagation(p, MS_PRIVATE);
|
2019-07-24 12:45:46 -04:00
|
|
|
if (disconnect)
|
|
|
|
|
hlist_add_head(&p->mnt_umount, &unmounted);
|
2025-01-29 17:58:01 +01:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* At this point p->mnt_ns is NULL, notification will be queued
|
|
|
|
|
* only if
|
|
|
|
|
*
|
|
|
|
|
* - p->prev_ns is non-NULL *and*
|
|
|
|
|
* - p->prev_ns->n_fsnotify_marks is non-NULL
|
|
|
|
|
*
|
|
|
|
|
* This will preclude queuing the mount if this is a cleanup
|
|
|
|
|
* after a failed copy_tree() or destruction of an anonymous
|
|
|
|
|
* namespace, etc.
|
|
|
|
|
*/
|
|
|
|
|
mnt_notify_add(p);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2013-03-16 14:39:34 -04:00
|
|
|
/* Forward declaration: do_umount() below calls this ahead of its definition. */
static void shrink_submounts(struct mount *mnt);
|
2008-03-22 00:46:23 -04:00
|
|
|
|
2018-11-04 09:28:36 -05:00
|
|
|
static int do_umount_root(struct super_block *sb)
|
|
|
|
|
{
|
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
|
|
down_write(&sb->s_umount);
|
|
|
|
|
if (!sb_rdonly(sb)) {
|
|
|
|
|
struct fs_context *fc;
|
|
|
|
|
|
|
|
|
|
fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
|
|
|
|
|
SB_RDONLY);
|
|
|
|
|
if (IS_ERR(fc)) {
|
|
|
|
|
ret = PTR_ERR(fc);
|
|
|
|
|
} else {
|
|
|
|
|
ret = parse_monolithic_mount_data(fc, NULL);
|
|
|
|
|
if (!ret)
|
|
|
|
|
ret = reconfigure_super(fc);
|
|
|
|
|
put_fs_context(fc);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
up_write(&sb->s_umount);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2011-11-24 21:35:16 -05:00
|
|
|
static int do_umount(struct mount *mnt, int flags)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-11-24 21:35:16 -05:00
|
|
|
struct super_block *sb = mnt->mnt.mnt_sb;
|
2005-04-16 15:20:36 -07:00
|
|
|
int retval;
|
|
|
|
|
|
2011-11-24 21:35:16 -05:00
|
|
|
retval = security_sb_umount(&mnt->mnt, flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (retval)
|
|
|
|
|
return retval;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Allow userspace to request a mountpoint be expired rather than
|
|
|
|
|
* unmounting unconditionally. Unmount only happens if:
|
|
|
|
|
* (1) the mark is already set (the mark is cleared by mntput())
|
|
|
|
|
* (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
|
|
|
|
|
*/
|
|
|
|
|
if (flags & MNT_EXPIRE) {
|
2011-11-24 21:35:16 -05:00
|
|
|
if (&mnt->mnt == current->fs->root.mnt ||
|
2005-04-16 15:20:36 -07:00
|
|
|
flags & (MNT_FORCE | MNT_DETACH))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
/*
|
|
|
|
|
* probably don't strictly need the lock here if we examined
|
|
|
|
|
* all race cases, but it's a slowpath.
|
|
|
|
|
*/
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
2011-11-24 22:37:54 -05:00
|
|
|
if (mnt_get_count(mnt) != 2) {
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EBUSY;
|
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only taking the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00
|
|
|
}
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2011-11-25 00:57:42 -05:00
|
|
|
if (!xchg(&mnt->mnt_expiry_mark, 1))
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EAGAIN;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If we may have to abort operations to get out of this
|
|
|
|
|
* mount, and they will themselves hold resources we must
|
|
|
|
|
* allow the fs to do things. In the Unix tradition of
|
|
|
|
|
* 'Gee thats tricky lets do it in userspace' the umount_begin
|
|
|
|
|
* might fail to complete on the first run through as other tasks
|
|
|
|
|
* must return, and the like. Thats for the mount program to worry
|
|
|
|
|
* about for the moment.
|
|
|
|
|
*/
|
|
|
|
|
|
2008-04-24 07:21:56 -04:00
|
|
|
if (flags & MNT_FORCE && sb->s_op->umount_begin) {
|
|
|
|
|
sb->s_op->umount_begin(sb);
|
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* No sense to grab the lock for this test, but test itself looks
|
|
|
|
|
* somewhat bogus. Suggestions for better replacement?
|
|
|
|
|
* Ho-hum... In principle, we might treat that as umount + switch
|
|
|
|
|
* to rootfs. GC would eventually take care of the old vfsmount.
|
|
|
|
|
* Actually it makes sense, especially if rootfs would contain a
|
|
|
|
|
* /reboot - static binary that would close all descriptors and
|
|
|
|
|
* call reboot(9). Then init(8) could umount root and exec /reboot.
|
|
|
|
|
*/
|
2011-11-24 21:35:16 -05:00
|
|
|
if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
|
* Special case for "unmounting" root ...
|
|
|
|
|
* we just try to remount it readonly.
|
|
|
|
|
*/
|
2017-09-18 17:58:08 -05:00
|
|
|
if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
|
2014-10-08 12:32:47 -07:00
|
|
|
return -EPERM;
|
2018-11-04 09:28:36 -05:00
|
|
|
return do_umount_root(sb);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2013-03-16 15:12:40 -04:00
|
|
|
namespace_lock();
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
2005-04-16 15:20:36 -07:00
|
|
|
|
do_umount(): simplify the "is it still mounted" checks
Calls of do_umount() are always preceded by can_umount(), where we'd
done a racy check for mount belonging to our namespace; if it wasn't,
can_unmount() would've failed with -EINVAL and we wouldn't have
reached do_umount() at all.
That check needs to be redone once we have acquired namespace_sem
and in do_umount() we do that. However, that's done in a very odd
way; we check that mount is still in rbtree of _some_ namespace or
its mnt_list is not empty. It is equivalent to check_mnt(mnt) -
we know that earlier mnt was mounted in our namespace; if it has
stayed there, it's going to remain in rbtree of our namespace.
OTOH, if it ever had been removed from out namespace, it would be
removed from rbtree and it never would've re-added to a namespace
afterwards. As for ->mnt_list, for something that had been mounted
in a namespace we'll never observe non-empty ->mnt_list while holding
namespace_sem - it does temporarily become non-empty during
umount_tree(), but that doesn't outlast the call of umount_tree(),
let alone dropping namespace_sem.
Things get much easier to follow if we replace that with (equivalent)
check_mnt(mnt) there. What's more, currently we treat a failure of
that test as "quietly do nothing"; we might as well pretend that we'd
lost the race and fail on that the same way can_umount() would have.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-04-26 19:17:28 -04:00
|
|
|
/* Repeat the earlier racy checks, now that we are holding the locks */
|
2018-10-22 10:21:38 -05:00
|
|
|
retval = -EINVAL;
|
do_umount(): simplify the "is it still mounted" checks
Calls of do_umount() are always preceded by can_umount(), where we'd
done a racy check for mount belonging to our namespace; if it wasn't,
can_unmount() would've failed with -EINVAL and we wouldn't have
reached do_umount() at all.
That check needs to be redone once we have acquired namespace_sem
and in do_umount() we do that. However, that's done in a very odd
way; we check that mount is still in rbtree of _some_ namespace or
its mnt_list is not empty. It is equivalent to check_mnt(mnt) -
we know that earlier mnt was mounted in our namespace; if it has
stayed there, it's going to remain in rbtree of our namespace.
OTOH, if it ever had been removed from out namespace, it would be
removed from rbtree and it never would've re-added to a namespace
afterwards. As for ->mnt_list, for something that had been mounted
in a namespace we'll never observe non-empty ->mnt_list while holding
namespace_sem - it does temporarily become non-empty during
umount_tree(), but that doesn't outlast the call of umount_tree(),
let alone dropping namespace_sem.
Things get much easier to follow if we replace that with (equivalent)
check_mnt(mnt) there. What's more, currently we treat a failure of
that test as "quietly do nothing"; we might as well pretend that we'd
lost the race and fail on that the same way can_umount() would have.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-04-26 19:17:28 -04:00
|
|
|
if (!check_mnt(mnt))
|
|
|
|
|
goto out;
|
|
|
|
|
|
2018-10-22 10:21:38 -05:00
|
|
|
if (mnt->mnt.mnt_flags & MNT_LOCKED)
|
|
|
|
|
goto out;
|
|
|
|
|
|
don't set MNT_LOCKED on parentless mounts
Originally MNT_LOCKED meant only one thing - "don't let this mount to
be peeled off its parent, we don't want to have its mountpoint exposed".
Accordingly, it had only been set on mounts that *do* have a parent.
Later it got overloaded with another use - setting it on the absolute
root had given free protection against umount(2) of absolute root
(was possible to trigger, oopsed). Not a bad trick, but it ended
up costing more than it bought us. Unfortunately, the cost included
both hard-to-reason-about logics and a subtle race between
mount -o remount,ro and mount --[r]bind - lockless &= ~MNT_LOCKED in
the end of __do_loopback() could race with sb_prepare_remount_readonly()
setting and clearing MNT_HOLD_WRITE (under mount_lock, as it should
be). The race wouldn't be much of a problem (there are other ways to
deal with it), but the subtlety is.
Turns out that nobody except umount(2) had ever made use of having
MNT_LOCKED set on absolute root. So let's give up on that trick,
clever as it had been, add an explicit check in do_umount() and
return to using MNT_LOCKED only for mounts that have a parent.
It means that
* clone_mnt() no longer copies MNT_LOCKED
* copy_tree() sets it on submounts if their counterparts had
been marked such, and does that right next to attach_mnt() in there,
in the same mount_lock scope.
* __do_loopback() no longer needs to strip MNT_LOCKED off the
root of subtree it's about to return; no store, no race.
* init_mount_tree() doesn't bother setting MNT_LOCKED on absolute
root.
* lock_mnt_tree() does not set MNT_LOCKED on the subtree's root;
accordingly, its caller (loop in attach_recursive_mnt()) does not need to
bother stripping that MNT_LOCKED on root. Note that lock_mnt_tree() setting
MNT_LOCKED on submounts happens in the same mount_lock scope as __attach_mnt()
(from commit_tree()) that makes them reachable.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-05-06 18:48:05 -04:00
|
|
|
if (!mnt_has_parent(mnt)) /* not the absolute root */
|
|
|
|
|
goto out;
|
|
|
|
|
|
2018-10-22 10:21:38 -05:00
|
|
|
event++;
|
2013-09-29 22:06:07 -04:00
|
|
|
if (flags & MNT_DETACH) {
|
do_umount(): simplify the "is it still mounted" checks
Calls of do_umount() are always preceded by can_umount(), where we'd
done a racy check for mount belonging to our namespace; if it wasn't,
can_unmount() would've failed with -EINVAL and we wouldn't have
reached do_umount() at all.
That check needs to be redone once we have acquired namespace_sem
and in do_umount() we do that. However, that's done in a very odd
way; we check that mount is still in rbtree of _some_ namespace or
its mnt_list is not empty. It is equivalent to check_mnt(mnt) -
we know that earlier mnt was mounted in our namespace; if it has
stayed there, it's going to remain in rbtree of our namespace.
OTOH, if it ever had been removed from out namespace, it would be
removed from rbtree and it never would've re-added to a namespace
afterwards. As for ->mnt_list, for something that had been mounted
in a namespace we'll never observe non-empty ->mnt_list while holding
namespace_sem - it does temporarily become non-empty during
umount_tree(), but that doesn't outlast the call of umount_tree(),
let alone dropping namespace_sem.
Things get much easier to follow if we replace that with (equivalent)
check_mnt(mnt) there. What's more, currently we treat a failure of
that test as "quietly do nothing"; we might as well pretend that we'd
lost the race and fail on that the same way can_umount() would have.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-04-26 19:17:28 -04:00
|
|
|
umount_tree(mnt, UMOUNT_PROPAGATE);
|
2005-04-16 15:20:36 -07:00
|
|
|
retval = 0;
|
2013-09-29 22:06:07 -04:00
|
|
|
} else {
|
2025-04-28 23:56:14 -04:00
|
|
|
smp_mb(); // paired with __legitimize_mnt()
|
2013-09-29 22:06:07 -04:00
|
|
|
shrink_submounts(mnt);
|
|
|
|
|
retval = -EBUSY;
|
|
|
|
|
if (!propagate_mount_busy(mnt, 2)) {
|
do_umount(): simplify the "is it still mounted" checks
Calls of do_umount() are always preceded by can_umount(), where we'd
done a racy check for mount belonging to our namespace; if it wasn't,
can_unmount() would've failed with -EINVAL and we wouldn't have
reached do_umount() at all.
That check needs to be redone once we have acquired namespace_sem
and in do_umount() we do that. However, that's done in a very odd
way; we check that mount is still in rbtree of _some_ namespace or
its mnt_list is not empty. It is equivalent to check_mnt(mnt) -
we know that earlier mnt was mounted in our namespace; if it has
stayed there, it's going to remain in rbtree of our namespace.
OTOH, if it ever had been removed from out namespace, it would be
removed from rbtree and it never would've re-added to a namespace
afterwards. As for ->mnt_list, for something that had been mounted
in a namespace we'll never observe non-empty ->mnt_list while holding
namespace_sem - it does temporarily become non-empty during
umount_tree(), but that doesn't outlast the call of umount_tree(),
let alone dropping namespace_sem.
Things get much easier to follow if we replace that with (equivalent)
check_mnt(mnt) there. What's more, currently we treat a failure of
that test as "quietly do nothing"; we might as well pretend that we'd
lost the race and fail on that the same way can_umount() would have.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-04-26 19:17:28 -04:00
|
|
|
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
|
2013-09-29 22:06:07 -04:00
|
|
|
retval = 0;
|
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2018-10-22 10:21:38 -05:00
|
|
|
out:
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2013-03-16 14:35:16 -04:00
|
|
|
namespace_unlock();
|
2005-04-16 15:20:36 -07:00
|
|
|
return retval;
|
|
|
|
|
}
|
|
|
|
|
|
2013-10-03 01:31:18 -07:00
|
|
|
/*
|
|
|
|
|
* __detach_mounts - lazily unmount all mounts on the specified dentry
|
|
|
|
|
*
|
|
|
|
|
* During unlink, rmdir, and d_drop it is possible to loose the path
|
|
|
|
|
* to an existing mountpoint, and wind up leaking the mount.
|
|
|
|
|
* detach_mounts allows lazily unmounting those mounts instead of
|
|
|
|
|
* leaking them.
|
|
|
|
|
*
|
|
|
|
|
* The caller may hold dentry->d_inode->i_mutex.
|
|
|
|
|
*/
|
|
|
|
|
void __detach_mounts(struct dentry *dentry)
|
|
|
|
|
{
|
|
|
|
|
struct mountpoint *mp;
|
|
|
|
|
struct mount *mnt;
|
|
|
|
|
|
|
|
|
|
namespace_lock();
|
2017-01-03 14:18:43 +13:00
|
|
|
lock_mount_hash();
|
2013-10-03 01:31:18 -07:00
|
|
|
mp = lookup_mountpoint(dentry);
|
2019-06-29 12:06:51 -04:00
|
|
|
if (!mp)
|
2013-10-03 01:31:18 -07:00
|
|
|
goto out_unlock;
|
|
|
|
|
|
2016-04-15 14:24:41 -07:00
|
|
|
event++;
|
2013-10-03 01:31:18 -07:00
|
|
|
while (!hlist_empty(&mp->m_list)) {
|
|
|
|
|
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
|
2014-12-23 21:37:03 -06:00
|
|
|
if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
|
2015-07-17 14:54:27 -05:00
|
|
|
umount_mnt(mnt);
|
2019-07-04 16:57:51 -04:00
|
|
|
hlist_add_head(&mnt->mnt_umount, &unmounted);
|
2014-12-23 21:37:03 -06:00
|
|
|
}
|
2015-04-01 18:30:06 -05:00
|
|
|
else umount_tree(mnt, UMOUNT_CONNECTED);
|
2013-10-03 01:31:18 -07:00
|
|
|
}
|
|
|
|
|
put_mountpoint(mp);
|
|
|
|
|
out_unlock:
|
2017-01-03 14:18:43 +13:00
|
|
|
unlock_mount_hash();
|
2013-10-03 01:31:18 -07:00
|
|
|
namespace_unlock();
|
|
|
|
|
}
|
|
|
|
|
|
2017-07-04 17:25:09 +01:00
|
|
|
/*
|
2013-02-22 22:45:42 -05:00
|
|
|
* Is the caller allowed to modify his namespace?
|
|
|
|
|
*/
|
2022-03-01 00:05:29 -05:00
|
|
|
bool may_mount(void)
|
2013-02-22 22:45:42 -05:00
|
|
|
{
|
|
|
|
|
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-19 14:56:38 -04:00
|
|
|
/*
 * Warn that the "mand" mount option is deprecated and ignored.
 * pr_warn_once() is used, so the banner is not repeated on later calls.
 */
static void warn_mandlock(void)
{
	pr_warn_once("=======================================================\n"
		     "WARNING: The mand mount option has been deprecated and\n"
		     " is ignored by this kernel. Remove the mand\n"
		     " option from the mount to silence this warning.\n"
		     "=======================================================\n");
}
|
|
|
|
|
|
2020-08-06 16:07:10 +02:00
|
|
|
static int can_umount(const struct path *path, int flags)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2020-08-06 16:07:10 +02:00
|
|
|
struct mount *mnt = real_mount(path->mnt);
|
2025-03-18 12:29:21 -04:00
|
|
|
struct super_block *sb = path->dentry->d_sb;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2013-02-22 22:45:42 -05:00
|
|
|
if (!may_mount())
|
|
|
|
|
return -EPERM;
|
2023-05-03 13:18:39 +02:00
|
|
|
if (!path_mounted(path))
|
2020-08-06 16:07:10 +02:00
|
|
|
return -EINVAL;
|
2011-11-25 00:46:35 -05:00
|
|
|
if (!check_mnt(mnt))
|
2020-08-06 16:07:10 +02:00
|
|
|
return -EINVAL;
|
2018-10-22 10:21:38 -05:00
|
|
|
if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
|
2020-08-06 16:07:10 +02:00
|
|
|
return -EINVAL;
|
2025-03-18 12:29:21 -04:00
|
|
|
if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
|
2020-08-06 16:07:10 +02:00
|
|
|
return -EPERM;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-04 15:25:34 -05:00
|
|
|
// caller is responsible for flags being sane
|
2020-08-06 16:07:10 +02:00
|
|
|
int path_umount(struct path *path, int flags)
|
|
|
|
|
{
|
|
|
|
|
struct mount *mnt = real_mount(path->mnt);
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
ret = can_umount(path, flags);
|
|
|
|
|
if (!ret)
|
|
|
|
|
ret = do_umount(mnt, flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-02-14 19:34:31 -08:00
|
|
|
/* we mustn't call path_put() as that would clear mnt_expiry_mark */
|
2020-07-21 10:54:34 +02:00
|
|
|
dput(path->dentry);
|
2011-11-25 00:33:11 -05:00
|
|
|
mntput_no_expire(mnt);
|
2020-08-06 16:07:10 +02:00
|
|
|
return ret;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2020-07-23 08:23:08 +02:00
|
|
|
/*
 * Common implementation behind the umount system calls.
 *
 * @name:  user-space pointer to the path to unmount
 * @flags: any combination of MNT_FORCE, MNT_DETACH, MNT_EXPIRE and
 *         UMOUNT_NOFOLLOW; anything else yields -EINVAL
 *
 * The path is looked up relative to AT_FDCWD with LOOKUP_MOUNTPOINT, and
 * with LOOKUP_FOLLOW as well unless UMOUNT_NOFOLLOW was given.  A lookup
 * failure is returned as is; otherwise the result of path_umount() is.
 */
static int ksys_umount(char __user *name, int flags)
{
	const int valid = MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW;
	struct path path;
	int lookup_flags;
	int ret;

	// basic validity checks done first
	if (flags & ~valid)
		return -EINVAL;

	lookup_flags = (flags & UMOUNT_NOFOLLOW) ?
			LOOKUP_MOUNTPOINT :
			LOOKUP_MOUNTPOINT | LOOKUP_FOLLOW;

	ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
	return ret ? ret : path_umount(&path, flags);
}
|
|
|
|
|
|
2018-03-11 11:34:40 +01:00
|
|
|
/* umount system call: hands @name and @flags straight to ksys_umount(). */
SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
	return ksys_umount(name, flags);
}
|
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 *	The 2.0 compatible umount. No flags.
 *
 *	Only built when the architecture defines __ARCH_WANT_SYS_OLDUMOUNT;
 *	equivalent to ksys_umount() with flags == 0.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
	return ksys_umount(name, 0);
}

#endif
|
|
|
|
|
|
2013-03-30 01:35:18 -07:00
|
|
|
static bool is_mnt_ns_file(struct dentry *dentry)
|
2010-03-07 18:49:36 -08:00
|
|
|
{
|
2024-12-11 13:11:17 +01:00
|
|
|
struct ns_common *ns;
|
|
|
|
|
|
2013-03-30 01:35:18 -07:00
|
|
|
/* Is this a proxy for a mount namespace? */
|
2024-12-11 13:11:17 +01:00
|
|
|
if (dentry->d_op != &ns_dentry_operations)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
ns = d_inode(dentry)->i_private;
|
|
|
|
|
|
|
|
|
|
return ns->ops == &mntns_operations;
|
2013-03-30 01:35:18 -07:00
|
|
|
}
|
|
|
|
|
|
2024-07-19 13:41:52 +02:00
|
|
|
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
|
2014-11-01 00:00:23 -04:00
|
|
|
{
|
2024-07-19 13:41:52 +02:00
|
|
|
return &mnt->ns;
|
2014-11-01 00:00:23 -04:00
|
|
|
}
|
|
|
|
|
|
2024-12-13 00:03:44 +01:00
|
|
|
/*
 * Step along mnt_ns_list starting from @mntns - backwards if @previous is
 * true, forwards otherwise - and return the first mount namespace over
 * which the caller has CAP_SYS_ADMIN (checked without audit) and whose
 * ns.count could still be raised from non-zero.
 *
 * Returns ERR_PTR(-ENOENT) once the list head is reached.  The walk is
 * done under an RCU guard; on success the returned namespace carries the
 * reference taken here.
 */
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
	guard(rcu)();

	for (;;) {
		struct list_head *pos;

		pos = previous ?
			rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)) :
			rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
		if (list_is_head(pos, &mnt_ns_list))
			return ERR_PTR(-ENOENT);

		mntns = list_entry_rcu(pos, struct mnt_namespace, mnt_ns_list);

		/*
		 * Two filters, applied in this order:
		 *
		 * - The capability check.  The last passive reference count
		 *   is put with RCU delay so accessing the mount namespace
		 *   is not just safe but all relevant members are still
		 *   valid.
		 *
		 * - Taking an active reference.  We need one as we're
		 *   persisting the mount namespace and it might already be
		 *   on its deathbed.
		 *
		 * A namespace failing either one is skipped.
		 */
		if (ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN) &&
		    refcount_inc_not_zero(&mntns->ns.count))
			return mntns;
	}
}
|
|
|
|
|
|
2025-01-29 17:58:00 +01:00
|
|
|
struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry)
|
|
|
|
|
{
|
|
|
|
|
if (!is_mnt_ns_file(dentry))
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
return to_mnt_ns(get_proc_ns(dentry->d_inode));
|
|
|
|
|
}
|
|
|
|
|
|
2013-03-30 01:35:18 -07:00
|
|
|
static bool mnt_ns_loop(struct dentry *dentry)
|
|
|
|
|
{
|
|
|
|
|
/* Could bind mounting the mount namespace inode cause a
|
|
|
|
|
* mount namespace loop?
|
|
|
|
|
*/
|
2025-01-29 17:58:00 +01:00
|
|
|
struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);
|
|
|
|
|
|
|
|
|
|
if (!mnt_ns)
|
2013-03-30 01:35:18 -07:00
|
|
|
return false;
|
|
|
|
|
|
2010-03-07 18:49:36 -08:00
|
|
|
return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-07 01:39:12 +08:00
|
|
|
struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
|
2005-11-07 17:17:22 -05:00
|
|
|
int flag)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2024-06-07 01:39:12 +08:00
|
|
|
struct mount *res, *src_parent, *src_root_child, *src_mnt,
|
|
|
|
|
*dst_parent, *dst_mnt;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2024-06-07 01:39:12 +08:00
|
|
|
if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root))
|
2013-03-30 01:35:18 -07:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
|
|
|
|
if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
|
2012-06-25 12:55:18 +01:00
|
|
|
return ERR_PTR(-EINVAL);
|
2005-11-07 17:21:20 -05:00
|
|
|
|
2024-06-07 01:39:12 +08:00
|
|
|
res = dst_mnt = clone_mnt(src_root, dentry, flag);
|
|
|
|
|
if (IS_ERR(dst_mnt))
|
|
|
|
|
return dst_mnt;
|
2012-06-25 12:55:18 +01:00
|
|
|
|
2024-06-07 01:39:12 +08:00
|
|
|
src_parent = src_root;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2024-06-07 01:39:12 +08:00
|
|
|
list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
|
|
|
|
|
if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
|
2005-04-16 15:20:36 -07:00
|
|
|
continue;
|
|
|
|
|
|
2024-06-07 01:39:12 +08:00
|
|
|
for (src_mnt = src_root_child; src_mnt;
|
|
|
|
|
src_mnt = next_mnt(src_mnt, src_root_child)) {
|
2013-03-30 01:35:18 -07:00
|
|
|
if (!(flag & CL_COPY_UNBINDABLE) &&
|
2024-06-07 01:39:12 +08:00
|
|
|
IS_MNT_UNBINDABLE(src_mnt)) {
|
|
|
|
|
if (src_mnt->mnt.mnt_flags & MNT_LOCKED) {
|
2018-10-25 09:04:18 -05:00
|
|
|
/* Both unbindable and locked. */
|
2024-06-07 01:39:12 +08:00
|
|
|
dst_mnt = ERR_PTR(-EPERM);
|
2018-10-25 09:04:18 -05:00
|
|
|
goto out;
|
|
|
|
|
} else {
|
2024-06-07 01:39:12 +08:00
|
|
|
src_mnt = skip_mnt_tree(src_mnt);
|
2018-10-25 09:04:18 -05:00
|
|
|
continue;
|
|
|
|
|
}
|
2013-03-30 01:35:18 -07:00
|
|
|
}
|
|
|
|
|
if (!(flag & CL_COPY_MNT_NS_FILE) &&
|
2024-06-07 01:39:12 +08:00
|
|
|
is_mnt_ns_file(src_mnt->mnt.mnt_root)) {
|
|
|
|
|
src_mnt = skip_mnt_tree(src_mnt);
|
2005-11-07 17:21:20 -05:00
|
|
|
continue;
|
|
|
|
|
}
|
2024-06-07 01:39:12 +08:00
|
|
|
while (src_parent != src_mnt->mnt_parent) {
|
|
|
|
|
src_parent = src_parent->mnt_parent;
|
|
|
|
|
dst_mnt = dst_mnt->mnt_parent;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2024-06-07 01:39:12 +08:00
|
|
|
|
|
|
|
|
src_parent = src_mnt;
|
|
|
|
|
dst_parent = dst_mnt;
|
|
|
|
|
dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag);
|
|
|
|
|
if (IS_ERR(dst_mnt))
|
2012-06-25 12:55:18 +01:00
|
|
|
goto out;
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
don't set MNT_LOCKED on parentless mounts
Originally MNT_LOCKED meant only one thing - "don't let this mount to
be peeled off its parent, we don't want to have its mountpoint exposed".
Accordingly, it had only been set on mounts that *do* have a parent.
Later it got overloaded with another use - setting it on the absolute
root had given free protection against umount(2) of absolute root
(was possible to trigger, oopsed). Not a bad trick, but it ended
up costing more than it bought us. Unfortunately, the cost included
both hard-to-reason-about logics and a subtle race between
mount -o remount,ro and mount --[r]bind - lockless &= ~MNT_LOCKED in
the end of __do_loopback() could race with sb_prepare_remount_readonly()
setting and clearing MNT_HOLD_WRITE (under mount_lock, as it should
be). The race wouldn't be much of a problem (there are other ways to
deal with it), but the subtlety is.
Turns out that nobody except umount(2) had ever made use of having
MNT_LOCKED set on absolute root. So let's give up on that trick,
clever as it had been, add an explicit check in do_umount() and
return to using MNT_LOCKED only for mounts that have a parent.
It means that
* clone_mnt() no longer copies MNT_LOCKED
* copy_tree() sets it on submounts if their counterparts had
been marked such, and does that right next to attach_mnt() in there,
in the same mount_lock scope.
* __do_loopback() no longer needs to strip MNT_LOCKED off the
root of subtree it's about to return; no store, no race.
* init_mount_tree() doesn't bother setting MNT_LOCKED on absolute
root.
* lock_mnt_tree() does not set MNT_LOCKED on the subtree's root;
accordingly, its caller (loop in attach_recursive_mnt()) does not need to
bother stripping that MNT_LOCKED on root. Note that lock_mnt_tree() setting
MNT_LOCKED on submounts happens in the same mount_lock scope as __attach_mnt()
(from commit_tree()) that makes them reachable.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-05-06 18:48:05 -04:00
|
|
|
if (src_mnt->mnt.mnt_flags & MNT_LOCKED)
|
|
|
|
|
dst_mnt->mnt.mnt_flags |= MNT_LOCKED;
|
2025-05-01 20:40:57 -04:00
|
|
|
if (unlikely(flag & CL_EXPIRE)) {
|
|
|
|
|
/* stick the duplicate mount on the same expiry
|
|
|
|
|
* list as the original if that was on one */
|
|
|
|
|
if (!list_empty(&src_mnt->mnt_expire))
|
|
|
|
|
list_add(&dst_mnt->mnt_expire,
|
|
|
|
|
&src_mnt->mnt_expire);
|
|
|
|
|
}
|
2024-06-07 01:39:12 +08:00
|
|
|
list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
|
2025-04-25 12:40:28 -04:00
|
|
|
attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp);
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return res;
|
2024-06-07 01:39:12 +08:00
|
|
|
|
2012-06-25 12:55:18 +01:00
|
|
|
out:
|
2005-04-16 15:20:36 -07:00
|
|
|
if (res) {
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
2014-12-24 07:20:01 -06:00
|
|
|
umount_tree(res, UMOUNT_SYNC);
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2024-06-07 01:39:12 +08:00
|
|
|
return dst_mnt;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
replace collect_mounts()/drop_collected_mounts() with a safer variant
collect_mounts() has several problems - one can't iterate over the results
directly, so it has to be done with callback passed to iterate_mounts();
it has an oopsable race with d_invalidate(); it creates temporary clones
of mounts invisibly for sync umount (IOW, you can have non-lazy umount
succeed leaving filesystem not mounted anywhere and yet still busy).
A saner approach is to give caller an array of struct path that would pin
every mount in a subtree, without cloning any mounts.
* collect_mounts()/drop_collected_mounts()/iterate_mounts() is gone
* collect_paths(where, preallocated, size) gives either ERR_PTR(-E...) or
a pointer to array of struct path, one for each chunk of tree visible under
'where' (i.e. the first element is a copy of where, followed by (mount,root)
for everything mounted under it - the same set collect_mounts() would give).
Unlike collect_mounts(), the mounts are *not* cloned - we just get pinning
references to the roots of subtrees in the caller's namespace.
Array is terminated by {NULL, NULL} struct path. If it fits into
preallocated array (on-stack, normally), that's where it goes; otherwise
it's allocated by kmalloc_array(). Passing 0 as size means that 'preallocated'
is ignored (and expected to be NULL).
* drop_collected_paths(paths, preallocated) is given the array returned
by an earlier call of collect_paths() and the preallocated array passed to that
call. All mount/dentry references are dropped and array is kfree'd if it's not
equal to 'preallocated'.
* instead of iterate_mounts(), users should just iterate over array
of struct path - nothing exotic is needed for that. Existing users (all in
audit_tree.c) are converted.
[folded a fix for braino reported by Venkat Rao Bagalkote <venkat88@linux.ibm.com>]
Fixes: 80b5dce8c59b0 ("vfs: Add a function to lazily unmount all mounts from any dentry")
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-06-17 00:09:51 -04:00
|
|
|
static inline bool extend_array(struct path **res, struct path **to_free,
|
|
|
|
|
unsigned n, unsigned *count, unsigned new_count)
|
|
|
|
|
{
|
|
|
|
|
struct path *p;
|
|
|
|
|
|
|
|
|
|
if (likely(n < *count))
|
|
|
|
|
return true;
|
|
|
|
|
p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL);
|
|
|
|
|
if (p && *count)
|
|
|
|
|
memcpy(p, *res, *count * sizeof(struct path));
|
|
|
|
|
*count = new_count;
|
|
|
|
|
kfree(*to_free);
|
|
|
|
|
*to_free = *res = p;
|
|
|
|
|
return p;
|
|
|
|
|
}
|
2012-06-25 12:55:18 +01:00
|
|
|
|
replace collect_mounts()/drop_collected_mounts() with a safer variant
collect_mounts() has several problems - one can't iterate over the results
directly, so it has to be done with callback passed to iterate_mounts();
it has an oopsable race with d_invalidate(); it creates temporary clones
of mounts invisibly for sync umount (IOW, you can have non-lazy umount
succeed leaving filesystem not mounted anywhere and yet still busy).
A saner approach is to give caller an array of struct path that would pin
every mount in a subtree, without cloning any mounts.
* collect_mounts()/drop_collected_mounts()/iterate_mounts() is gone
* collect_paths(where, preallocated, size) gives either ERR_PTR(-E...) or
a pointer to array of struct path, one for each chunk of tree visible under
'where' (i.e. the first element is a copy of where, followed by (mount,root)
for everything mounted under it - the same set collect_mounts() would give).
Unlike collect_mounts(), the mounts are *not* cloned - we just get pinning
references to the roots of subtrees in the caller's namespace.
Array is terminated by {NULL, NULL} struct path. If it fits into
preallocated array (on-stack, normally), that's where it goes; otherwise
it's allocated by kmalloc_array(). Passing 0 as size means that 'preallocated'
is ignored (and expected to be NULL).
* drop_collected_paths(paths, preallocated) is given the array returned
by an earlier call of collect_paths() and the preallocated array passed to that
call. All mount/dentry references are dropped and array is kfree'd if it's not
equal to 'preallocated'.
* instead of iterate_mounts(), users should just iterate over array
of struct path - nothing exotic is needed for that. Existing users (all in
audit_tree.c) are converted.
[folded a fix for braino reported by Venkat Rao Bagalkote <venkat88@linux.ibm.com>]
Fixes: 80b5dce8c59b0 ("vfs: Add a function to lazily unmount all mounts from any dentry")
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-06-17 00:09:51 -04:00
|
|
|
struct path *collect_paths(const struct path *path,
|
|
|
|
|
struct path *prealloc, unsigned count)
|
2007-06-07 12:20:32 -04:00
|
|
|
{
|
replace collect_mounts()/drop_collected_mounts() with a safer variant
collect_mounts() has several problems - one can't iterate over the results
directly, so it has to be done with callback passed to iterate_mounts();
it has an oopsable race with d_invalidate(); it creates temporary clones
of mounts invisibly for sync umount (IOW, you can have non-lazy umount
succeed leaving filesystem not mounted anywhere and yet still busy).
A saner approach is to give caller an array of struct path that would pin
every mount in a subtree, without cloning any mounts.
* collect_mounts()/drop_collected_mounts()/iterate_mounts() is gone
* collect_paths(where, preallocated, size) gives either ERR_PTR(-E...) or
a pointer to array of struct path, one for each chunk of tree visible under
'where' (i.e. the first element is a copy of where, followed by (mount,root)
for everything mounted under it - the same set collect_mounts() would give).
Unlike collect_mounts(), the mounts are *not* cloned - we just get pinning
references to the roots of subtrees in the caller's namespace.
Array is terminated by {NULL, NULL} struct path. If it fits into
preallocated array (on-stack, normally), that's where it goes; otherwise
it's allocated by kmalloc_array(). Passing 0 as size means that 'preallocated'
is ignored (and expected to be NULL).
* drop_collected_paths(paths, preallocated) is given the array returned
by an earlier call of collect_paths() and the preallocated array passed to that
call. All mount/dentry references are dropped and array is kfree'd if it's not
equal to 'preallocated'.
* instead of iterate_mounts(), users should just iterate over array
of struct path - nothing exotic is needed for that. Existing users (all in
audit_tree.c) are converted.
[folded a fix for braino reported by Venkat Rao Bagalkote <venkat88@linux.ibm.com>]
Fixes: 80b5dce8c59b0 ("vfs: Add a function to lazily unmount all mounts from any dentry")
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-06-17 00:09:51 -04:00
|
|
|
struct mount *root = real_mount(path->mnt);
|
|
|
|
|
struct mount *child;
|
|
|
|
|
struct path *res = prealloc, *to_free = NULL;
|
|
|
|
|
unsigned n = 0;
|
|
|
|
|
|
|
|
|
|
guard(rwsem_read)(&namespace_sem);
|
|
|
|
|
|
|
|
|
|
if (!check_mnt(root))
|
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
if (!extend_array(&res, &to_free, 0, &count, 32))
|
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
res[n++] = *path;
|
|
|
|
|
list_for_each_entry(child, &root->mnt_mounts, mnt_child) {
|
|
|
|
|
if (!is_subdir(child->mnt_mountpoint, path->dentry))
|
|
|
|
|
continue;
|
|
|
|
|
for (struct mount *m = child; m; m = next_mnt(m, child)) {
|
|
|
|
|
if (!extend_array(&res, &to_free, n, &count, 2 * count))
|
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
res[n].mnt = &m->mnt;
|
|
|
|
|
res[n].dentry = m->mnt.mnt_root;
|
|
|
|
|
n++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (!extend_array(&res, &to_free, n, &count, count + 1))
|
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
memset(res + n, 0, (count - n) * sizeof(struct path));
|
|
|
|
|
for (struct path *p = res; p->mnt; p++)
|
|
|
|
|
path_get(p);
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void drop_collected_paths(struct path *paths, struct path *prealloc)
|
|
|
|
|
{
|
|
|
|
|
for (struct path *p = paths; p->mnt; p++)
|
|
|
|
|
path_put(p);
|
|
|
|
|
if (paths != prealloc)
|
|
|
|
|
kfree(paths);
|
2007-06-07 12:20:32 -04:00
|
|
|
}
|
|
|
|
|
|
2018-11-05 17:40:30 +00:00
|
|
|
static void free_mnt_ns(struct mnt_namespace *);
|
|
|
|
|
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
|
|
|
|
|
|
|
|
|
|
void dissolve_on_fput(struct vfsmount *mnt)
|
|
|
|
|
{
|
|
|
|
|
struct mnt_namespace *ns;
|
2025-02-21 14:13:03 +01:00
|
|
|
struct mount *m = real_mount(mnt);
|
|
|
|
|
|
2025-06-08 23:41:23 -04:00
|
|
|
/*
|
|
|
|
|
* m used to be the root of anon namespace; if it still is one,
|
|
|
|
|
* we need to dissolve the mount tree and free that namespace.
|
|
|
|
|
* Let's try to avoid taking namespace_sem if we can determine
|
|
|
|
|
* that there's nothing to do without it - rcu_read_lock() is
|
|
|
|
|
* enough to make anon_ns_root() memory-safe and once m has
|
|
|
|
|
* left its namespace, it's no longer our concern, since it will
|
|
|
|
|
* never become a root of anon ns again.
|
|
|
|
|
*/
|
|
|
|
|
|
2025-02-21 14:13:03 +01:00
|
|
|
scoped_guard(rcu) {
|
2025-06-08 23:41:23 -04:00
|
|
|
if (!anon_ns_root(m))
|
2025-02-21 14:13:03 +01:00
|
|
|
return;
|
2018-11-05 17:40:31 +00:00
|
|
|
}
|
2025-02-21 14:13:03 +01:00
|
|
|
|
2025-04-10 17:05:42 +02:00
|
|
|
scoped_guard(namespace_lock, &namespace_sem) {
|
2025-06-08 23:41:23 -04:00
|
|
|
if (!anon_ns_root(m))
|
2025-02-21 14:13:08 +01:00
|
|
|
return;
|
|
|
|
|
|
2025-06-08 23:41:23 -04:00
|
|
|
ns = m->mnt_ns;
|
2025-02-21 14:13:03 +01:00
|
|
|
lock_mount_hash();
|
|
|
|
|
umount_tree(m, UMOUNT_CONNECTED);
|
|
|
|
|
unlock_mount_hash();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Make sure we notice when we leak mounts. */
|
|
|
|
|
VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
|
|
|
|
|
free_mnt_ns(ns);
|
2018-11-05 17:40:30 +00:00
|
|
|
}
|
|
|
|
|
|
2025-06-01 14:23:52 -04:00
|
|
|
static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
|
2021-08-09 10:19:47 +02:00
|
|
|
{
|
|
|
|
|
struct mount *child;
|
|
|
|
|
|
|
|
|
|
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
|
|
|
|
|
if (!is_subdir(child->mnt_mountpoint, dentry))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (child->mnt.mnt_flags & MNT_LOCKED)
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2025-06-01 14:23:52 -04:00
|
|
|
bool has_locked_children(struct mount *mnt, struct dentry *dentry)
|
|
|
|
|
{
|
|
|
|
|
bool res;
|
|
|
|
|
|
|
|
|
|
read_seqlock_excl(&mount_lock);
|
|
|
|
|
res = __has_locked_children(mnt, dentry);
|
|
|
|
|
read_sequnlock_excl(&mount_lock);
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-23 20:19:48 +01:00
|
|
|
/*
|
|
|
|
|
* Check that there aren't references to earlier/same mount namespaces in the
|
|
|
|
|
* specified subtree. Such references can act as pins for mount namespaces
|
|
|
|
|
* that aren't checked by the mount-cycle checking code, thereby allowing
|
|
|
|
|
* cycles to be made.
|
|
|
|
|
*/
|
|
|
|
|
static bool check_for_nsfs_mounts(struct mount *subtree)
|
|
|
|
|
{
|
|
|
|
|
struct mount *p;
|
|
|
|
|
bool ret = false;
|
|
|
|
|
|
|
|
|
|
lock_mount_hash();
|
|
|
|
|
for (p = subtree; p; p = next_mnt(p, subtree))
|
|
|
|
|
if (mnt_ns_loop(p->mnt.mnt_root))
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
ret = true;
|
|
|
|
|
out:
|
|
|
|
|
unlock_mount_hash();
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2014-10-24 00:14:36 +02:00
|
|
|
/**
|
|
|
|
|
* clone_private_mount - create a private clone of a path
|
2021-03-17 19:52:25 -07:00
|
|
|
* @path: path to clone
|
2014-10-24 00:14:36 +02:00
|
|
|
*
|
2021-03-17 19:52:25 -07:00
|
|
|
* This creates a new vfsmount, which will be the clone of @path. The new mount
|
|
|
|
|
* will not be attached anywhere in the namespace and will be private (i.e.
|
|
|
|
|
* changes to the originating mount won't be propagated into this).
|
2014-10-24 00:14:36 +02:00
|
|
|
*
|
2025-01-23 20:19:48 +01:00
|
|
|
* This assumes caller has called or done the equivalent of may_mount().
|
|
|
|
|
*
|
2014-10-24 00:14:36 +02:00
|
|
|
* Release with mntput().
|
|
|
|
|
*/
|
2016-11-20 19:45:28 -05:00
|
|
|
struct vfsmount *clone_private_mount(const struct path *path)
|
2014-10-24 00:14:36 +02:00
|
|
|
{
|
|
|
|
|
struct mount *old_mnt = real_mount(path->mnt);
|
|
|
|
|
struct mount *new_mnt;
|
|
|
|
|
|
2025-04-03 16:43:50 +02:00
|
|
|
guard(rwsem_read)(&namespace_sem);
|
|
|
|
|
|
2014-10-24 00:14:36 +02:00
|
|
|
if (IS_MNT_UNBINDABLE(old_mnt))
|
2025-01-23 20:19:48 +01:00
|
|
|
return ERR_PTR(-EINVAL);
|
2021-08-09 10:19:47 +02:00
|
|
|
|
2025-05-15 12:18:30 +00:00
|
|
|
/*
|
|
|
|
|
* Make sure the source mount is acceptable.
|
|
|
|
|
* Anything mounted in our mount namespace is allowed.
|
|
|
|
|
* Otherwise, it must be the root of an anonymous mount
|
|
|
|
|
* namespace, and we need to make sure no namespace
|
|
|
|
|
* loops get created.
|
|
|
|
|
*/
|
|
|
|
|
if (!check_mnt(old_mnt)) {
|
2025-06-08 23:25:36 -04:00
|
|
|
if (!anon_ns_root(old_mnt))
|
2025-01-23 20:19:48 +01:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
|
|
|
|
if (!check_for_nsfs_mounts(old_mnt))
|
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
}
|
2021-08-09 10:19:47 +02:00
|
|
|
|
2025-06-01 20:11:06 -04:00
|
|
|
if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
|
|
|
|
|
return ERR_PTR(-EPERM);
|
|
|
|
|
|
2025-06-01 14:23:52 -04:00
|
|
|
if (__has_locked_children(old_mnt, path->dentry))
|
2025-01-23 20:19:48 +01:00
|
|
|
return ERR_PTR(-EINVAL);
|
2014-10-24 00:14:36 +02:00
|
|
|
|
|
|
|
|
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
|
|
|
|
|
if (IS_ERR(new_mnt))
|
2025-01-23 20:19:48 +01:00
|
|
|
return ERR_PTR(-EINVAL);
|
2014-10-24 00:14:36 +02:00
|
|
|
|
2020-06-04 10:48:19 +02:00
|
|
|
/* Longterm mount to be removed by kern_unmount*() */
|
|
|
|
|
new_mnt->mnt_ns = MNT_NS_INTERNAL;
|
2014-10-24 00:14:36 +02:00
|
|
|
return &new_mnt->mnt;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL_GPL(clone_private_mount);
|
|
|
|
|
|
2019-01-30 13:15:45 -05:00
|
|
|
static void lock_mnt_tree(struct mount *mnt)
|
|
|
|
|
{
|
|
|
|
|
struct mount *p;
|
|
|
|
|
|
|
|
|
|
for (p = mnt; p; p = next_mnt(p, mnt)) {
|
|
|
|
|
int flags = p->mnt.mnt_flags;
|
|
|
|
|
/* Don't allow unprivileged users to change mount flags */
|
|
|
|
|
flags |= MNT_LOCK_ATIME;
|
|
|
|
|
|
|
|
|
|
if (flags & MNT_READONLY)
|
|
|
|
|
flags |= MNT_LOCK_READONLY;
|
|
|
|
|
|
|
|
|
|
if (flags & MNT_NODEV)
|
|
|
|
|
flags |= MNT_LOCK_NODEV;
|
|
|
|
|
|
|
|
|
|
if (flags & MNT_NOSUID)
|
|
|
|
|
flags |= MNT_LOCK_NOSUID;
|
|
|
|
|
|
|
|
|
|
if (flags & MNT_NOEXEC)
|
|
|
|
|
flags |= MNT_LOCK_NOEXEC;
|
|
|
|
|
/* Don't allow unprivileged users to reveal what is under a mount */
|
don't set MNT_LOCKED on parentless mounts
Originally MNT_LOCKED meant only one thing - "don't let this mount to
be peeled off its parent, we don't want to have its mountpoint exposed".
Accordingly, it had only been set on mounts that *do* have a parent.
Later it got overloaded with another use - setting it on the absolute
root had given free protection against umount(2) of absolute root
(was possible to trigger, oopsed). Not a bad trick, but it ended
up costing more than it bought us. Unfortunately, the cost included
both hard-to-reason-about logics and a subtle race between
mount -o remount,ro and mount --[r]bind - lockless &= ~MNT_LOCKED in
the end of __do_loopback() could race with sb_prepare_remount_readonly()
setting and clearing MNT_HOLD_WRITE (under mount_lock, as it should
be). The race wouldn't be much of a problem (there are other ways to
deal with it), but the subtlety is.
Turns out that nobody except umount(2) had ever made use of having
MNT_LOCKED set on absolute root. So let's give up on that trick,
clever as it had been, add an explicit check in do_umount() and
return to using MNT_LOCKED only for mounts that have a parent.
It means that
* clone_mnt() no longer copies MNT_LOCKED
* copy_tree() sets it on submounts if their counterparts had
been marked such, and does that right next to attach_mnt() in there,
in the same mount_lock scope.
* __do_loopback() no longer needs to strip MNT_LOCKED off the
root of subtree it's about to return; no store, no race.
* init_mount_tree() doesn't bother setting MNT_LOCKED on absolute
root.
* lock_mnt_tree() does not set MNT_LOCKED on the subtree's root;
accordingly, its caller (loop in attach_recursive_mnt()) does not need to
bother stripping that MNT_LOCKED on root. Note that lock_mnt_tree() setting
MNT_LOCKED on submounts happens in the same mount_lock scope as __attach_mnt()
(from commit_tree()) that makes them reachable.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-05-06 18:48:05 -04:00
|
|
|
if (list_empty(&p->mnt_expire) && p != mnt)
|
2019-01-30 13:15:45 -05:00
|
|
|
flags |= MNT_LOCKED;
|
|
|
|
|
p->mnt.mnt_flags = flags;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-11-24 19:54:23 -05:00
|
|
|
static void cleanup_group_ids(struct mount *mnt, struct mount *end)
|
2008-03-27 13:06:23 +01:00
|
|
|
{
|
2011-11-24 18:57:30 -05:00
|
|
|
struct mount *p;
|
2008-03-27 13:06:23 +01:00
|
|
|
|
2011-11-25 03:06:56 -05:00
|
|
|
for (p = mnt; p != end; p = next_mnt(p, mnt)) {
|
2011-11-25 01:05:37 -05:00
|
|
|
if (p->mnt_group_id && !IS_MNT_SHARED(p))
|
2011-11-24 19:54:23 -05:00
|
|
|
mnt_release_group_id(p);
|
2008-03-27 13:06:23 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-11-24 19:54:23 -05:00
|
|
|
/*
 * Allocate a group id for @mnt - and, if @recurse, for every mount in the
 * tree below it - wherever a mount has no group id yet and is not shared.
 *
 * Returns 0 on success.  If an allocation fails, the ids handed out to the
 * mounts visited before the failing one are given back through
 * cleanup_group_ids() and the allocation error is returned.
 */
static int invent_group_ids(struct mount *mnt, bool recurse)
{
	struct mount *cur = mnt;

	while (cur) {
		if (!cur->mnt_group_id && !IS_MNT_SHARED(cur)) {
			int err = mnt_alloc_group_id(cur);

			if (err) {
				cleanup_group_ids(mnt, cur);
				return err;
			}
		}

		/* Without @recurse only the root itself is handled. */
		if (!recurse)
			break;
		cur = next_mnt(cur, mnt);
	}

	return 0;
}
|
|
|
|
|
|
2016-09-28 00:27:17 -05:00
|
|
|
int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
|
|
|
|
|
{
|
|
|
|
|
unsigned int max = READ_ONCE(sysctl_mount_max);
|
2022-02-13 22:42:30 -05:00
|
|
|
unsigned int mounts = 0;
|
2016-09-28 00:27:17 -05:00
|
|
|
struct mount *p;
|
|
|
|
|
|
2023-10-25 16:02:00 +02:00
|
|
|
if (ns->nr_mounts >= max)
|
2022-02-13 22:42:30 -05:00
|
|
|
return -ENOSPC;
|
2023-10-25 16:02:00 +02:00
|
|
|
max -= ns->nr_mounts;
|
2022-02-13 22:42:30 -05:00
|
|
|
if (ns->pending_mounts >= max)
|
|
|
|
|
return -ENOSPC;
|
|
|
|
|
max -= ns->pending_mounts;
|
|
|
|
|
|
2016-09-28 00:27:17 -05:00
|
|
|
for (p = mnt; p; p = next_mnt(p, mnt))
|
|
|
|
|
mounts++;
|
|
|
|
|
|
2022-02-13 22:42:30 -05:00
|
|
|
if (mounts > max)
|
2016-09-28 00:27:17 -05:00
|
|
|
return -ENOSPC;
|
|
|
|
|
|
2022-02-13 22:42:30 -05:00
|
|
|
ns->pending_mounts += mounts;
|
2016-09-28 00:27:17 -05:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a mount
beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
enum mnt_tree_flags_t {
|
2025-04-25 22:54:56 -04:00
|
|
|
MNT_TREE_BENEATH = BIT(0),
|
|
|
|
|
MNT_TREE_PROPAGATION = BIT(1),
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
mount beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @mnt_to is moved to a different mountpoint while acquiring
@namespace_sem to lock @mnt_to.
(3) if @mnt_to is unmounted while acquiring @namespace_sem to lock
@mnt_to.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* attach_recursive_mnt - attach a source mount tree
|
|
|
|
|
* @source_mnt: mount tree to be attached
|
2025-04-25 22:49:47 -04:00
|
|
|
* @dest_mnt: mount that @source_mnt will be mounted on
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
mount beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @mnt_to is moved to a different mountpoint while acquiring
@namespace_sem to lock @mnt_to.
(3) if @mnt_to is unmounted while acquiring @namespace_sem to lock
@mnt_to.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
* @dest_mp: the mountpoint @source_mnt will be mounted at
|
2005-11-07 17:19:50 -05:00
|
|
|
*
|
|
|
|
|
* NOTE: the table below explains the semantics when a source mount
|
|
|
|
|
* of a given type is attached to a destination mount of a given type.
|
2005-11-07 17:21:20 -05:00
|
|
|
* ---------------------------------------------------------------------------
|
|
|
|
|
* | BIND MOUNT OPERATION |
|
|
|
|
|
* |**************************************************************************
|
|
|
|
|
* | source-->| shared | private | slave | unbindable |
|
|
|
|
|
* | dest | | | | |
|
|
|
|
|
* | | | | | | |
|
|
|
|
|
* | v | | | | |
|
|
|
|
|
* |**************************************************************************
|
|
|
|
|
* | shared | shared (++) | shared (+) | shared(+++)| invalid |
|
|
|
|
|
* | | | | | |
|
|
|
|
|
* |non-shared| shared (+) | private | slave (*) | invalid |
|
|
|
|
|
* ***************************************************************************
|
2005-11-07 17:19:50 -05:00
|
|
|
* A bind operation clones the source mount and mounts the clone on the
|
|
|
|
|
* destination mount.
|
|
|
|
|
*
|
|
|
|
|
* (++) the cloned mount is propagated to all the mounts in the propagation
|
|
|
|
|
* tree of the destination mount and the cloned mount is added to
|
|
|
|
|
* the peer group of the source mount.
|
|
|
|
|
* (+) the cloned mount is created under the destination mount and is marked
|
|
|
|
|
* as shared. The cloned mount is added to the peer group of the source
|
|
|
|
|
* mount.
|
2005-11-07 17:21:01 -05:00
|
|
|
* (+++) the mount is propagated to all the mounts in the propagation tree
|
|
|
|
|
* of the destination mount and the cloned mount is made slave
|
|
|
|
|
* of the same master as that of the source mount. The cloned mount
|
|
|
|
|
* is marked as 'shared and slave'.
|
|
|
|
|
* (*) the cloned mount is made a slave of the same master as that of the
|
|
|
|
|
* source mount.
|
|
|
|
|
*
|
2005-11-07 17:21:20 -05:00
|
|
|
* ---------------------------------------------------------------------------
|
|
|
|
|
* | MOVE MOUNT OPERATION |
|
|
|
|
|
* |**************************************************************************
|
|
|
|
|
* | source-->| shared | private | slave | unbindable |
|
|
|
|
|
* | dest | | | | |
|
|
|
|
|
* | | | | | | |
|
|
|
|
|
* | v | | | | |
|
|
|
|
|
* |**************************************************************************
|
|
|
|
|
* | shared | shared (+) | shared (+) | shared(+++) | invalid |
|
|
|
|
|
* | | | | | |
|
|
|
|
|
* |non-shared| shared (+*) | private | slave (*) | unbindable |
|
|
|
|
|
* ***************************************************************************
|
2005-11-07 17:21:01 -05:00
|
|
|
*
|
|
|
|
|
* (+) the mount is moved to the destination. And is then propagated to
|
|
|
|
|
* all the mounts in the propagation tree of the destination mount.
|
2005-11-07 17:20:03 -05:00
|
|
|
* (+*) the mount is moved to the destination.
|
2005-11-07 17:21:01 -05:00
|
|
|
* (+++) the mount is moved to the destination and is then propagated to
|
|
|
|
|
* all the mounts belonging to the destination mount's propagation tree.
|
|
|
|
|
* the mount is marked as 'shared and slave'.
|
|
|
|
|
* (*) the mount continues to be a slave at the new location.
|
2005-11-07 17:19:50 -05:00
|
|
|
*
|
|
|
|
|
* if the source mount is a tree, the operations explained above are
|
|
|
|
|
* applied to each mount in the tree.
|
|
|
|
|
* Must be called without spinlocks held, since this function can sleep
|
|
|
|
|
* in allocations.
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a mount
beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
*
|
|
|
|
|
* Context: The function expects namespace_lock() to be held.
|
|
|
|
|
* Return: If @source_mnt was successfully attached 0 is returned.
|
|
|
|
|
* Otherwise a negative error code is returned.
|
2005-11-07 17:19:50 -05:00
|
|
|
*/
|
2011-11-24 19:59:16 -05:00
|
|
|
static int attach_recursive_mnt(struct mount *source_mnt,
|
2025-04-25 22:49:47 -04:00
|
|
|
struct mount *dest_mnt,
|
2025-04-25 22:54:56 -04:00
|
|
|
struct mountpoint *dest_mp)
|
2005-11-07 17:19:50 -05:00
|
|
|
{
|
2019-01-30 13:15:45 -05:00
|
|
|
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
|
2014-03-20 21:10:51 -04:00
|
|
|
HLIST_HEAD(tree_list);
|
2025-04-25 22:49:47 -04:00
|
|
|
struct mnt_namespace *ns = dest_mnt->mnt_ns;
|
2017-01-20 18:28:35 +13:00
|
|
|
struct mountpoint *smp;
|
2025-04-25 22:40:48 -04:00
|
|
|
struct mountpoint *shorter = NULL;
|
2025-04-25 22:49:47 -04:00
|
|
|
struct mount *child, *p;
|
2025-06-20 22:46:55 -04:00
|
|
|
struct mount *top;
|
2014-03-20 21:10:51 -04:00
|
|
|
struct hlist_node *n;
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a mount
beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
int err = 0;
|
2025-04-25 22:54:56 -04:00
|
|
|
bool moving = mnt_has_parent(source_mnt);
|
2005-11-07 17:19:50 -05:00
|
|
|
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
/*
|
|
|
|
|
* Preallocate a mountpoint in case the new mounts need to be
|
|
|
|
|
* mounted beneath mounts on the same mountpoint.
|
2017-01-20 18:28:35 +13:00
|
|
|
*/
|
2025-06-20 22:46:55 -04:00
|
|
|
for (top = source_mnt; unlikely(top->overmount); top = top->overmount) {
|
2025-04-25 22:40:48 -04:00
|
|
|
if (!shorter && is_mnt_ns_file(top->mnt.mnt_root))
|
|
|
|
|
shorter = top->mnt_mp;
|
2025-06-20 22:46:55 -04:00
|
|
|
}
|
|
|
|
|
smp = get_mountpoint(top->mnt.mnt_root);
|
2017-01-20 18:28:35 +13:00
|
|
|
if (IS_ERR(smp))
|
|
|
|
|
return PTR_ERR(smp);
|
|
|
|
|
|
2016-09-28 00:27:17 -05:00
|
|
|
/* Is there space to add these mounts to the mount namespace? */
|
2019-06-30 19:18:53 -04:00
|
|
|
if (!moving) {
|
2016-09-28 00:27:17 -05:00
|
|
|
err = count_mounts(ns, source_mnt);
|
|
|
|
|
if (err)
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2011-11-25 01:05:37 -05:00
|
|
|
if (IS_MNT_SHARED(dest_mnt)) {
|
2011-11-24 19:59:16 -05:00
|
|
|
err = invent_group_ids(source_mnt, true);
|
2008-03-27 13:06:23 +01:00
|
|
|
if (err)
|
|
|
|
|
goto out;
|
2014-03-21 10:14:08 -04:00
|
|
|
err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
}
|
|
|
|
|
lock_mount_hash();
|
|
|
|
|
if (err)
|
|
|
|
|
goto out_cleanup_ids;
|
|
|
|
|
|
|
|
|
|
if (IS_MNT_SHARED(dest_mnt)) {
|
2011-11-25 03:06:56 -05:00
|
|
|
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
|
2011-11-24 20:43:10 -05:00
|
|
|
set_mnt_shared(p);
|
2005-11-07 17:19:50 -05:00
|
|
|
}
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
mount beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
|
2019-06-30 19:18:53 -04:00
|
|
|
if (moving) {
|
2025-04-25 12:55:39 -04:00
|
|
|
umount_mnt(source_mnt);
|
2025-01-29 17:58:01 +01:00
|
|
|
mnt_notify_add(source_mnt);
|
2025-05-01 19:59:30 -04:00
|
|
|
/* if the mount is moved, it should no longer be expired
|
|
|
|
|
* automatically */
|
|
|
|
|
list_del_init(&source_mnt->mnt_expire);
|
2005-11-07 17:20:03 -05:00
|
|
|
} else {
|
2018-11-05 17:40:31 +00:00
|
|
|
if (source_mnt->mnt_ns) {
|
2023-10-25 16:02:00 +02:00
|
|
|
LIST_HEAD(head);
|
|
|
|
|
|
2018-11-05 17:40:31 +00:00
|
|
|
/* move from anon - the caller will destroy */
|
2023-10-25 16:02:00 +02:00
|
|
|
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
|
|
|
|
|
move_from_ns(p, &head);
|
|
|
|
|
list_del_init(&head);
|
2018-11-05 17:40:31 +00:00
|
|
|
}
|
2005-11-07 17:20:03 -05:00
|
|
|
}
|
2005-11-07 17:19:50 -05:00
|
|
|
|
2025-04-25 22:34:33 -04:00
|
|
|
mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
|
2025-04-25 22:40:48 -04:00
|
|
|
/*
|
|
|
|
|
* Now the original copy is in the same state as the secondaries -
|
|
|
|
|
* its root attached to mountpoint, but not hashed and all mounts
|
|
|
|
|
* in it are either in our namespace or in no namespace at all.
|
|
|
|
|
* Add the original to the list of copies and deal with the
|
|
|
|
|
* rest of work for all of them uniformly.
|
|
|
|
|
*/
|
|
|
|
|
hlist_add_head(&source_mnt->mnt_hash, &tree_list);
|
2025-04-25 22:34:33 -04:00
|
|
|
|
2014-03-20 21:10:51 -04:00
|
|
|
hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
|
2014-03-20 20:34:43 -04:00
|
|
|
struct mount *q;
|
2014-03-20 21:10:51 -04:00
|
|
|
hlist_del_init(&child->mnt_hash);
|
2019-01-30 13:15:45 -05:00
|
|
|
/* Notice when we are propagating across user namespaces */
|
|
|
|
|
if (child->mnt_parent->mnt_ns->user_ns != user_ns)
|
|
|
|
|
lock_mnt_tree(child);
|
2025-06-22 18:03:29 -04:00
|
|
|
q = __lookup_mnt(&child->mnt_parent->mnt,
|
|
|
|
|
child->mnt_mountpoint);
|
2025-06-20 22:46:55 -04:00
|
|
|
if (q) {
|
2025-04-25 22:40:48 -04:00
|
|
|
struct mountpoint *mp = smp;
|
2025-06-20 22:46:55 -04:00
|
|
|
struct mount *r = child;
|
|
|
|
|
while (unlikely(r->overmount))
|
|
|
|
|
r = r->overmount;
|
2025-04-25 22:40:48 -04:00
|
|
|
if (unlikely(shorter) && child != source_mnt)
|
|
|
|
|
mp = shorter;
|
|
|
|
|
mnt_change_mountpoint(r, mp, q);
|
2025-06-20 22:46:55 -04:00
|
|
|
}
|
2017-01-20 18:28:35 +13:00
|
|
|
commit_tree(child);
|
2005-11-07 17:19:50 -05:00
|
|
|
}
|
2017-01-20 18:28:35 +13:00
|
|
|
put_mountpoint(smp);
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability will not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
|
2005-11-07 17:19:50 -05:00
|
|
|
return 0;
|
2008-03-27 13:06:23 +01:00
|
|
|
|
|
|
|
|
out_cleanup_ids:
|
smarter propagate_mnt()
The current mainline has copies propagated to *all* nodes, then
tears down the copies we made for nodes that do not contain
counterparts of the desired mountpoint. That sets the right
propagation graph for the copies (at teardown time we move
the slaves of removed node to a surviving peer or directly
to master), but we end up paying a fairly steep price in
useless allocations. It's fairly easy to create a situation
where N calls of mount(2) create exactly N bindings, with
O(N^2) vfsmounts allocated and freed in process.
Fortunately, it is possible to avoid those allocations/freeings.
The trick is to create copies in the right order and find which
one would've eventually become a master with the current algorithm.
It turns out to be possible in O(nodes getting propagation) time
and with no extra allocations at all.
One part is that we need to make sure that eventual master will be
created before its slaves, so we need to walk the propagation
tree in a different order - by peer groups. And iterate through
the peers before dealing with the next group.
Another thing is finding the (earlier) copy that will be a master
of one we are about to create; to do that we are (temporary) marking
the masters of mountpoints we are attaching the copies to.
Either we are in a peer of the last mountpoint we'd dealt with,
or we have the following situation: we are attaching to mountpoint M,
the last copy S_0 had been attached to M_0 and there are sequences
S_0...S_n, M_0...M_n such that S_{i+1} is a master of S_{i},
S_{i} mounted on M{i} and we need to create a slave of the first S_{k}
such that M is getting propagation from M_{k}. It means that the master
of M_{k} will be among the sequence of masters of M. On the
other hand, the nearest marked node in that sequence will either
be the master of M_{k} or the master of M_{k-1} (the latter -
in the case if M_{k-1} is a slave of something M gets propagation
from, but in a wrong peer group).
So we go through the sequence of masters of M until we find
a marked one (P). Let N be the one before it. Then we go through
the sequence of masters of S_0 until we find one (say, S) mounted
on a node D that has P as master and check if D is a peer of N.
If it is, S will be the master of new copy, if not - the master of S
will be.
That's it for the hard part; the rest is fairly simple. Iterator
is in next_group(), handling of one prospective mountpoint is
propagate_one().
It seems to survive all tests and gives a noticeably better performance
than the current mainline for setups that are seriously using shared
subtrees.
Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-02-27 09:35:45 -05:00
|
|
|
while (!hlist_empty(&tree_list)) {
|
|
|
|
|
child = hlist_entry(tree_list.first, struct mount, mnt_hash);
|
2016-09-28 00:27:17 -05:00
|
|
|
child->mnt_parent->mnt_ns->pending_mounts = 0;
|
2014-12-24 07:20:01 -06:00
|
|
|
umount_tree(child, UMOUNT_SYNC);
|
smarter propagate_mnt()
The current mainline has copies propagated to *all* nodes, then
tears down the copies we made for nodes that do not contain
counterparts of the desired mountpoint. That sets the right
propagation graph for the copies (at teardown time we move
the slaves of removed node to a surviving peer or directly
to master), but we end up paying a fairly steep price in
useless allocations. It's fairly easy to create a situation
where N calls of mount(2) create exactly N bindings, with
O(N^2) vfsmounts allocated and freed in process.
Fortunately, it is possible to avoid those allocations/freeings.
The trick is to create copies in the right order and find which
one would've eventually become a master with the current algorithm.
It turns out to be possible in O(nodes getting propagation) time
and with no extra allocations at all.
One part is that we need to make sure that eventual master will be
created before its slaves, so we need to walk the propagation
tree in a different order - by peer groups. And iterate through
the peers before dealing with the next group.
Another thing is finding the (earlier) copy that will be a master
of one we are about to create; to do that we are (temporary) marking
the masters of mountpoints we are attaching the copies to.
Either we are in a peer of the last mountpoint we'd dealt with,
or we have the following situation: we are attaching to mountpoint M,
the last copy S_0 had been attached to M_0 and there are sequences
S_0...S_n, M_0...M_n such that S_{i+1} is a master of S_{i},
S_{i} mounted on M{i} and we need to create a slave of the first S_{k}
such that M is getting propagation from M_{k}. It means that the master
of M_{k} will be among the sequence of masters of M. On the
other hand, the nearest marked node in that sequence will either
be the master of M_{k} or the master of M_{k-1} (the latter -
in the case if M_{k-1} is a slave of something M gets propagation
from, but in a wrong peer group).
So we go through the sequence of masters of M until we find
a marked one (P). Let N be the one before it. Then we go through
the sequence of masters of S_0 until we find one (say, S) mounted
on a node D that has P as master and check if D is a peer of N.
If it is, S will be the master of new copy, if not - the master of S
will be.
That's it for the hard part; the rest is fairly simple. Iterator
is in next_group(), handling of one prospective mountpoint is
propagate_one().
It seems to survive all tests and gives a noticeably better performance
than the current mainline for setups that are seriously using shared
subtrees.
Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-02-27 09:35:45 -05:00
|
|
|
}
|
|
|
|
|
unlock_mount_hash();
|
2014-03-21 10:14:08 -04:00
|
|
|
cleanup_group_ids(source_mnt, NULL);
|
2008-03-27 13:06:23 +01:00
|
|
|
out:
|
2016-09-28 00:27:17 -05:00
|
|
|
ns->pending_mounts = 0;
|
2017-01-20 18:28:35 +13:00
|
|
|
|
|
|
|
|
read_seqlock_excl(&mount_lock);
|
|
|
|
|
put_mountpoint(smp);
|
|
|
|
|
read_sequnlock_excl(&mount_lock);
|
|
|
|
|
|
2008-03-27 13:06:23 +01:00
|
|
|
return err;
|
2005-11-07 17:19:50 -05:00
|
|
|
}
|
|
|
|
|
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
/**
|
|
|
|
|
* do_lock_mount - lock mount and mountpoint
|
|
|
|
|
* @path: target path
|
|
|
|
|
* @beneath: whether the intention is to mount beneath @path
|
|
|
|
|
*
|
|
|
|
|
* Follow the mount stack on @path until the top mount @mnt is found. If
|
|
|
|
|
* the initial @path->{mnt,dentry} is a mountpoint, look up the first
|
|
|
|
|
* mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
|
|
|
|
|
* until nothing is stacked on top of it anymore.
|
|
|
|
|
*
|
|
|
|
|
* Acquire the inode_lock() on the top mount's ->mnt_root to protect
|
|
|
|
|
* against concurrent removal of the new mountpoint from another mount
|
|
|
|
|
* namespace.
|
|
|
|
|
*
|
|
|
|
|
* If @beneath is requested, the inode_lock() on @mnt's mountpoint
|
|
|
|
|
* @mp on @mnt->mnt_parent must be acquired. This protects against a
|
|
|
|
|
* concurrent unlink of @mp->mnt_dentry from another mount namespace
|
|
|
|
|
* where @mnt doesn't have a child mount mounted at @mp. A concurrent
|
|
|
|
|
* removal of @mnt->mnt_root doesn't matter as nothing will be mounted
|
|
|
|
|
* on top of it for @beneath.
|
|
|
|
|
*
|
|
|
|
|
* In addition, @beneath needs to make sure that @mnt hasn't been
|
|
|
|
|
* unmounted or moved from its current mountpoint in between dropping
|
|
|
|
|
* @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
|
|
|
|
|
* being unmounted would be detected later by e.g., calling
|
|
|
|
|
* check_mnt(mnt) in the function it's called from. For the @beneath
|
|
|
|
|
* case however, it's useful to detect it directly in do_lock_mount().
|
|
|
|
|
* If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
|
|
|
|
|
* to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
|
|
|
|
|
* point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
|
|
|
|
|
*
|
|
|
|
|
* Return: Either the target mountpoint on the top mount or the top
|
|
|
|
|
* mount's mountpoint.
|
|
|
|
|
*/
|
|
|
|
|
static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
|
2011-03-18 08:55:38 -04:00
|
|
|
{
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
struct vfsmount *mnt = path->mnt;
|
2023-05-03 13:18:41 +02:00
|
|
|
struct dentry *dentry;
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
mount beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @mnt_to is moved to a different mountpoint while acquiring
@namespace_sem to lock @mnt_to.
(3) if @mnt_to is unmounted while acquiring @namespace_sem to lock
@mnt_to.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
struct mountpoint *mp = ERR_PTR(-ENOENT);
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top of it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
struct path under = {};
|
2023-05-03 13:18:41 +02:00
|
|
|
|
|
|
|
|
for (;;) {
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top of it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
struct mount *m = real_mount(mnt);
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
mount beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @mnt_to is moved to a different mountpoint while acquiring
@namespace_sem to lock @mnt_to.
(3) if @mnt_to is unmounted while acquiring @namespace_sem to lock
@mnt_to.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
|
|
|
|
|
if (beneath) {
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top of it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
path_put(&under);
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
mount beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @mnt_to is moved to a different mountpoint while acquiring
@namespace_sem to lock @mnt_to.
(3) if @mnt_to is unmounted while acquiring @namespace_sem to lock
@mnt_to.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
read_seqlock_excl(&mount_lock);
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top of it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
under.mnt = mntget(&m->mnt_parent->mnt);
|
|
|
|
|
under.dentry = dget(m->mnt_mountpoint);
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
read_sequnlock_excl(&mount_lock);
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top of it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
dentry = under.dentry;
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @mnt_to is moved to a different mountpoint while acquiring
@namespace_sem to lock @mnt_to.
(3) if @mnt_to is unmounted while acquiring @namespace_sem to lock
@mnt_to.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
} else {
|
|
|
|
|
dentry = path->dentry;
|
|
|
|
|
}
|
|
|
|
|
|
2023-05-03 13:18:41 +02:00
|
|
|
inode_lock(dentry->d_inode);
|
|
|
|
|
namespace_lock();
|
|
|
|
|
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top of it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
|
|
|
|
|
break; // not to be mounted on
|
|
|
|
|
|
|
|
|
|
if (beneath && unlikely(m->mnt_mountpoint != dentry ||
|
|
|
|
|
&m->mnt_parent->mnt != under.mnt)) {
|
2013-03-16 15:12:40 -04:00
|
|
|
namespace_unlock();
|
2016-01-22 15:40:57 -05:00
|
|
|
inode_unlock(dentry->d_inode);
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top of it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
continue; // got moved
|
2013-03-15 10:53:28 -04:00
|
|
|
}
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a mount
beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @mnt_to is moved to a different mountpoint while acquiring
@namespace_sem to lock @mnt_to.
(3) if @mnt_to is unmounted while acquiring @namespace_sem to lock
@mnt_to.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
|
2023-05-03 13:18:41 +02:00
|
|
|
mnt = lookup_mnt(path);
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
if (unlikely(mnt)) {
|
|
|
|
|
namespace_unlock();
|
|
|
|
|
inode_unlock(dentry->d_inode);
|
|
|
|
|
path_put(path);
|
|
|
|
|
path->mnt = mnt;
|
|
|
|
|
path->dentry = dget(mnt->mnt_root);
|
|
|
|
|
continue; // got overmounted
|
|
|
|
|
}
|
|
|
|
|
mp = get_mountpoint(dentry);
|
|
|
|
|
if (IS_ERR(mp))
|
2023-05-03 13:18:41 +02:00
|
|
|
break;
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
if (beneath) {
|
|
|
|
|
/*
|
|
|
|
|
* @under duplicates the references that will stay
|
|
|
|
|
* at least until namespace_unlock(), so the path_put()
|
|
|
|
|
* below is safe (and OK to do under namespace_lock -
|
|
|
|
|
* we are not dropping the final references here).
|
|
|
|
|
*/
|
|
|
|
|
path_put(&under);
|
|
|
|
|
}
|
|
|
|
|
return mp;
|
2023-05-03 13:18:41 +02:00
|
|
|
}
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
namespace_unlock();
|
|
|
|
|
inode_unlock(dentry->d_inode);
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a mount
beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
if (beneath)
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top of it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
path_put(&under);
|
2023-05-03 13:18:41 +02:00
|
|
|
return mp;
|
2011-03-18 08:55:38 -04:00
|
|
|
}
|
|
|
|
|
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a mount
beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @mnt_to is moved to a different mountpoint while acquiring
@namespace_sem to lock @mnt_to.
(3) if @mnt_to is unmounted while acquiring @namespace_sem to lock
@mnt_to.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
static inline struct mountpoint *lock_mount(struct path *path)
|
|
|
|
|
{
|
|
|
|
|
return do_lock_mount(path, false);
|
2011-03-18 08:55:38 -04:00
|
|
|
}
|
|
|
|
|
|
2013-03-15 10:53:28 -04:00
|
|
|
static void unlock_mount(struct mountpoint *where)
|
2011-03-18 08:55:38 -04:00
|
|
|
{
|
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-04-23 02:30:34 +01:00
|
|
|
inode_unlock(where->m_dentry->d_inode);
|
2017-01-03 14:18:43 +13:00
|
|
|
read_seqlock_excl(&mount_lock);
|
2013-03-15 10:53:28 -04:00
|
|
|
put_mountpoint(where);
|
2017-01-03 14:18:43 +13:00
|
|
|
read_sequnlock_excl(&mount_lock);
|
2013-03-16 14:49:45 -04:00
|
|
|
namespace_unlock();
|
2011-03-18 08:55:38 -04:00
|
|
|
}
|
|
|
|
|
|
2013-03-15 10:53:28 -04:00
|
|
|
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2017-07-17 08:45:35 +01:00
|
|
|
if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
VFS: (Scripted) Convert S_ISLNK/DIR/REG(dentry->d_inode) to d_is_*(dentry)
Convert the following where appropriate:
(1) S_ISLNK(dentry->d_inode) to d_is_symlink(dentry).
(2) S_ISREG(dentry->d_inode) to d_is_reg(dentry).
(3) S_ISDIR(dentry->d_inode) to d_is_dir(dentry). This is actually more
complicated than it appears as some calls should be converted to
d_can_lookup() instead. The difference is whether the directory in
question is a real dir with a ->lookup op or whether it's a fake dir with
a ->d_automount op.
In some circumstances, we can subsume checks for dentry->d_inode not being
NULL into this, provided we the code isn't in a filesystem that expects
d_inode to be NULL if the dirent really *is* negative (ie. if we're going to
use d_inode() rather than d_backing_inode() to get the inode pointer).
Note that the dentry type field may be set to something other than
DCACHE_MISS_TYPE when d_inode is NULL in the case of unionmount, where the VFS
manages the fall-through from a negative dentry to a lower layer. In such a
case, the dentry type of the negative union dentry is set to the same as the
type of the lower dentry.
However, if you know d_inode is not NULL at the call site, then you can use
the d_is_xxx() functions even in a filesystem.
There is one further complication: a 0,0 chardev dentry may be labelled
DCACHE_WHITEOUT_TYPE rather than DCACHE_SPECIAL_TYPE. Strictly, this was
intended for special directory entry types that don't have attached inodes.
The following perl+coccinelle script was used:
use strict;
my @callers;
open($fd, 'git grep -l \'S_IS[A-Z].*->d_inode\' |') ||
die "Can't grep for S_ISDIR and co. callers";
@callers = <$fd>;
close($fd);
unless (@callers) {
print "No matches\n";
exit(0);
}
my @cocci = (
'@@',
'expression E;',
'@@',
'',
'- S_ISLNK(E->d_inode->i_mode)',
'+ d_is_symlink(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISDIR(E->d_inode->i_mode)',
'+ d_is_dir(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISREG(E->d_inode->i_mode)',
'+ d_is_reg(E)' );
my $coccifile = "tmp.sp.cocci";
open($fd, ">$coccifile") || die $coccifile;
print($fd "$_\n") || die $coccifile foreach (@cocci);
close($fd);
foreach my $file (@callers) {
chomp $file;
print "Processing ", $file, "\n";
system("spatch", "--sp-file", $coccifile, $file, "--in-place", "--no-show-diff") == 0 ||
die "spatch failed";
}
[AV: overlayfs parts skipped]
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-01-29 12:02:35 +00:00
|
|
|
if (d_is_dir(mp->m_dentry) !=
|
|
|
|
|
d_is_dir(mnt->mnt.mnt_root))
|
2005-04-16 15:20:36 -07:00
|
|
|
return -ENOTDIR;
|
|
|
|
|
|
2025-04-25 22:54:56 -04:00
|
|
|
return attach_recursive_mnt(mnt, p, mp);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2010-08-26 11:07:22 -07:00
|
|
|
/*
|
|
|
|
|
* Sanity check the flags to change_mnt_propagation.
|
|
|
|
|
*/
|
|
|
|
|
|
2017-07-17 08:45:35 +01:00
|
|
|
static int flags_to_propagation_type(int ms_flags)
|
2010-08-26 11:07:22 -07:00
|
|
|
{
|
2017-07-17 08:45:35 +01:00
|
|
|
int type = ms_flags & ~(MS_REC | MS_SILENT);
|
2010-08-26 11:07:22 -07:00
|
|
|
|
|
|
|
|
/* Fail if any non-propagation flags are set */
|
|
|
|
|
if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
|
|
|
|
|
return 0;
|
|
|
|
|
/* Only one propagation flag should be set */
|
|
|
|
|
if (!is_power_of_2(type))
|
|
|
|
|
return 0;
|
|
|
|
|
return type;
|
|
|
|
|
}
|
|
|
|
|
|
2005-11-07 17:19:07 -05:00
|
|
|
/*
|
|
|
|
|
* recursively change the type of the mountpoint.
|
|
|
|
|
*/
|
2017-07-17 08:45:35 +01:00
|
|
|
static int do_change_type(struct path *path, int ms_flags)
|
2005-11-07 17:19:07 -05:00
|
|
|
{
|
2011-11-24 18:57:30 -05:00
|
|
|
struct mount *m;
|
2011-11-24 19:54:23 -05:00
|
|
|
struct mount *mnt = real_mount(path->mnt);
|
2017-07-17 08:45:35 +01:00
|
|
|
int recurse = ms_flags & MS_REC;
|
2010-08-26 11:07:22 -07:00
|
|
|
int type;
|
2008-03-27 13:06:23 +01:00
|
|
|
int err = 0;
|
2005-11-07 17:19:07 -05:00
|
|
|
|
2023-05-03 13:18:39 +02:00
|
|
|
if (!path_mounted(path))
|
2005-11-07 17:19:07 -05:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
2017-07-17 08:45:35 +01:00
|
|
|
type = flags_to_propagation_type(ms_flags);
|
2010-08-26 11:07:22 -07:00
|
|
|
if (!type)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
2013-03-16 15:12:40 -04:00
|
|
|
namespace_lock();
|
2025-06-04 12:27:08 -04:00
|
|
|
if (!check_mnt(mnt)) {
|
|
|
|
|
err = -EINVAL;
|
|
|
|
|
goto out_unlock;
|
|
|
|
|
}
|
2008-03-27 13:06:23 +01:00
|
|
|
if (type == MS_SHARED) {
|
|
|
|
|
err = invent_group_ids(mnt, recurse);
|
|
|
|
|
if (err)
|
|
|
|
|
goto out_unlock;
|
|
|
|
|
}
|
|
|
|
|
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
2011-11-25 03:06:56 -05:00
|
|
|
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
|
2011-11-24 20:43:10 -05:00
|
|
|
change_mnt_propagation(m, type);
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2008-03-27 13:06:23 +01:00
|
|
|
|
|
|
|
|
out_unlock:
|
2013-03-16 15:12:40 -04:00
|
|
|
namespace_unlock();
|
2008-03-27 13:06:23 +01:00
|
|
|
return err;
|
2005-11-07 17:19:07 -05:00
|
|
|
}
|
|
|
|
|
|
2025-02-21 14:13:04 +01:00
|
|
|
/* may_copy_tree() - check if a mount tree can be copied
|
|
|
|
|
* @path: path to the mount tree to be copied
|
|
|
|
|
*
|
|
|
|
|
* This helper checks if the caller may copy the mount tree starting
|
|
|
|
|
* from @path->mnt. The caller may copy the mount tree under the
|
|
|
|
|
* following circumstances:
|
|
|
|
|
*
|
|
|
|
|
* (1) The caller is located in the mount namespace of the mount tree.
|
|
|
|
|
* This also implies that the mount does not belong to an anonymous
|
|
|
|
|
* mount namespace.
|
|
|
|
|
* (2) The caller tries to copy an nfs mount referring to a mount
|
|
|
|
|
* namespace, i.e., the caller is trying to copy a mount namespace
|
|
|
|
|
* entry from nsfs.
|
|
|
|
|
* (3) The caller tries to copy a pidfs mount referring to a pidfd.
|
2025-02-21 14:13:05 +01:00
|
|
|
* (4) The caller is trying to copy a mount tree that belongs to an
|
|
|
|
|
* anonymous mount namespace.
|
|
|
|
|
*
|
|
|
|
|
* For that to be safe, this helper enforces that the origin mount
|
|
|
|
|
* namespace the anonymous mount namespace was created from is the
|
|
|
|
|
* same as the caller's mount namespace by comparing the sequence
|
|
|
|
|
* numbers.
|
|
|
|
|
*
|
|
|
|
|
* This is not strictly necessary. The current semantics of the new
|
|
|
|
|
* mount api enforce that the caller must be located in the same
|
|
|
|
|
* mount namespace as the mount tree it interacts with. Using the
|
|
|
|
|
* origin sequence number preserves these semantics even for
|
|
|
|
|
* anonymous mount namespaces. However, one could envision extending
|
|
|
|
|
* the api to directly operate across mount namespace if needed.
|
|
|
|
|
*
|
|
|
|
|
* The ownership of a non-anonymous mount namespace such as the
|
|
|
|
|
* caller's cannot change.
|
|
|
|
|
* => We know that the caller's mount namespace is stable.
|
|
|
|
|
*
|
|
|
|
|
* If the origin sequence number of the anonymous mount namespace is
|
|
|
|
|
* the same as the sequence number of the caller's mount namespace.
|
|
|
|
|
* => The owning namespaces are the same.
|
|
|
|
|
*
|
|
|
|
|
* ==> The earlier capability check on the owning namespace of the
|
|
|
|
|
* caller's mount namespace ensures that the caller has the
|
|
|
|
|
* ability to copy the mount tree.
|
2025-02-21 14:13:04 +01:00
|
|
|
*
|
|
|
|
|
* Returns true if the mount tree can be copied, false otherwise.
|
|
|
|
|
*/
|
|
|
|
|
static inline bool may_copy_tree(struct path *path)
|
|
|
|
|
{
|
|
|
|
|
struct mount *mnt = real_mount(path->mnt);
|
|
|
|
|
const struct dentry_operations *d_op;
|
|
|
|
|
|
|
|
|
|
if (check_mnt(mnt))
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
d_op = path->dentry->d_op;
|
|
|
|
|
if (d_op == &ns_dentry_operations)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (d_op == &pidfs_dentry_operations)
|
|
|
|
|
return true;
|
|
|
|
|
|
2025-02-21 14:13:05 +01:00
|
|
|
if (!is_mounted(path->mnt))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
return check_anonymous_mnt(mnt);
|
2025-02-21 14:13:04 +01:00
|
|
|
}
|
|
|
|
|
|
2025-02-21 14:13:05 +01:00
|
|
|
|
2018-11-05 17:40:30 +00:00
|
|
|
static struct mount *__do_loopback(struct path *old_path, int recurse)
|
|
|
|
|
{
|
don't set MNT_LOCKED on parentless mounts
Originally MNT_LOCKED meant only one thing - "don't let this mount to
be peeled off its parent, we don't want to have its mountpoint exposed".
Accordingly, it had only been set on mounts that *do* have a parent.
Later it got overloaded with another use - setting it on the absolute
root had given free protection against umount(2) of absolute root
(was possible to trigger, oopsed). Not a bad trick, but it ended
up costing more than it bought us. Unfortunately, the cost included
both hard-to-reason-about logics and a subtle race between
mount -o remount,ro and mount --[r]bind - lockless &= ~MNT_LOCKED in
the end of __do_loopback() could race with sb_prepare_remount_readonly()
setting and clearing MNT_HOLD_WRITE (under mount_lock, as it should
be). The race wouldn't be much of a problem (there are other ways to
deal with it), but the subtlety is.
Turns out that nobody except umount(2) had ever made use of having
MNT_LOCKED set on absolute root. So let's give up on that trick,
clever as it had been, add an explicit check in do_umount() and
return to using MNT_LOCKED only for mounts that have a parent.
It means that
* clone_mnt() no longer copies MNT_LOCKED
* copy_tree() sets it on submounts if their counterparts had
been marked such, and does that right next to attach_mnt() in there,
in the same mount_lock scope.
* __do_loopback() no longer needs to strip MNT_LOCKED off the
root of subtree it's about to return; no store, no race.
* init_mount_tree() doesn't bother setting MNT_LOCKED on absolute
root.
* lock_mnt_tree() does not set MNT_LOCKED on the subtree's root;
accordingly, its caller (loop in attach_recursive_mnt()) does not need to
bother stripping that MNT_LOCKED on root. Note that lock_mnt_tree() setting
MNT_LOCKED on submounts happens in the same mount_lock scope as __attach_mnt()
(from commit_tree()) that makes them reachable.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-05-06 18:48:05 -04:00
|
|
|
struct mount *old = real_mount(old_path->mnt);
|
2018-11-05 17:40:30 +00:00
|
|
|
|
|
|
|
|
if (IS_MNT_UNBINDABLE(old))
|
don't set MNT_LOCKED on parentless mounts
Originally MNT_LOCKED meant only one thing - "don't let this mount to
be peeled off its parent, we don't want to have its mountpoint exposed".
Accordingly, it had only been set on mounts that *do* have a parent.
Later it got overloaded with another use - setting it on the absolute
root had given free protection against umount(2) of absolute root
(was possible to trigger, oopsed). Not a bad trick, but it ended
up costing more than it bought us. Unfortunately, the cost included
both hard-to-reason-about logics and a subtle race between
mount -o remount,ro and mount --[r]bind - lockless &= ~MNT_LOCKED in
the end of __do_loopback() could race with sb_prepare_remount_readonly()
setting and clearing MNT_HOLD_WRITE (under mount_lock, as it should
be). The race wouldn't be much of a problem (there are other ways to
deal with it), but the subtlety is.
Turns out that nobody except umount(2) had ever made use of having
MNT_LOCKED set on absolute root. So let's give up on that trick,
clever as it had been, add an explicit check in do_umount() and
return to using MNT_LOCKED only for mounts that have a parent.
It means that
* clone_mnt() no longer copies MNT_LOCKED
* copy_tree() sets it on submounts if their counterparts had
been marked such, and does that right next to attach_mnt() in there,
in the same mount_lock scope.
* __do_loopback() no longer needs to strip MNT_LOCKED off the
root of subtree it's about to return; no store, no race.
* init_mount_tree() doesn't bother setting MNT_LOCKED on absolute
root.
* lock_mnt_tree() does not set MNT_LOCKED on the subtree's root;
accordingly, its caller (loop in attach_recursive_mnt()) does not need to
bother stripping that MNT_LOCKED on root. Note that lock_mnt_tree() setting
MNT_LOCKED on submounts happens in the same mount_lock scope as __attach_mnt()
(from commit_tree()) that makes them reachable.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-05-06 18:48:05 -04:00
|
|
|
return ERR_PTR(-EINVAL);
|
2018-11-05 17:40:30 +00:00
|
|
|
|
2025-02-21 14:13:04 +01:00
|
|
|
if (!may_copy_tree(old_path))
|
don't set MNT_LOCKED on parentless mounts
Originally MNT_LOCKED meant only one thing - "don't let this mount to
be peeled off its parent, we don't want to have its mountpoint exposed".
Accordingly, it had only been set on mounts that *do* have a parent.
Later it got overloaded with another use - setting it on the absolute
root had given free protection against umount(2) of absolute root
(was possible to trigger, oopsed). Not a bad trick, but it ended
up costing more than it bought us. Unfortunately, the cost included
both hard-to-reason-about logics and a subtle race between
mount -o remount,ro and mount --[r]bind - lockless &= ~MNT_LOCKED in
the end of __do_loopback() could race with sb_prepare_remount_readonly()
setting and clearing MNT_HOLD_WRITE (under mount_lock, as it should
be). The race wouldn't be much of a problem (there are other ways to
deal with it), but the subtlety is.
Turns out that nobody except umount(2) had ever made use of having
MNT_LOCKED set on absolute root. So let's give up on that trick,
clever as it had been, add an explicit check in do_umount() and
return to using MNT_LOCKED only for mounts that have a parent.
It means that
* clone_mnt() no longer copies MNT_LOCKED
* copy_tree() sets it on submounts if their counterparts had
been marked such, and does that right next to attach_mnt() in there,
in the same mount_lock scope.
* __do_loopback() no longer needs to strip MNT_LOCKED off the
root of subtree it's about to return; no store, no race.
* init_mount_tree() doesn't bother setting MNT_LOCKED on absolute
root.
* lock_mnt_tree() does not set MNT_LOCKED on the subtree's root;
accordingly, its caller (loop in attach_recursive_mnt()) does not need to
bother stripping that MNT_LOCKED on root. Note that lock_mnt_tree() setting
MNT_LOCKED on submounts happens in the same mount_lock scope as __attach_mnt()
(from commit_tree()) that makes them reachable.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-05-06 18:48:05 -04:00
|
|
|
return ERR_PTR(-EINVAL);
|
2018-11-05 17:40:30 +00:00
|
|
|
|
2025-06-01 14:23:52 -04:00
|
|
|
if (!recurse && __has_locked_children(old, old_path->dentry))
|
don't set MNT_LOCKED on parentless mounts
Originally MNT_LOCKED meant only one thing - "don't let this mount to
be peeled off its parent, we don't want to have its mountpoint exposed".
Accordingly, it had only been set on mounts that *do* have a parent.
Later it got overloaded with another use - setting it on the absolute
root had given free protection against umount(2) of absolute root
(was possible to trigger, oopsed). Not a bad trick, but it ended
up costing more than it bought us. Unfortunately, the cost included
both hard-to-reason-about logics and a subtle race between
mount -o remount,ro and mount --[r]bind - lockless &= ~MNT_LOCKED in
the end of __do_loopback() could race with sb_prepare_remount_readonly()
setting and clearing MNT_HOLD_WRITE (under mount_lock, as it should
be). The race wouldn't be much of a problem (there are other ways to
deal with it), but the subtlety is.
Turns out that nobody except umount(2) had ever made use of having
MNT_LOCKED set on absolute root. So let's give up on that trick,
clever as it had been, add an explicit check in do_umount() and
return to using MNT_LOCKED only for mounts that have a parent.
It means that
* clone_mnt() no longer copies MNT_LOCKED
* copy_tree() sets it on submounts if their counterparts had
been marked such, and does that right next to attach_mnt() in there,
in the same mount_lock scope.
* __do_loopback() no longer needs to strip MNT_LOCKED off the
root of subtree it's about to return; no store, no race.
* init_mount_tree() doesn't bother setting MNT_LOCKED on absolute
root.
* lock_mnt_tree() does not set MNT_LOCKED on the subtree's root;
accordingly, its caller (loop in attach_recursive_mnt()) does not need to
bother stripping that MNT_LOCKED on root. Note that lock_mnt_tree() setting
MNT_LOCKED on submounts happens in the same mount_lock scope as __attach_mnt()
(from commit_tree()) that makes them reachable.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-05-06 18:48:05 -04:00
|
|
|
return ERR_PTR(-EINVAL);
|
2018-11-05 17:40:30 +00:00
|
|
|
|
|
|
|
|
if (recurse)
|
don't set MNT_LOCKED on parentless mounts
Originally MNT_LOCKED meant only one thing - "don't let this mount to
be peeled off its parent, we don't want to have its mountpoint exposed".
Accordingly, it had only been set on mounts that *do* have a parent.
Later it got overloaded with another use - setting it on the absolute
root had given free protection against umount(2) of absolute root
(was possible to trigger, oopsed). Not a bad trick, but it ended
up costing more than it bought us. Unfortunately, the cost included
both hard-to-reason-about logics and a subtle race between
mount -o remount,ro and mount --[r]bind - lockless &= ~MNT_LOCKED in
the end of __do_loopback() could race with sb_prepare_remount_readonly()
setting and clearing MNT_HOLD_WRITE (under mount_lock, as it should
be). The race wouldn't be much of a problem (there are other ways to
deal with it), but the subtlety is.
Turns out that nobody except umount(2) had ever made use of having
MNT_LOCKED set on absolute root. So let's give up on that trick,
clever as it had been, add an explicit check in do_umount() and
return to using MNT_LOCKED only for mounts that have a parent.
It means that
* clone_mnt() no longer copies MNT_LOCKED
* copy_tree() sets it on submounts if their counterparts had
been marked such, and does that right next to attach_mnt() in there,
in the same mount_lock scope.
* __do_loopback() no longer needs to strip MNT_LOCKED off the
root of subtree it's about to return; no store, no race.
* init_mount_tree() doesn't bother setting MNT_LOCKED on absolute
root.
* lock_mnt_tree() does not set MNT_LOCKED on the subtree's root;
accordingly, its caller (loop in attach_recursive_mnt()) does not need to
bother stripping that MNT_LOCKED on root. Note that lock_mnt_tree() setting
MNT_LOCKED on submounts happens in the same mount_lock scope as __attach_mnt()
(from commit_tree()) that makes them reachable.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-05-06 18:48:05 -04:00
|
|
|
return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
|
2018-11-05 17:40:30 +00:00
|
|
|
else
|
don't set MNT_LOCKED on parentless mounts
Originally MNT_LOCKED meant only one thing - "don't let this mount to
be peeled off its parent, we don't want to have its mountpoint exposed".
Accordingly, it had only been set on mounts that *do* have a parent.
Later it got overloaded with another use - setting it on the absolute
root had given free protection against umount(2) of absolute root
(was possible to trigger, oopsed). Not a bad trick, but it ended
up costing more than it bought us. Unfortunately, the cost included
both hard-to-reason-about logics and a subtle race between
mount -o remount,ro and mount --[r]bind - lockless &= ~MNT_LOCKED in
the end of __do_loopback() could race with sb_prepare_remount_readonly()
setting and clearing MNT_HOLD_WRITE (under mount_lock, as it should
be). The race wouldn't be much of a problem (there are other ways to
deal with it), but the subtlety is.
Turns out that nobody except umount(2) had ever made use of having
MNT_LOCKED set on absolute root. So let's give up on that trick,
clever as it had been, add an explicit check in do_umount() and
return to using MNT_LOCKED only for mounts that have a parent.
It means that
* clone_mnt() no longer copies MNT_LOCKED
* copy_tree() sets it on submounts if their counterparts had
been marked such, and does that right next to attach_mnt() in there,
in the same mount_lock scope.
* __do_loopback() no longer needs to strip MNT_LOCKED off the
root of subtree it's about to return; no store, no race.
* init_mount_tree() doesn't bother setting MNT_LOCKED on absolute
root.
* lock_mnt_tree() does not set MNT_LOCKED on the subtree's root;
accordingly, its caller (loop in attach_recursive_mnt()) does not need to
bother stripping that MNT_LOCKED on root. Note that lock_mnt_tree() setting
MNT_LOCKED on submounts happens in the same mount_lock scope as __attach_mnt()
(from commit_tree()) that makes them reachable.
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-05-06 18:48:05 -04:00
|
|
|
return clone_mnt(old, old_path->dentry, 0);
|
2018-11-05 17:40:30 +00:00
|
|
|
}
|
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
|
* do loopback mount.
|
|
|
|
|
*/
|
2012-10-11 11:42:01 -04:00
|
|
|
static int do_loopback(struct path *path, const char *old_name,
|
2008-02-08 04:22:12 -08:00
|
|
|
int recurse)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2008-08-02 00:51:11 -04:00
|
|
|
struct path old_path;
|
2018-11-05 17:40:30 +00:00
|
|
|
struct mount *mnt = NULL, *parent;
|
2013-03-15 10:53:28 -04:00
|
|
|
struct mountpoint *mp;
|
2013-02-22 22:49:10 -05:00
|
|
|
int err;
|
2005-04-16 15:20:36 -07:00
|
|
|
if (!old_name || !*old_name)
|
|
|
|
|
return -EINVAL;
|
2011-09-26 20:36:09 -04:00
|
|
|
err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (err)
|
|
|
|
|
return err;
|
|
|
|
|
|
2010-03-07 18:49:36 -08:00
|
|
|
err = -EINVAL;
|
2013-03-30 01:35:18 -07:00
|
|
|
if (mnt_ns_loop(old_path.dentry))
|
2017-07-04 17:25:09 +01:00
|
|
|
goto out;
|
2010-03-07 18:49:36 -08:00
|
|
|
|
2013-03-15 10:53:28 -04:00
|
|
|
mp = lock_mount(path);
|
2018-11-05 17:40:30 +00:00
|
|
|
if (IS_ERR(mp)) {
|
|
|
|
|
err = PTR_ERR(mp);
|
2011-03-18 08:55:38 -04:00
|
|
|
goto out;
|
2018-11-05 17:40:30 +00:00
|
|
|
}
|
2011-03-18 08:55:38 -04:00
|
|
|
|
2013-03-15 10:53:28 -04:00
|
|
|
parent = real_mount(path->mnt);
|
take the targets of /proc/*/ns/* symlinks to separate fs
New pseudo-filesystem: nsfs. Targets of /proc/*/ns/* live there now.
It's not mountable (not even registered, so it's not in /proc/filesystems,
etc.). Files on it *are* bindable - we explicitly permit that in do_loopback().
This stuff lives in fs/nsfs.c now; proc_ns_fget() moved there as well.
get_proc_ns() is a macro now (it's simply returning ->i_private; would
have been an inline, if not for header ordering headache).
proc_ns_inode() is an ex-parrot. The interface used in procfs is
ns_get_path(path, task, ops) and ns_get_name(buf, size, task, ops).
Dentries and inodes are never hashed; a non-counting reference to dentry
is stashed in ns_common (removed by ->d_prune()) and reused by ns_get_path()
if present. See ns_get_path()/ns_prune_dentry/nsfs_evict() for details
of that mechanism.
As the result, proc_ns_follow_link() has stopped poking in nd->path.mnt;
it does nd_jump_link() on a consistent <vfsmount,dentry> pair it gets
from ns_get_path().
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-11-01 10:57:28 -04:00
|
|
|
if (!check_mnt(parent))
|
|
|
|
|
goto out2;
|
|
|
|
|
|
2018-11-05 17:40:30 +00:00
|
|
|
mnt = __do_loopback(&old_path, recurse);
|
2012-06-25 12:55:18 +01:00
|
|
|
if (IS_ERR(mnt)) {
|
|
|
|
|
err = PTR_ERR(mnt);
|
2013-04-09 17:33:29 +04:00
|
|
|
goto out2;
|
2012-06-25 12:55:18 +01:00
|
|
|
}
|
2005-11-07 17:15:04 -05:00
|
|
|
|
2013-03-15 10:53:28 -04:00
|
|
|
err = graft_tree(mnt, parent, mp);
|
2005-11-07 17:15:04 -05:00
|
|
|
if (err) {
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
2014-12-24 07:20:01 -06:00
|
|
|
umount_tree(mnt, UMOUNT_SYNC);
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2005-11-07 17:16:29 -05:00
|
|
|
}
|
2011-03-18 08:55:38 -04:00
|
|
|
out2:
|
2013-03-15 10:53:28 -04:00
|
|
|
unlock_mount(mp);
|
2005-11-07 17:15:04 -05:00
|
|
|
out:
|
2008-08-02 00:51:11 -04:00
|
|
|
path_put(&old_path);
|
2005-04-16 15:20:36 -07:00
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-05 17:40:30 +00:00
|
|
|
/*
 * Copy the mount (tree, if @recursive) at @path into a newly allocated
 * mount namespace and open an O_PATH file on the copy.
 *
 * On success @path->mnt is switched to the root of the copy (the
 * reference on the original mount is dropped) and the returned file is
 * flagged FMODE_NEED_UNMOUNT.  If opening fails after the copy was made,
 * the copy is handed to dissolve_on_fput() and the error pointer from
 * dentry_open() is returned.  Failures of alloc_mnt_ns() or
 * __do_loopback() are returned as error pointers as well.
 */
static struct file *open_detached_copy(struct path *path, bool recursive)
{
	struct mnt_namespace *caller_ns = current->nsproxy->mnt_ns;
	struct mnt_namespace *ns, *src_mnt_ns;
	struct mount *mnt, *p;
	struct file *file;

	ns = alloc_mnt_ns(caller_ns->user_ns, true);
	if (IS_ERR(ns))
		return ERR_CAST(ns);

	namespace_lock();

	/*
	 * Remember which mount namespace the source came from: an anonymous
	 * source passes on its own origin, any other source contributes its
	 * sequence number.  namespace_sem has to be held here so that the
	 * mount doesn't get attached meanwhile.
	 */
	if (is_mounted(path->mnt)) {
		src_mnt_ns = real_mount(path->mnt)->mnt_ns;
		ns->seq_origin = is_anon_ns(src_mnt_ns) ?
				 src_mnt_ns->seq_origin : src_mnt_ns->seq;
	}

	mnt = __do_loopback(path, recursive);
	if (IS_ERR(mnt)) {
		namespace_unlock();
		free_mnt_ns(ns);
		return ERR_CAST(mnt);
	}

	lock_mount_hash();
	/* Move every mount of the copy into the new namespace. */
	p = mnt;
	while (p) {
		mnt_add_to_ns(ns, p);
		ns->nr_mounts++;
		p = next_mnt(p, mnt);
	}
	ns->root = mnt;
	mntget(&mnt->mnt);
	unlock_mount_hash();
	namespace_unlock();

	/* From here on @path refers to the copy instead of the original. */
	mntput(path->mnt);
	path->mnt = &mnt->mnt;

	file = dentry_open(path, O_PATH, current_cred());
	if (IS_ERR(file)) {
		dissolve_on_fput(path->mnt);
		return file;
	}

	file->f_mode |= FMODE_NEED_UNMOUNT;
	return file;
}
|
|
|
|
|
|
2025-01-28 11:33:39 +01:00
|
|
|
/*
 * vfs_open_tree - resolve a path and open it as an O_PATH mount handle
 * @dfd:      directory file descriptor the lookup is relative to
 * @filename: userspace pathname
 * @flags:    AT_* and OPEN_TREE_* flags
 *
 * Without OPEN_TREE_CLONE the resolved path is simply opened with O_PATH.
 * With OPEN_TREE_CLONE the caller must pass may_mount() and receives a file
 * referring to a detached copy made by open_detached_copy(); AT_RECURSIVE
 * is only accepted together with OPEN_TREE_CLONE.
 *
 * Returns the file or an ERR_PTR(): -EINVAL for bad flag combinations,
 * -EPERM when cloning is not permitted, or the lookup/open error.
 */
static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
{
	const unsigned int valid_flags = AT_EMPTY_PATH | AT_NO_AUTOMOUNT |
					 AT_RECURSIVE | AT_SYMLINK_NOFOLLOW |
					 OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC;
	struct path path __free(path_put) = {};
	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
	bool detached = flags & OPEN_TREE_CLONE;
	int ret;

	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);

	/* Reject unknown flags, and AT_RECURSIVE without OPEN_TREE_CLONE. */
	if ((flags & ~valid_flags) ||
	    (flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
		return ERR_PTR(-EINVAL);

	/* Translate the AT_* flags into their LOOKUP_* counterparts. */
	if (flags & AT_NO_AUTOMOUNT)
		lookup_flags &= ~LOOKUP_AUTOMOUNT;
	if (flags & AT_SYMLINK_NOFOLLOW)
		lookup_flags &= ~LOOKUP_FOLLOW;
	if (flags & AT_EMPTY_PATH)
		lookup_flags |= LOOKUP_EMPTY;

	if (detached && !may_mount())
		return ERR_PTR(-EPERM);

	ret = user_path_at(dfd, filename, lookup_flags, &path);
	if (unlikely(ret))
		return ERR_PTR(ret);

	if (!detached)
		return dentry_open(&path, O_PATH, current_cred());

	return open_detached_copy(&path, flags & AT_RECURSIVE);
}
|
|
|
|
|
|
|
|
|
|
/*
 * open_tree(2): obtain a file descriptor for the file produced by
 * vfs_open_tree().  The O_CLOEXEC bit of @flags is applied to the new
 * descriptor.  Returns the descriptor or a negative errno.
 */
SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
{
	struct file *file __free(fput) = vfs_open_tree(dfd, filename, flags);
	int fd;

	if (IS_ERR(file))
		return PTR_ERR(file);

	fd = get_unused_fd_flags(flags & O_CLOEXEC);
	/* Only hand the file over to the fd table once a slot was obtained. */
	if (fd >= 0)
		fd_install(fd, no_free_ptr(file));

	return fd;
}
|
|
|
|
|
|
2018-11-01 23:07:25 +00:00
|
|
|
/*
|
|
|
|
|
* Don't allow locked mount flags to be cleared.
|
|
|
|
|
*
|
|
|
|
|
* No locks need to be held here while testing the various MNT_LOCK
|
|
|
|
|
* flags because those flags can never be cleared once they are set.
|
|
|
|
|
*/
|
|
|
|
|
static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
|
2008-02-15 14:38:00 -08:00
|
|
|
{
|
2018-11-01 23:07:25 +00:00
|
|
|
unsigned int fl = mnt->mnt.mnt_flags;
|
|
|
|
|
|
|
|
|
|
if ((fl & MNT_LOCK_READONLY) &&
|
|
|
|
|
!(mnt_flags & MNT_READONLY))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if ((fl & MNT_LOCK_NODEV) &&
|
|
|
|
|
!(mnt_flags & MNT_NODEV))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if ((fl & MNT_LOCK_NOSUID) &&
|
|
|
|
|
!(mnt_flags & MNT_NOSUID))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if ((fl & MNT_LOCK_NOEXEC) &&
|
|
|
|
|
!(mnt_flags & MNT_NOEXEC))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if ((fl & MNT_LOCK_ATIME) &&
|
|
|
|
|
((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
|
|
|
|
|
return false;
|
2008-02-15 14:38:00 -08:00
|
|
|
|
2018-11-01 23:07:25 +00:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
|
2008-02-15 14:38:00 -08:00
|
|
|
{
|
2018-11-01 23:07:25 +00:00
|
|
|
bool readonly_request = (mnt_flags & MNT_READONLY);
|
2008-02-15 14:38:00 -08:00
|
|
|
|
2018-11-01 23:07:25 +00:00
|
|
|
if (readonly_request == __mnt_is_readonly(&mnt->mnt))
|
2008-02-15 14:38:00 -08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
if (readonly_request)
|
2018-11-01 23:07:25 +00:00
|
|
|
return mnt_make_readonly(mnt);
|
|
|
|
|
|
2021-01-21 14:19:48 +01:00
|
|
|
mnt->mnt.mnt_flags &= ~MNT_READONLY;
|
|
|
|
|
return 0;
|
2018-11-01 23:07:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
|
|
|
|
|
{
|
|
|
|
|
mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
|
|
|
|
|
mnt->mnt.mnt_flags = mnt_flags;
|
|
|
|
|
touch_mnt_namespace(mnt->mnt_ns);
|
|
|
|
|
}
|
|
|
|
|
|
2019-04-15 14:17:12 -07:00
|
|
|
static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
|
|
|
|
|
{
|
|
|
|
|
struct super_block *sb = mnt->mnt_sb;
|
|
|
|
|
|
|
|
|
|
if (!__mnt_is_readonly(mnt) &&
|
2022-03-22 14:39:22 -07:00
|
|
|
(!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
|
2019-04-15 14:17:12 -07:00
|
|
|
(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
|
2024-07-30 10:58:13 +02:00
|
|
|
char *buf, *mntpath;
|
|
|
|
|
|
|
|
|
|
buf = (char *)__get_free_page(GFP_KERNEL);
|
|
|
|
|
if (buf)
|
|
|
|
|
mntpath = d_path(mountpoint, buf, PAGE_SIZE);
|
|
|
|
|
else
|
|
|
|
|
mntpath = ERR_PTR(-ENOMEM);
|
|
|
|
|
if (IS_ERR(mntpath))
|
|
|
|
|
mntpath = "(unknown)";
|
2019-04-15 14:17:12 -07:00
|
|
|
|
2023-03-14 17:09:06 +02:00
|
|
|
pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
|
2019-10-16 19:48:14 -07:00
|
|
|
sb->s_type->name,
|
|
|
|
|
is_mounted(mnt) ? "remounted" : "mounted",
|
2023-03-14 17:09:06 +02:00
|
|
|
mntpath, &sb->s_time_max,
|
|
|
|
|
(unsigned long long)sb->s_time_max);
|
2019-04-15 14:17:12 -07:00
|
|
|
|
2022-03-22 14:39:22 -07:00
|
|
|
sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
|
2024-07-30 10:58:13 +02:00
|
|
|
if (buf)
|
|
|
|
|
free_page((unsigned long)buf);
|
2019-04-15 14:17:12 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-01 23:07:25 +00:00
|
|
|
/*
|
|
|
|
|
* Handle reconfiguration of the mountpoint only without alteration of the
|
|
|
|
|
* superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
|
|
|
|
|
* to mount(2).
|
|
|
|
|
*/
|
|
|
|
|
static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
|
|
|
|
|
{
|
|
|
|
|
struct super_block *sb = path->mnt->mnt_sb;
|
|
|
|
|
struct mount *mnt = real_mount(path->mnt);
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
if (!check_mnt(mnt))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
2023-05-03 13:18:39 +02:00
|
|
|
if (!path_mounted(path))
|
2018-11-01 23:07:25 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
if (!can_change_locked_flags(mnt, mnt_flags))
|
|
|
|
|
return -EPERM;
|
|
|
|
|
|
2021-01-21 14:19:50 +01:00
|
|
|
/*
|
|
|
|
|
* We're only checking whether the superblock is read-only not
|
|
|
|
|
* changing it, so only take down_read(&sb->s_umount).
|
|
|
|
|
*/
|
|
|
|
|
down_read(&sb->s_umount);
|
2021-01-21 14:19:48 +01:00
|
|
|
lock_mount_hash();
|
2018-11-01 23:07:25 +00:00
|
|
|
ret = change_mount_ro_state(mnt, mnt_flags);
|
|
|
|
|
if (ret == 0)
|
|
|
|
|
set_mount_attributes(mnt, mnt_flags);
|
2021-01-21 14:19:48 +01:00
|
|
|
unlock_mount_hash();
|
2021-01-21 14:19:50 +01:00
|
|
|
up_read(&sb->s_umount);
|
2019-04-15 14:17:12 -07:00
|
|
|
|
|
|
|
|
mnt_warn_timestamp_expiry(path, &mnt->mnt);
|
|
|
|
|
|
2018-11-01 23:07:25 +00:00
|
|
|
return ret;
|
2008-02-15 14:38:00 -08:00
|
|
|
}
|
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
|
* change filesystem flags. dir should be a physical root of filesystem.
|
|
|
|
|
* If you've mounted a non-root directory somewhere and want to do remount
|
|
|
|
|
* on it - tough luck.
|
|
|
|
|
*/
|
2017-07-17 08:45:35 +01:00
|
|
|
static int do_remount(struct path *path, int ms_flags, int sb_flags,
|
|
|
|
|
int mnt_flags, void *data)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
|
int err;
|
2008-08-02 00:51:11 -04:00
|
|
|
struct super_block *sb = path->mnt->mnt_sb;
|
2011-11-25 00:46:35 -05:00
|
|
|
struct mount *mnt = real_mount(path->mnt);
|
2018-11-04 09:28:36 -05:00
|
|
|
struct fs_context *fc;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2011-11-25 00:46:35 -05:00
|
|
|
if (!check_mnt(mnt))
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
2023-05-03 13:18:39 +02:00
|
|
|
if (!path_mounted(path))
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
2018-11-01 23:07:25 +00:00
|
|
|
if (!can_change_locked_flags(mnt, mnt_flags))
|
mnt: Correct permission checks in do_remount
While invesgiating the issue where in "mount --bind -oremount,ro ..."
would result in later "mount --bind -oremount,rw" succeeding even if
the mount started off locked I realized that there are several
additional mount flags that should be locked and are not.
In particular MNT_NOSUID, MNT_NODEV, MNT_NOEXEC, and the atime
flags in addition to MNT_READONLY should all be locked. These
flags are all per superblock, can all be changed with MS_BIND,
and should not be changable if set by a more privileged user.
The following additions to the current logic are added in this patch.
- nosuid may not be clearable by a less privileged user.
- nodev may not be clearable by a less privielged user.
- noexec may not be clearable by a less privileged user.
- atime flags may not be changeable by a less privileged user.
The logic with atime is that always setting atime on access is a
global policy and backup software and auditing software could break if
atime bits are not updated (when they are configured to be updated),
and serious performance degradation could result (DOS attack) if atime
updates happen when they have been explicitly disabled. Therefore an
unprivileged user should not be able to mess with the atime bits set
by a more privileged user.
The additional restrictions are implemented with the addition of
MNT_LOCK_NOSUID, MNT_LOCK_NODEV, MNT_LOCK_NOEXEC, and MNT_LOCK_ATIME
mnt flags.
Taken together these changes and the fixes for MNT_LOCK_READONLY
should make it safe for an unprivileged user to create a user
namespace and to call "mount --bind -o remount,... ..." without
the danger of mount flags being changed maliciously.
Cc: stable@vger.kernel.org
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
2014-07-28 17:26:07 -07:00
|
|
|
return -EPERM;
|
|
|
|
|
|
2018-11-04 09:28:36 -05:00
|
|
|
fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
|
|
|
|
|
if (IS_ERR(fc))
|
|
|
|
|
return PTR_ERR(fc);
|
2011-03-03 16:09:14 -05:00
|
|
|
|
2023-11-22 12:17:37 -05:00
|
|
|
/*
|
|
|
|
|
* Indicate to the filesystem that the remount request is coming
|
|
|
|
|
* from the legacy mount system call.
|
|
|
|
|
*/
|
2020-07-14 14:45:41 +02:00
|
|
|
fc->oldapi = true;
|
2023-11-22 12:17:37 -05:00
|
|
|
|
2018-11-04 09:28:36 -05:00
|
|
|
err = parse_monolithic_mount_data(fc, data);
|
|
|
|
|
if (!err) {
|
|
|
|
|
down_write(&sb->s_umount);
|
|
|
|
|
err = -EPERM;
|
|
|
|
|
if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
|
|
|
|
|
err = reconfigure_super(fc);
|
2021-01-21 14:19:48 +01:00
|
|
|
if (!err) {
|
|
|
|
|
lock_mount_hash();
|
2018-11-04 09:28:36 -05:00
|
|
|
set_mount_attributes(mnt, mnt_flags);
|
2021-01-21 14:19:48 +01:00
|
|
|
unlock_mount_hash();
|
|
|
|
|
}
|
2018-11-04 09:28:36 -05:00
|
|
|
}
|
|
|
|
|
up_write(&sb->s_umount);
|
2008-09-26 19:01:20 -07:00
|
|
|
}
|
2019-04-15 14:17:12 -07:00
|
|
|
|
|
|
|
|
mnt_warn_timestamp_expiry(path, &mnt->mnt);
|
|
|
|
|
|
2018-11-04 09:28:36 -05:00
|
|
|
put_fs_context(fc);
|
2005-04-16 15:20:36 -07:00
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
2011-11-24 20:01:19 -05:00
|
|
|
/*
 * Walk the mount tree rooted at @mnt with next_mnt() and report whether any
 * mount in it, @mnt included, is unbindable.  Returns 1 if so, 0 otherwise.
 */
static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *cur = mnt;

	/* Stop at the first unbindable mount or when the walk is exhausted. */
	while (cur && !IS_MNT_UNBINDABLE(cur))
		cur = next_mnt(cur, mnt);

	return cur ? 1 : 0;
}
|
|
|
|
|
|
2021-07-15 13:07:13 +03:00
|
|
|
static int do_set_group(struct path *from_path, struct path *to_path)
|
|
|
|
|
{
|
|
|
|
|
struct mount *from, *to;
|
|
|
|
|
int err;
|
|
|
|
|
|
|
|
|
|
from = real_mount(from_path->mnt);
|
|
|
|
|
to = real_mount(to_path->mnt);
|
|
|
|
|
|
|
|
|
|
namespace_lock();
|
|
|
|
|
|
|
|
|
|
err = -EINVAL;
|
|
|
|
|
/* To and From must be mounted */
|
|
|
|
|
if (!is_mounted(&from->mnt))
|
|
|
|
|
goto out;
|
|
|
|
|
if (!is_mounted(&to->mnt))
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
err = -EPERM;
|
|
|
|
|
/* We should be allowed to modify mount namespaces of both mounts */
|
|
|
|
|
if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
|
|
|
|
|
goto out;
|
|
|
|
|
if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
err = -EINVAL;
|
|
|
|
|
/* To and From paths should be mount roots */
|
2023-05-03 13:18:39 +02:00
|
|
|
if (!path_mounted(from_path))
|
2021-07-15 13:07:13 +03:00
|
|
|
goto out;
|
2023-05-03 13:18:39 +02:00
|
|
|
if (!path_mounted(to_path))
|
2021-07-15 13:07:13 +03:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
/* Setting sharing groups is only allowed across same superblock */
|
|
|
|
|
if (from->mnt.mnt_sb != to->mnt.mnt_sb)
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
/* From mount root should be wider than To mount root */
|
|
|
|
|
if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
/* From mount should not have locked children in place of To's root */
|
2025-06-01 14:23:52 -04:00
|
|
|
if (__has_locked_children(from, to->mnt.mnt_root))
|
2021-07-15 13:07:13 +03:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
/* Setting sharing groups is only allowed on private mounts */
|
|
|
|
|
if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
/* From should not be private */
|
|
|
|
|
if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
if (IS_MNT_SLAVE(from)) {
|
|
|
|
|
struct mount *m = from->mnt_master;
|
|
|
|
|
|
2025-06-03 17:57:27 -04:00
|
|
|
list_add(&to->mnt_slave, &from->mnt_slave);
|
2021-07-15 13:07:13 +03:00
|
|
|
to->mnt_master = m;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (IS_MNT_SHARED(from)) {
|
|
|
|
|
to->mnt_group_id = from->mnt_group_id;
|
|
|
|
|
list_add(&to->mnt_share, &from->mnt_share);
|
|
|
|
|
lock_mount_hash();
|
|
|
|
|
set_mnt_shared(to);
|
|
|
|
|
unlock_mount_hash();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
err = 0;
|
|
|
|
|
out:
|
|
|
|
|
namespace_unlock();
|
|
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allows userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
/**
|
|
|
|
|
* path_overmounted - check if path is overmounted
|
|
|
|
|
* @path: path to check
|
|
|
|
|
*
|
|
|
|
|
* Check if path is overmounted, i.e., if there's a mount on top of
|
|
|
|
|
* @path->mnt with @path->dentry as mountpoint.
|
|
|
|
|
*
|
2025-06-01 14:02:26 -04:00
|
|
|
* Context: namespace_sem must be held at least shared.
|
|
|
|
|
* MUST NOT be called under lock_mount_hash() (there one should just
|
|
|
|
|
* call __lookup_mnt() and check if it returns NULL).
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a mount
beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
* Return: If path is overmounted true is returned, false if not.
|
|
|
|
|
*/
|
|
|
|
|
static inline bool path_overmounted(const struct path *path)
|
|
|
|
|
{
|
2025-06-01 14:02:26 -04:00
|
|
|
unsigned seq = read_seqbegin(&mount_lock);
|
|
|
|
|
bool no_child;
|
|
|
|
|
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a mount
beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
rcu_read_lock();
|
2025-06-01 14:02:26 -04:00
|
|
|
no_child = !__lookup_mnt(path->mnt, path->dentry);
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may — dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
rcu_read_unlock();
|
2025-06-01 14:02:26 -04:00
|
|
|
if (need_seqretry(&mount_lock, seq)) {
|
|
|
|
|
read_seqlock_excl(&mount_lock);
|
|
|
|
|
no_child = !__lookup_mnt(path->mnt, path->dentry);
|
|
|
|
|
read_sequnlock_excl(&mount_lock);
|
|
|
|
|
}
|
|
|
|
|
return unlikely(!no_child);
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
}
|
|
|
|
|
|
2025-06-08 23:10:33 -04:00
|
|
|
/*
|
|
|
|
|
* Check if there is a possibly empty chain of descent from p1 to p2.
|
|
|
|
|
* Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl).
|
|
|
|
|
*/
|
|
|
|
|
static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
|
|
|
|
|
{
|
|
|
|
|
while (p2 != p1 && mnt_has_parent(p2))
|
|
|
|
|
p2 = p2->mnt_parent;
|
|
|
|
|
return p2 == p1;
|
|
|
|
|
}
|
|
|
|
|
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a mount
beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
/**
|
|
|
|
|
* can_move_mount_beneath - check that we can mount beneath the top mount
|
|
|
|
|
* @from: mount to mount beneath
|
|
|
|
|
* @to: mount under which to mount
|
2023-11-21 00:15:55 +01:00
|
|
|
* @mp: mountpoint of @to
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a mount
beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
*
|
|
|
|
|
* - Make sure that @to->dentry is actually the root of a mount under
|
|
|
|
|
* which we can mount another mount.
|
|
|
|
|
* - Make sure that nothing can be mounted beneath the caller's current
|
|
|
|
|
* root or the rootfs of the namespace.
|
|
|
|
|
* - Make sure that the caller can unmount the topmost mount ensuring
|
|
|
|
|
* that the caller could reveal the underlying mountpoint.
|
|
|
|
|
* - Ensure that nothing has been mounted on top of @from before we
|
|
|
|
|
* grabbed @namespace_sem to avoid creating pointless shadow mounts.
|
|
|
|
|
* - Prevent mounting beneath a mount if the propagation relationship
|
|
|
|
|
* between the source mount, parent mount, and top mount would lead to
|
|
|
|
|
* nonsensical mount trees.
|
|
|
|
|
*
|
|
|
|
|
* Context: This function expects namespace_lock() to be held.
|
|
|
|
|
* Return: On success 0, and on error a negative error code is returned.
|
|
|
|
|
*/
|
|
|
|
|
static int can_move_mount_beneath(const struct path *from,
|
|
|
|
|
const struct path *to,
|
|
|
|
|
const struct mountpoint *mp)
|
|
|
|
|
{
|
|
|
|
|
struct mount *mnt_from = real_mount(from->mnt),
|
|
|
|
|
*mnt_to = real_mount(to->mnt),
|
|
|
|
|
*parent_mnt_to = mnt_to->mnt_parent;
|
|
|
|
|
|
|
|
|
|
if (!mnt_has_parent(mnt_to))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
if (!path_mounted(to))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
if (IS_MNT_LOCKED(mnt_to))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
/* Avoid creating shadow mounts during mount propagation. */
|
|
|
|
|
if (path_overmounted(from))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Mounting beneath the rootfs only makes sense when the
|
|
|
|
|
* semantics of pivot_root(".", ".") are used.
|
|
|
|
|
*/
|
|
|
|
|
if (&mnt_to->mnt == current->fs->root.mnt)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
if (parent_mnt_to == current->nsproxy->mnt_ns->root)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
2025-06-08 23:10:33 -04:00
|
|
|
if (mount_is_ancestor(mnt_to, mnt_from))
|
|
|
|
|
return -EINVAL;
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a
mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @mnt_to is moved to a different mountpoint while acquiring
@namespace_sem to lock @mnt_to.
(3) if @mnt_to is unmounted while acquiring @namespace_sem to lock
@mnt_to.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If the parent mount propagates to the child mount this would
|
|
|
|
|
* mean mounting @mnt_from on @mnt_to->mnt_parent and then
|
|
|
|
|
* propagating a copy @c of @mnt_from on top of @mnt_to. This
|
|
|
|
|
* defeats the whole purpose of mounting beneath another mount.
|
|
|
|
|
*/
|
|
|
|
|
if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If @mnt_to->mnt_parent propagates to @mnt_from this would
|
|
|
|
|
* mean propagating a copy @c of @mnt_from on top of @mnt_from.
|
|
|
|
|
* Afterwards @mnt_from would be mounted on top of
|
|
|
|
|
* @mnt_to->mnt_parent and @mnt_to would be unmounted from
|
|
|
|
|
* @mnt->mnt_parent and remounted on @mnt_from. But since @c is
|
|
|
|
|
* already mounted on @mnt_from, @mnt_to would ultimately be
|
|
|
|
|
* remounted on top of @c. Afterwards, @mnt_from would be
|
|
|
|
|
* covered by a copy @c of @mnt_from and @c would be covered by
|
|
|
|
|
* @mnt_from itself. This defeats the whole purpose of mounting
|
|
|
|
|
* @mnt_from beneath @mnt_to.
|
|
|
|
|
*/
|
fix IS_MNT_PROPAGATING uses
propagate_mnt() does not attach anything to mounts created during
propagate_mnt() itself. What's more, anything on ->mnt_slave_list
of such new mount must also be new, so we don't need to even look
there.
When move_mount() had been introduced, we've got an additional
class of mounts to skip - if we are moving from anon namespace,
we do not want to propagate to mounts we are moving (i.e. all
mounts in that anon namespace).
Unfortunately, the part about "everything on their ->mnt_slave_list
will also be ignorable" is not true - if we have propagation graph
A -> B -> C
and do OPEN_TREE_CLONE open_tree() of B, we get
A -> [B <-> B'] -> C
as propagation graph, where B' is a clone of B in our detached tree.
Making B private will result in
A -> B' -> C
C still gets propagation from A, as it would after making B private
if we hadn't done that open_tree(), but now the propagation goes
through B'. Trying to move_mount() our detached tree on subdirectory
in A should have
* moved B' on that subdirectory in A
* skipped the corresponding subdirectory in B' itself
* copied B' on the corresponding subdirectory in C.
As it is, the logics in propagation_next() and friends ends up
skipping propagation into C, since it doesn't consider anything
downstream of B'.
IOW, walking the propagation graph should only skip the ->mnt_slave_list
of new mounts; the only places where the check for "in that one
anon namespace" are applicable are propagate_one() (where we should
treat that as the same kind of thing as "mountpoint we are looking
at is not visible in the mount we are looking at") and
propagation_would_overmount(). The latter is better dealt with
in the caller (can_move_mount_beneath()); on the first call of
propagation_would_overmount() the test is always false, on the
second it is always true in "move from anon namespace" case and
always false in "move within our namespace" one, so it's easier
to just use check_mnt() before bothering with the second call and
be done with that.
Fixes: 064fe6e233e8 ("mount: handle mount propagation for detached mount trees")
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-05-08 15:35:51 -04:00
|
|
|
if (check_mnt(mnt_from) &&
|
|
|
|
|
propagation_would_overmount(parent_mnt_to, mnt_from, mp))
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a mount
beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-21 14:13:08 +01:00
|
|
|
/* may_use_mount() - check if a mount tree can be used
|
|
|
|
|
* @mnt: vfsmount to be used
|
|
|
|
|
*
|
|
|
|
|
* This helper checks if the caller may use the mount tree starting
|
|
|
|
|
* from @path->mnt. The caller may use the mount tree under the
|
|
|
|
|
* following circumstances:
|
|
|
|
|
*
|
|
|
|
|
* (1) The caller is located in the mount namespace of the mount tree.
|
|
|
|
|
* This also implies that the mount does not belong to an anonymous
|
|
|
|
|
* mount namespace.
|
|
|
|
|
* (2) The caller is trying to use a mount tree that belongs to an
|
|
|
|
|
* anonymous mount namespace.
|
|
|
|
|
*
|
|
|
|
|
* For that to be safe, this helper enforces that the origin mount
|
|
|
|
|
* namespace the anonymous mount namespace was created from is the
|
|
|
|
|
* same as the caller's mount namespace by comparing the sequence
|
|
|
|
|
* numbers.
|
|
|
|
|
*
|
|
|
|
|
* The ownership of a non-anonymous mount namespace such as the
|
|
|
|
|
* caller's cannot change.
|
|
|
|
|
* => We know that the caller's mount namespace is stable.
|
|
|
|
|
*
|
|
|
|
|
* If the origin sequence number of the anonymous mount namespace is
|
|
|
|
|
* the same as the sequence number of the caller's mount namespace.
|
|
|
|
|
* => The owning namespaces are the same.
|
|
|
|
|
*
|
|
|
|
|
* ==> The earlier capability check on the owning namespace of the
|
|
|
|
|
* caller's mount namespace ensures that the caller has the
|
|
|
|
|
* ability to use the mount tree.
|
|
|
|
|
*
|
|
|
|
|
* Returns true if the mount tree can be used, false otherwise.
|
|
|
|
|
*/
|
|
|
|
|
static inline bool may_use_mount(struct mount *mnt)
|
|
|
|
|
{
|
|
|
|
|
if (check_mnt(mnt))
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Make sure that noone unmounted the target path or somehow
|
|
|
|
|
* managed to get their hands on something purely kernel
|
|
|
|
|
* internal.
|
|
|
|
|
*/
|
|
|
|
|
if (!is_mounted(&mnt->mnt))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
return check_anonymous_mnt(mnt);
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-21 14:13:07 +01:00
|
|
|
static int do_move_mount(struct path *old_path,
|
|
|
|
|
struct path *new_path, enum mnt_tree_flags_t flags)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2018-11-05 17:40:31 +00:00
|
|
|
struct mnt_namespace *ns;
|
2011-11-24 21:47:05 -05:00
|
|
|
struct mount *p;
|
2011-11-24 19:59:16 -05:00
|
|
|
struct mount *old;
|
2019-06-30 19:18:53 -04:00
|
|
|
struct mount *parent;
|
2025-04-25 12:55:39 -04:00
|
|
|
struct mountpoint *mp;
|
2013-02-22 22:49:10 -05:00
|
|
|
int err;
|
2025-05-08 00:09:30 -04:00
|
|
|
bool beneath = flags & MNT_TREE_BENEATH;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows explicitly moving a mount
beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
mp = do_lock_mount(new_path, beneath);
|
2013-03-15 10:53:28 -04:00
|
|
|
if (IS_ERR(mp))
|
2018-11-05 17:40:30 +00:00
|
|
|
return PTR_ERR(mp);
|
Add a dentry op to allow processes to be held during pathwalk transit
Add a dentry op (d_manage) to permit a filesystem to hold a process and make it
sleep when it tries to transit away from one of that filesystem's directories
during a pathwalk. The operation is keyed off a new dentry flag
(DCACHE_MANAGE_TRANSIT).
The filesystem is allowed to be selective about which processes it holds and
which it permits to continue on or prohibits from transiting from each flagged
directory. This will allow autofs to hold up client processes whilst letting
its userspace daemon through to maintain the directory or the stuff behind it
or mounted upon it.
The ->d_manage() dentry operation:
int (*d_manage)(struct path *path, bool mounting_here);
takes a pointer to the directory about to be transited away from and a flag
indicating whether the transit is undertaken by do_add_mount() or
do_move_mount() skipping through a pile of filesystems mounted on a mountpoint.
It should return 0 if successful and to let the process continue on its way;
-EISDIR to prohibit the caller from skipping to overmounted filesystems or
automounting, and to use this directory; or some other error code to return to
the user.
->d_manage() is called with namespace_sem writelocked if mounting_here is true
and no other locks held, so it may sleep. However, if mounting_here is true,
it may not initiate or wait for a mount or unmount upon the parameter
directory, even if the act is actually performed by userspace.
Within fs/namei.c, follow_managed() is extended to check with d_manage() first
on each managed directory, before transiting away from it or attempting to
automount upon it.
follow_down() is renamed follow_down_one() and should only be used where the
filesystem deliberately intends to avoid management steps (e.g. autofs).
A new follow_down() is added that incorporates the loop done by all other
callers of follow_down() (do_add/move_mount(), autofs and NFSD; whilst AFS, NFS
and CIFS do use it, their use is removed by converting them to use
d_automount()). The new follow_down() calls d_manage() as appropriate. It
also takes an extra parameter to indicate if it is being called from mount code
(with namespace_sem writelocked) which it passes to d_manage(). follow_down()
ignores automount points so that it can be used to mount on them.
__follow_mount_rcu() is made to abort rcu-walk mode if it hits a directory with
DCACHE_MANAGE_TRANSIT set on the basis that we're probably going to have to
sleep. It would be possible to enter d_manage() in rcu-walk mode too, and have
that determine whether to abort or not itself. That would allow the autofs
daemon to continue on in rcu-walk mode.
Note that DCACHE_MANAGE_TRANSIT on a directory should be cleared when it isn't
required as every transit from that directory will cause d_manage() to be
invoked. It can always be set again when necessary.
==========================
WHAT THIS MEANS FOR AUTOFS
==========================
Autofs currently uses the lookup() inode op and the d_revalidate() dentry op to
trigger the automounting of indirect mounts, and both of these can be called
with i_mutex held.
autofs knows that the i_mutex will be held by the caller in lookup(), and so
can drop it before invoking the daemon - but this isn't so for d_revalidate(),
since the lock is only held on _some_ of the code paths that call it. This
means that autofs can't risk dropping i_mutex from its d_revalidate() function
before it calls the daemon.
The bug could manifest itself as, for example, a process that's trying to
validate an automount dentry that gets made to wait because that dentry is
expired and needs cleaning up:
mkdir S ffffffff8014e05a 0 32580 24956
Call Trace:
[<ffffffff885371fd>] :autofs4:autofs4_wait+0x674/0x897
[<ffffffff80127f7d>] avc_has_perm+0x46/0x58
[<ffffffff8009fdcf>] autoremove_wake_function+0x0/0x2e
[<ffffffff88537be6>] :autofs4:autofs4_expire_wait+0x41/0x6b
[<ffffffff88535cfc>] :autofs4:autofs4_revalidate+0x91/0x149
[<ffffffff80036d96>] __lookup_hash+0xa0/0x12f
[<ffffffff80057a2f>] lookup_create+0x46/0x80
[<ffffffff800e6e31>] sys_mkdirat+0x56/0xe4
versus the automount daemon which wants to remove that dentry, but can't
because the normal process is holding the i_mutex lock:
automount D ffffffff8014e05a 0 32581 1 32561
Call Trace:
[<ffffffff80063c3f>] __mutex_lock_slowpath+0x60/0x9b
[<ffffffff8000ccf1>] do_path_lookup+0x2ca/0x2f1
[<ffffffff80063c89>] .text.lock.mutex+0xf/0x14
[<ffffffff800e6d55>] do_rmdir+0x77/0xde
[<ffffffff8005d229>] tracesys+0x71/0xe0
[<ffffffff8005d28d>] tracesys+0xd5/0xe0
which means that the system is deadlocked.
This patch allows autofs to hold up normal processes whilst the daemon goes
ahead and does things to the dentry tree behind the automounter point without
risking a deadlock as almost no locks are held in d_manage() and none in
d_automount().
Signed-off-by: David Howells <dhowells@redhat.com>
Was-Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-14 18:45:26 +00:00
|
|
|
|
2018-11-05 17:40:30 +00:00
|
|
|
old = real_mount(old_path->mnt);
|
|
|
|
|
p = real_mount(new_path->mnt);
|
2019-06-30 19:18:53 -04:00
|
|
|
parent = old->mnt_parent;
|
2018-11-05 17:40:31 +00:00
|
|
|
ns = old->mnt_ns;
|
2011-11-25 00:46:35 -05:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
err = -EINVAL;
|
2018-11-05 17:40:31 +00:00
|
|
|
|
2025-06-06 18:31:03 -04:00
|
|
|
if (check_mnt(old)) {
|
|
|
|
|
/* if the source is in our namespace... */
|
|
|
|
|
/* ... it should be detachable from parent */
|
|
|
|
|
if (!mnt_has_parent(old) || IS_MNT_LOCKED(old))
|
|
|
|
|
goto out;
|
|
|
|
|
/* ... and the target should be in our namespace */
|
|
|
|
|
if (!check_mnt(p))
|
|
|
|
|
goto out;
|
2025-05-08 00:09:30 -04:00
|
|
|
/* parent of the source should not be shared */
|
|
|
|
|
if (IS_MNT_SHARED(parent))
|
|
|
|
|
goto out;
|
2025-06-06 18:31:03 -04:00
|
|
|
} else {
|
2025-02-25 11:15:46 +01:00
|
|
|
/*
|
2025-06-06 18:31:03 -04:00
|
|
|
* otherwise the source must be the root of some anon namespace.
|
2025-02-25 11:15:46 +01:00
|
|
|
*/
|
2025-06-08 23:25:36 -04:00
|
|
|
if (!anon_ns_root(old))
|
2025-06-06 18:31:03 -04:00
|
|
|
goto out;
|
2025-02-25 11:15:46 +01:00
|
|
|
/*
|
2025-06-06 18:31:03 -04:00
|
|
|
* Bail out early if the target is within the same namespace -
|
|
|
|
|
* subsequent checks would've rejected that, but they lose
|
|
|
|
|
* some corner cases if we check it early.
|
2025-02-25 11:15:46 +01:00
|
|
|
*/
|
2025-06-06 18:31:03 -04:00
|
|
|
if (ns == p->mnt_ns)
|
|
|
|
|
goto out;
|
|
|
|
|
/*
|
|
|
|
|
* Target should be either in our namespace or in an acceptable
|
|
|
|
|
* anon namespace, sensu check_anonymous_mnt().
|
|
|
|
|
*/
|
|
|
|
|
if (!may_use_mount(p))
|
|
|
|
|
goto out;
|
2025-02-25 11:15:46 +01:00
|
|
|
}
|
2025-02-21 14:13:08 +01:00
|
|
|
|
2023-05-03 13:18:39 +02:00
|
|
|
if (!path_mounted(old_path))
|
2018-11-05 17:40:30 +00:00
|
|
|
goto out;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2018-11-05 17:40:30 +00:00
|
|
|
if (d_is_dir(new_path->dentry) !=
|
|
|
|
|
d_is_dir(old_path->dentry))
|
|
|
|
|
goto out;
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
mount beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenario is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
|
|
|
|
|
if (beneath) {
|
|
|
|
|
err = can_move_mount_beneath(old_path, new_path, mp);
|
|
|
|
|
if (err)
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
err = -EINVAL;
|
|
|
|
|
p = p->mnt_parent;
|
|
|
|
|
}
|
|
|
|
|
|
2005-11-07 17:21:20 -05:00
|
|
|
/*
|
|
|
|
|
* Don't move a mount tree containing unbindable mounts to a destination
|
|
|
|
|
* mount which is shared.
|
|
|
|
|
*/
|
2011-11-25 01:05:37 -05:00
|
|
|
if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
|
2018-11-05 17:40:30 +00:00
|
|
|
goto out;
|
2005-04-16 15:20:36 -07:00
|
|
|
err = -ELOOP;
|
2018-11-05 17:40:31 +00:00
|
|
|
if (!check_for_nsfs_mounts(old))
|
|
|
|
|
goto out;
|
2025-06-08 23:10:33 -04:00
|
|
|
if (mount_is_ancestor(old, p))
|
|
|
|
|
goto out;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2025-04-25 22:54:56 -04:00
|
|
|
err = attach_recursive_mnt(old, p, mp);
|
2005-04-16 15:20:36 -07:00
|
|
|
out:
|
2018-11-05 17:40:30 +00:00
|
|
|
unlock_mount(mp);
|
2018-11-05 17:40:31 +00:00
|
|
|
if (!err) {
|
2025-05-08 00:09:30 -04:00
|
|
|
if (!is_anon_ns(ns)) {
|
2019-06-30 19:18:53 -04:00
|
|
|
mntput_no_expire(parent);
|
2025-02-21 14:13:02 +01:00
|
|
|
} else {
|
|
|
|
|
/* Make sure we notice when we leak mounts. */
|
|
|
|
|
VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
|
2018-11-05 17:40:31 +00:00
|
|
|
free_mnt_ns(ns);
|
2025-02-21 14:13:02 +01:00
|
|
|
}
|
2018-11-05 17:40:31 +00:00
|
|
|
}
|
2018-11-05 17:40:30 +00:00
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int do_move_mount_old(struct path *path, const char *old_name)
|
|
|
|
|
{
|
|
|
|
|
struct path old_path;
|
|
|
|
|
int err;
|
|
|
|
|
|
|
|
|
|
if (!old_name || !*old_name)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
|
|
|
|
|
if (err)
|
|
|
|
|
return err;
|
|
|
|
|
|
2025-02-21 14:13:07 +01:00
|
|
|
err = do_move_mount(&old_path, path, 0);
|
2008-08-02 00:51:11 -04:00
|
|
|
path_put(&old_path);
|
2005-04-16 15:20:36 -07:00
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
2011-03-17 22:08:28 -04:00
|
|
|
/*
|
|
|
|
|
* add a mount into a namespace's mount tree
|
|
|
|
|
*/
|
2020-01-11 10:14:09 -05:00
|
|
|
static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
|
2021-06-18 20:27:57 -04:00
|
|
|
const struct path *path, int mnt_flags)
|
2011-03-17 22:08:28 -04:00
|
|
|
{
|
2020-01-11 10:14:09 -05:00
|
|
|
struct mount *parent = real_mount(path->mnt);
|
2011-03-17 22:08:28 -04:00
|
|
|
|
smarter propagate_mnt()
The current mainline has copies propagated to *all* nodes, then
tears down the copies we made for nodes that do not contain
counterparts of the desired mountpoint. That sets the right
propagation graph for the copies (at teardown time we move
the slaves of removed node to a surviving peer or directly
to master), but we end up paying a fairly steep price in
useless allocations. It's fairly easy to create a situation
where N calls of mount(2) create exactly N bindings, with
O(N^2) vfsmounts allocated and freed in process.
Fortunately, it is possible to avoid those allocations/freeings.
The trick is to create copies in the right order and find which
one would've eventually become a master with the current algorithm.
It turns out to be possible in O(nodes getting propagation) time
and with no extra allocations at all.
One part is that we need to make sure that eventual master will be
created before its slaves, so we need to walk the propagation
tree in a different order - by peer groups. And iterate through
the peers before dealing with the next group.
Another thing is finding the (earlier) copy that will be a master
of one we are about to create; to do that we are (temporary) marking
the masters of mountpoints we are attaching the copies to.
Either we are in a peer of the last mountpoint we'd dealt with,
or we have the following situation: we are attaching to mountpoint M,
the last copy S_0 had been attached to M_0 and there are sequences
S_0...S_n, M_0...M_n such that S_{i+1} is a master of S_{i},
S_{i} mounted on M{i} and we need to create a slave of the first S_{k}
such that M is getting propagation from M_{k}. It means that the master
of M_{k} will be among the sequence of masters of M. On the
other hand, the nearest marked node in that sequence will either
be the master of M_{k} or the master of M_{k-1} (the latter -
in the case if M_{k-1} is a slave of something M gets propagation
from, but in a wrong peer group).
So we go through the sequence of masters of M until we find
a marked one (P). Let N be the one before it. Then we go through
the sequence of masters of S_0 until we find one (say, S) mounted
on a node D that has P as master and check if D is a peer of N.
If it is, S will be the master of new copy, if not - the master of S
will be.
That's it for the hard part; the rest is fairly simple. Iterator
is in next_group(), handling of one prospective mountpoint is
propagate_one().
It seems to survive all tests and gives a noticably better performance
than the current mainline for setups that are seriously using shared
subtrees.
Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-02-27 09:35:45 -05:00
|
|
|
mnt_flags &= ~MNT_INTERNAL_FLAGS;
|
2011-03-17 22:08:28 -04:00
|
|
|
|
2013-03-15 10:53:28 -04:00
|
|
|
if (unlikely(!check_mnt(parent))) {
|
2012-09-21 08:19:02 -04:00
|
|
|
/* that's acceptable only for automounts done in private ns */
|
|
|
|
|
if (!(mnt_flags & MNT_SHRINKABLE))
|
2020-01-11 10:14:09 -05:00
|
|
|
return -EINVAL;
|
2012-09-21 08:19:02 -04:00
|
|
|
/* ... and for those we'd better have mountpoint still alive */
|
2013-03-15 10:53:28 -04:00
|
|
|
if (!parent->mnt_ns)
|
2020-01-11 10:14:09 -05:00
|
|
|
return -EINVAL;
|
2012-09-21 08:19:02 -04:00
|
|
|
}
|
2011-03-17 22:08:28 -04:00
|
|
|
|
|
|
|
|
/* Refuse the same filesystem on the same mount point */
|
2023-05-03 13:18:39 +02:00
|
|
|
if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
|
2020-01-11 10:14:09 -05:00
|
|
|
return -EBUSY;
|
2011-03-17 22:08:28 -04:00
|
|
|
|
VFS: (Scripted) Convert S_ISLNK/DIR/REG(dentry->d_inode) to d_is_*(dentry)
Convert the following where appropriate:
(1) S_ISLNK(dentry->d_inode) to d_is_symlink(dentry).
(2) S_ISREG(dentry->d_inode) to d_is_reg(dentry).
(3) S_ISDIR(dentry->d_inode) to d_is_dir(dentry). This is actually more
complicated than it appears as some calls should be converted to
d_can_lookup() instead. The difference is whether the directory in
question is a real dir with a ->lookup op or whether it's a fake dir with
a ->d_automount op.
In some circumstances, we can subsume checks for dentry->d_inode not being
NULL into this, provided we the code isn't in a filesystem that expects
d_inode to be NULL if the dirent really *is* negative (ie. if we're going to
use d_inode() rather than d_backing_inode() to get the inode pointer).
Note that the dentry type field may be set to something other than
DCACHE_MISS_TYPE when d_inode is NULL in the case of unionmount, where the VFS
manages the fall-through from a negative dentry to a lower layer. In such a
case, the dentry type of the negative union dentry is set to the same as the
type of the lower dentry.
However, if you know d_inode is not NULL at the call site, then you can use
the d_is_xxx() functions even in a filesystem.
There is one further complication: a 0,0 chardev dentry may be labelled
DCACHE_WHITEOUT_TYPE rather than DCACHE_SPECIAL_TYPE. Strictly, this was
intended for special directory entry types that don't have attached inodes.
The following perl+coccinelle script was used:
use strict;
my @callers;
open($fd, 'git grep -l \'S_IS[A-Z].*->d_inode\' |') ||
die "Can't grep for S_ISDIR and co. callers";
@callers = <$fd>;
close($fd);
unless (@callers) {
print "No matches\n";
exit(0);
}
my @cocci = (
'@@',
'expression E;',
'@@',
'',
'- S_ISLNK(E->d_inode->i_mode)',
'+ d_is_symlink(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISDIR(E->d_inode->i_mode)',
'+ d_is_dir(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISREG(E->d_inode->i_mode)',
'+ d_is_reg(E)' );
my $coccifile = "tmp.sp.cocci";
open($fd, ">$coccifile") || die $coccifile;
print($fd "$_\n") || die $coccifile foreach (@cocci);
close($fd);
foreach my $file (@callers) {
chomp $file;
print "Processing ", $file, "\n";
system("spatch", "--sp-file", $coccifile, $file, "--in-place", "--no-show-diff") == 0 ||
die "spatch failed";
}
[AV: overlayfs parts skipped]
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-01-29 12:02:35 +00:00
|
|
|
if (d_is_symlink(newmnt->mnt.mnt_root))
|
2020-01-11 10:14:09 -05:00
|
|
|
return -EINVAL;
|
2011-03-17 22:08:28 -04:00
|
|
|
|
2011-11-25 00:30:56 -05:00
|
|
|
newmnt->mnt.mnt_flags = mnt_flags;
|
2020-01-11 10:14:09 -05:00
|
|
|
return graft_tree(newmnt, parent, mp);
|
2011-03-17 22:08:28 -04:00
|
|
|
}
|
2011-01-17 01:47:59 -05:00
|
|
|
|
2018-11-04 07:43:08 -05:00
|
|
|
/* Forward declaration: do_new_mount_fc() below calls this before its definition. */
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Create a new mount using a superblock configuration and request it
|
|
|
|
|
* be added to the namespace tree.
|
|
|
|
|
*/
|
|
|
|
|
static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
|
|
|
|
|
unsigned int mnt_flags)
|
|
|
|
|
{
|
|
|
|
|
struct vfsmount *mnt;
|
2020-01-11 10:14:09 -05:00
|
|
|
struct mountpoint *mp;
|
2018-11-04 07:43:08 -05:00
|
|
|
struct super_block *sb = fc->root->d_sb;
|
|
|
|
|
int error;
|
|
|
|
|
|
2018-12-20 15:04:50 -05:00
|
|
|
error = security_sb_kern_mount(sb);
|
|
|
|
|
if (!error && mount_too_revealing(sb, &mnt_flags))
|
|
|
|
|
error = -EPERM;
|
|
|
|
|
|
|
|
|
|
if (unlikely(error)) {
|
|
|
|
|
fc_drop_locked(fc);
|
|
|
|
|
return error;
|
2018-11-04 07:43:08 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
up_write(&sb->s_umount);
|
|
|
|
|
|
|
|
|
|
mnt = vfs_create_mount(fc);
|
|
|
|
|
if (IS_ERR(mnt))
|
|
|
|
|
return PTR_ERR(mnt);
|
|
|
|
|
|
2019-04-15 14:17:12 -07:00
|
|
|
mnt_warn_timestamp_expiry(mountpoint, mnt);
|
|
|
|
|
|
2020-01-11 10:14:09 -05:00
|
|
|
mp = lock_mount(mountpoint);
|
|
|
|
|
if (IS_ERR(mp)) {
|
|
|
|
|
mntput(mnt);
|
|
|
|
|
return PTR_ERR(mp);
|
|
|
|
|
}
|
|
|
|
|
error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
|
|
|
|
|
unlock_mount(mp);
|
2019-10-16 19:48:14 -07:00
|
|
|
if (error < 0)
|
|
|
|
|
mntput(mnt);
|
2018-11-04 07:43:08 -05:00
|
|
|
return error;
|
|
|
|
|
}
|
2015-05-08 23:22:29 -05:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
|
* create a new mount for userspace and request it to be added into the
|
|
|
|
|
* namespace's tree
|
|
|
|
|
*/
|
2017-07-17 08:45:35 +01:00
|
|
|
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
|
2012-10-11 11:42:01 -04:00
|
|
|
int mnt_flags, const char *name, void *data)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2012-07-26 21:42:03 -07:00
|
|
|
struct file_system_type *type;
|
2018-11-04 07:18:51 -05:00
|
|
|
struct fs_context *fc;
|
|
|
|
|
const char *subtype = NULL;
|
|
|
|
|
int err = 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2012-07-26 21:42:03 -07:00
|
|
|
if (!fstype)
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
2012-07-26 21:42:03 -07:00
|
|
|
type = get_fs_type(fstype);
|
|
|
|
|
if (!type)
|
|
|
|
|
return -ENODEV;
|
|
|
|
|
|
2018-11-04 07:18:51 -05:00
|
|
|
if (type->fs_flags & FS_HAS_SUBTYPE) {
|
|
|
|
|
subtype = strchr(fstype, '.');
|
|
|
|
|
if (subtype) {
|
|
|
|
|
subtype++;
|
|
|
|
|
if (!*subtype) {
|
|
|
|
|
put_filesystem(type);
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2012-07-26 21:42:03 -07:00
|
|
|
|
2018-11-04 07:18:51 -05:00
|
|
|
fc = fs_context_for_mount(type, sb_flags);
|
2012-07-26 21:42:03 -07:00
|
|
|
put_filesystem(type);
|
2018-11-04 07:18:51 -05:00
|
|
|
if (IS_ERR(fc))
|
|
|
|
|
return PTR_ERR(fc);
|
|
|
|
|
|
2023-11-22 12:17:37 -05:00
|
|
|
/*
|
|
|
|
|
* Indicate to the filesystem that the mount request is coming
|
|
|
|
|
* from the legacy mount system call.
|
|
|
|
|
*/
|
|
|
|
|
fc->oldapi = true;
|
|
|
|
|
|
2018-11-01 23:07:25 +00:00
|
|
|
if (subtype)
|
|
|
|
|
err = vfs_parse_fs_string(fc, "subtype",
|
|
|
|
|
subtype, strlen(subtype));
|
|
|
|
|
if (!err && name)
|
|
|
|
|
err = vfs_parse_fs_string(fc, "source", name, strlen(name));
|
2018-11-04 07:18:51 -05:00
|
|
|
if (!err)
|
|
|
|
|
err = parse_monolithic_mount_data(fc, data);
|
2019-05-13 12:57:22 -04:00
|
|
|
if (!err && !mount_capable(fc))
|
|
|
|
|
err = -EPERM;
|
2018-11-04 07:18:51 -05:00
|
|
|
if (!err)
|
|
|
|
|
err = vfs_get_tree(fc);
|
2018-11-04 07:43:08 -05:00
|
|
|
if (!err)
|
|
|
|
|
err = do_new_mount_fc(fc, path, mnt_flags);
|
2016-06-09 16:06:06 -05:00
|
|
|
|
2018-11-04 07:18:51 -05:00
|
|
|
put_fs_context(fc);
|
2011-01-17 01:41:58 -05:00
|
|
|
return err;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2021-06-18 20:27:57 -04:00
|
|
|
int finish_automount(struct vfsmount *m, const struct path *path)
|
2011-01-17 01:35:23 -05:00
|
|
|
{
|
fix automount/automount race properly
Protection against automount/automount races (two threads hitting the same
referral point at the same time) is based upon do_add_mount() prevention of
identical overmounts - trying to overmount the root of mounted tree with
the same tree fails with -EBUSY. It's unreliable (the other thread might've
mounted something on top of the automount it has triggered) *and* causes
no end of headache for follow_automount() and its caller, since
finish_automount() behaves like do_new_mount() - if the mountpoint to be is
overmounted, it mounts on top what's overmounting it. It's not only wrong
(we want to go into what's overmounting the automount point and quietly
discard what we planned to mount there), it introduces the possibility of
original parent mount getting dropped. That's what 8aef18845266 (VFS: Fix
vfsmount overput on simultaneous automount) deals with, but it can't do
anything about the reliability of conflict detection - if something had
been overmounted the other thread's automount (e.g. that other thread
having stepped into automount in mount(2)), we don't get that -EBUSY and
the result is
referral point under automounted NFS under explicit overmount
under another copy of automounted NFS
What we need is finish_automount() *NOT* digging into overmounts - if it
finds one, it should just quietly discard the thing it was asked to mount.
And don't bother with actually crossing into the results of finish_automount() -
the same loop that calls follow_automount() will do that just fine on the
next iteration.
IOW, instead of calling lock_mount() have finish_automount() do it manually,
_without_ the "move into overmount and retry" part. And leave crossing into
the results to the caller of follow_automount(), which simplifies it a lot.
Moral: if you end up with a lot of glue working around the calling conventions
of something, perhaps these calling conventions are simply wrong...
Fixes: 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-01-11 10:44:29 -05:00
|
|
|
struct dentry *dentry = path->dentry;
|
2020-01-11 10:14:09 -05:00
|
|
|
struct mountpoint *mp;
|
2020-01-11 11:27:46 -05:00
|
|
|
struct mount *mnt;
|
2011-01-17 01:35:23 -05:00
|
|
|
int err;
|
2020-01-11 11:27:46 -05:00
|
|
|
|
|
|
|
|
if (!m)
|
|
|
|
|
return 0;
|
|
|
|
|
if (IS_ERR(m))
|
|
|
|
|
return PTR_ERR(m);
|
|
|
|
|
|
|
|
|
|
mnt = real_mount(m);
|
2011-01-17 01:35:23 -05:00
|
|
|
|
|
|
|
|
if (m->mnt_sb == path->mnt->mnt_sb &&
|
fix automount/automount race properly
Protection against automount/automount races (two threads hitting the same
referral point at the same time) is based upon do_add_mount() prevention of
identical overmounts - trying to overmount the root of mounted tree with
the same tree fails with -EBUSY. It's unreliable (the other thread might've
mounted something on top of the automount it has triggered) *and* causes
no end of headache for follow_automount() and its caller, since
finish_automount() behaves like do_new_mount() - if the mountpoint to be is
overmounted, it mounts on top what's overmounting it. It's not only wrong
(we want to go into what's overmounting the automount point and quietly
discard what we planned to mount there), it introduces the possibility of
original parent mount getting dropped. That's what 8aef18845266 (VFS: Fix
vfsmount overput on simultaneous automount) deals with, but it can't do
anything about the reliability of conflict detection - if something had
been overmounted the other thread's automount (e.g. that other thread
having stepped into automount in mount(2)), we don't get that -EBUSY and
the result is
referral point under automounted NFS under explicit overmount
under another copy of automounted NFS
What we need is finish_automount() *NOT* digging into overmounts - if it
finds one, it should just quietly discard the thing it was asked to mount.
And don't bother with actually crossing into the results of finish_automount() -
the same loop that calls follow_automount() will do that just fine on the
next iteration.
IOW, instead of calling lock_mount() have finish_automount() do it manually,
_without_ the "move into overmount and retry" part. And leave crossing into
the results to the caller of follow_automount(), which simplifies it a lot.
Moral: if you end up with a lot of glue working around the calling conventions
of something, perhaps these calling conventions are simply wrong...
Fixes: 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-01-11 10:44:29 -05:00
|
|
|
m->mnt_root == dentry) {
|
2011-01-17 01:47:59 -05:00
|
|
|
err = -ELOOP;
|
fix automount/automount race properly
Protection against automount/automount races (two threads hitting the same
referral point at the same time) is based upon do_add_mount() prevention of
identical overmounts - trying to overmount the root of mounted tree with
the same tree fails with -EBUSY. It's unreliable (the other thread might've
mounted something on top of the automount it has triggered) *and* causes
no end of headache for follow_automount() and its caller, since
finish_automount() behaves like do_new_mount() - if the mountpoint to be is
overmounted, it mounts on top what's overmounting it. It's not only wrong
(we want to go into what's overmounting the automount point and quietly
discard what we planned to mount there), it introduces the possibility of
original parent mount getting dropped. That's what 8aef18845266 (VFS: Fix
vfsmount overput on simultaneous automount) deals with, but it can't do
anything about the reliability of conflict detection - if something had
been overmounted the other thread's automount (e.g. that other thread
having stepped into automount in mount(2)), we don't get that -EBUSY and
the result is
referral point under automounted NFS under explicit overmount
under another copy of automounted NFS
What we need is finish_automount() *NOT* digging into overmounts - if it
finds one, it should just quietly discard the thing it was asked to mount.
And don't bother with actually crossing into the results of finish_automount() -
the same loop that calls follow_automount() will do that just fine on the
next iteration.
IOW, instead of calling lock_mount() have finish_automount() do it manually,
_without_ the "move into overmount and retry" part. And leave crossing into
the results to the caller of follow_automount(), which simplifies it a lot.
Moral: if you end up with a lot of glue working around the calling conventions
of something, perhaps these calling conventions are simply wrong...
Fixes: 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-01-11 10:44:29 -05:00
|
|
|
goto discard;
|
2011-01-17 01:35:23 -05:00
|
|
|
}
|
|
|
|
|
|
fix automount/automount race properly
Protection against automount/automount races (two threads hitting the same
referral point at the same time) is based upon do_add_mount() prevention of
identical overmounts - trying to overmount the root of mounted tree with
the same tree fails with -EBUSY. It's unreliable (the other thread might've
mounted something on top of the automount it has triggered) *and* causes
no end of headache for follow_automount() and its caller, since
finish_automount() behaves like do_new_mount() - if the mountpoint to be is
overmounted, it mounts on top what's overmounting it. It's not only wrong
(we want to go into what's overmounting the automount point and quietly
discard what we planned to mount there), it introduces the possibility of
original parent mount getting dropped. That's what 8aef18845266 (VFS: Fix
vfsmount overput on simultaneous automount) deals with, but it can't do
anything about the reliability of conflict detection - if something had
been overmounted the other thread's automount (e.g. that other thread
having stepped into automount in mount(2)), we don't get that -EBUSY and
the result is
referral point under automounted NFS under explicit overmount
under another copy of automounted NFS
What we need is finish_automount() *NOT* digging into overmounts - if it
finds one, it should just quietly discard the thing it was asked to mount.
And don't bother with actually crossing into the results of finish_automount() -
the same loop that calls follow_automount() will do that just fine on the
next iteration.
IOW, instead of calling lock_mount() have finish_automount() do it manually,
_without_ the "move into overmount and retry" part. And leave crossing into
the results to the caller of follow_automount(), which simplifies it a lot.
Moral: if you end up with a lot of glue working around the calling conventions
of something, perhaps these calling conventions are simply wrong...
Fixes: 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-01-11 10:44:29 -05:00
|
|
|
/*
|
|
|
|
|
* we don't want to use lock_mount() - in this case finding something
|
|
|
|
|
* that overmounts our mountpoint to be means "quietly drop what we've
|
|
|
|
|
* got", not "try to mount it on top".
|
|
|
|
|
*/
|
|
|
|
|
inode_lock(dentry->d_inode);
|
|
|
|
|
namespace_lock();
|
|
|
|
|
if (unlikely(cant_mount(dentry))) {
|
|
|
|
|
err = -ENOENT;
|
|
|
|
|
goto discard_locked;
|
|
|
|
|
}
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slightly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a mount
beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
if (path_overmounted(path)) {
|
fix automount/automount race properly
Protection against automount/automount races (two threads hitting the same
referral point at the same time) is based upon do_add_mount() prevention of
identical overmounts - trying to overmount the root of mounted tree with
the same tree fails with -EBUSY. It's unreliable (the other thread might've
mounted something on top of the automount it has triggered) *and* causes
no end of headache for follow_automount() and its caller, since
finish_automount() behaves like do_new_mount() - if the mountpoint to be is
overmounted, it mounts on top what's overmounting it. It's not only wrong
(we want to go into what's overmounting the automount point and quietly
discard what we planned to mount there), it introduces the possibility of
original parent mount getting dropped. That's what 8aef18845266 (VFS: Fix
vfsmount overput on simultaneous automount) deals with, but it can't do
anything about the reliability of conflict detection - if something had
been overmounted the other thread's automount (e.g. that other thread
having stepped into automount in mount(2)), we don't get that -EBUSY and
the result is
referral point under automounted NFS under explicit overmount
under another copy of automounted NFS
What we need is finish_automount() *NOT* digging into overmounts - if it
finds one, it should just quietly discard the thing it was asked to mount.
And don't bother with actually crossing into the results of finish_automount() -
the same loop that calls follow_automount() will do that just fine on the
next iteration.
IOW, instead of calling lock_mount() have finish_automount() do it manually,
_without_ the "move into overmount and retry" part. And leave crossing into
the results to the caller of follow_automount(), which simplifies it a lot.
Moral: if you end up with a lot of glue working around the calling conventions
of something, perhaps these calling conventions are simply wrong...
Fixes: 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-01-11 10:44:29 -05:00
|
|
|
err = 0;
|
|
|
|
|
goto discard_locked;
|
|
|
|
|
}
|
|
|
|
|
mp = get_mountpoint(dentry);
|
2020-01-11 10:14:09 -05:00
|
|
|
if (IS_ERR(mp)) {
|
|
|
|
|
err = PTR_ERR(mp);
|
fix automount/automount race properly
Protection against automount/automount races (two threads hitting the same
referral point at the same time) is based upon do_add_mount() prevention of
identical overmounts - trying to overmount the root of mounted tree with
the same tree fails with -EBUSY. It's unreliable (the other thread might've
mounted something on top of the automount it has triggered) *and* causes
no end of headache for follow_automount() and its caller, since
finish_automount() behaves like do_new_mount() - if the mountpoint to be is
overmounted, it mounts on top what's overmounting it. It's not only wrong
(we want to go into what's overmounting the automount point and quietly
discard what we planned to mount there), it introduces the possibility of
original parent mount getting dropped. That's what 8aef18845266 (VFS: Fix
vfsmount overput on simultaneous automount) deals with, but it can't do
anything about the reliability of conflict detection - if something had
been overmounted the other thread's automount (e.g. that other thread
having stepped into automount in mount(2)), we don't get that -EBUSY and
the result is
referral point under automounted NFS under explicit overmount
under another copy of automounted NFS
What we need is finish_automount() *NOT* digging into overmounts - if it
finds one, it should just quietly discard the thing it was asked to mount.
And don't bother with actually crossing into the results of finish_automount() -
the same loop that calls follow_automount() will do that just fine on the
next iteration.
IOW, instead of calling lock_mount() have finish_automount() do it manually,
_without_ the "move into overmount and retry" part. And leave crossing into
the results to the caller of follow_automount(), which simplifies it a lot.
Moral: if you end up with a lot of glue working around the calling conventions
of something, perhaps these calling conventions are simply wrong...
Fixes: 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-01-11 10:44:29 -05:00
|
|
|
goto discard_locked;
|
2020-01-11 10:14:09 -05:00
|
|
|
}
|
fix automount/automount race properly
Protection against automount/automount races (two threads hitting the same
referral point at the same time) is based upon do_add_mount() prevention of
identical overmounts - trying to overmount the root of mounted tree with
the same tree fails with -EBUSY. It's unreliable (the other thread might've
mounted something on top of the automount it has triggered) *and* causes
no end of headache for follow_automount() and its caller, since
finish_automount() behaves like do_new_mount() - if the mountpoint to be is
overmounted, it mounts on top what's overmounting it. It's not only wrong
(we want to go into what's overmounting the automount point and quietly
discard what we planned to mount there), it introduces the possibility of
original parent mount getting dropped. That's what 8aef18845266 (VFS: Fix
vfsmount overput on simultaneous automount) deals with, but it can't do
anything about the reliability of conflict detection - if something had
been overmounted the other thread's automount (e.g. that other thread
having stepped into automount in mount(2)), we don't get that -EBUSY and
the result is
referral point under automounted NFS under explicit overmount
under another copy of automounted NFS
What we need is finish_automount() *NOT* digging into overmounts - if it
finds one, it should just quietly discard the thing it was asked to mount.
And don't bother with actually crossing into the results of finish_automount() -
the same loop that calls follow_automount() will do that just fine on the
next iteration.
IOW, instead of calling lock_mount() have finish_automount() do it manually,
_without_ the "move into overmount and retry" part. And leave crossing into
the results to the caller of follow_automount(), which simplifies it a lot.
Moral: if you end up with a lot of glue working around the calling conventions
of something, perhaps these calling conventions are simply wrong...
Fixes: 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-01-11 10:44:29 -05:00
|
|
|
|
2020-01-11 10:14:09 -05:00
|
|
|
err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
|
|
|
|
|
unlock_mount(mp);
|
fix automount/automount race properly
Protection against automount/automount races (two threads hitting the same
referral point at the same time) is based upon do_add_mount() prevention of
identical overmounts - trying to overmount the root of mounted tree with
the same tree fails with -EBUSY. It's unreliable (the other thread might've
mounted something on top of the automount it has triggered) *and* causes
no end of headache for follow_automount() and its caller, since
finish_automount() behaves like do_new_mount() - if the mountpoint to be is
overmounted, it mounts on top what's overmounting it. It's not only wrong
(we want to go into what's overmounting the automount point and quietly
discard what we planned to mount there), it introduces the possibility of
original parent mount getting dropped. That's what 8aef18845266 (VFS: Fix
vfsmount overput on simultaneous automount) deals with, but it can't do
anything about the reliability of conflict detection - if something had
been overmounted the other thread's automount (e.g. that other thread
having stepped into automount in mount(2)), we don't get that -EBUSY and
the result is
referral point under automounted NFS under explicit overmount
under another copy of automounted NFS
What we need is finish_automount() *NOT* digging into overmounts - if it
finds one, it should just quietly discard the thing it was asked to mount.
And don't bother with actually crossing into the results of finish_automount() -
the same loop that calls follow_automount() will do that just fine on the
next iteration.
IOW, instead of calling lock_mount() have finish_automount() do it manually,
_without_ the "move into overmount and retry" part. And leave crossing into
the results to the caller of follow_automount(), which simplifies it a lot.
Moral: if you end up with a lot of glue working around the calling conventions
of something, perhaps these calling conventions are simply wrong...
Fixes: 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-01-11 10:44:29 -05:00
|
|
|
if (unlikely(err))
|
|
|
|
|
goto discard;
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
discard_locked:
|
|
|
|
|
namespace_unlock();
|
|
|
|
|
inode_unlock(dentry->d_inode);
|
|
|
|
|
discard:
|
2011-01-17 01:47:59 -05:00
|
|
|
mntput(m);
|
2011-01-17 01:35:23 -05:00
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
2011-01-14 19:10:03 +00:00
|
|
|
/**
|
|
|
|
|
* mnt_set_expiry - Put a mount on an expiration list
|
|
|
|
|
* @mnt: The mount to list.
|
|
|
|
|
* @expiry_list: The list to add the mount to.
|
|
|
|
|
*/
|
|
|
|
|
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
|
|
|
|
|
{
|
2025-05-01 20:40:57 -04:00
|
|
|
read_seqlock_excl(&mount_lock);
|
2011-11-25 00:22:05 -05:00
|
|
|
list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
|
2025-05-01 20:40:57 -04:00
|
|
|
read_sequnlock_excl(&mount_lock);
|
2011-01-14 19:10:03 +00:00
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(mnt_set_expiry);
|
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
|
* process a list of expirable mountpoints with the intent of discarding any
|
|
|
|
|
* mountpoints that aren't in use and haven't been touched since last we came
|
|
|
|
|
* here
|
|
|
|
|
*/
|
|
|
|
|
void mark_mounts_for_expiry(struct list_head *mounts)
|
|
|
|
|
{
|
2011-11-24 21:07:43 -05:00
|
|
|
struct mount *mnt, *next;
|
2005-04-16 15:20:36 -07:00
|
|
|
LIST_HEAD(graveyard);
|
|
|
|
|
|
|
|
|
|
if (list_empty(mounts))
|
|
|
|
|
return;
|
|
|
|
|
|
2013-03-16 15:12:40 -04:00
|
|
|
namespace_lock();
|
2013-09-29 11:24:49 -04:00
|
|
|
lock_mount_hash();
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
|
/* extract from the expiration list every vfsmount that matches the
|
|
|
|
|
* following criteria:
|
2025-04-24 01:45:05 -04:00
|
|
|
* - already mounted
|
2005-04-16 15:20:36 -07:00
|
|
|
* - only referenced by its parent vfsmount
|
|
|
|
|
* - still marked for expiry (marked on the last call here; marks are
|
|
|
|
|
* cleared by mntput())
|
|
|
|
|
*/
|
2011-11-25 00:22:05 -05:00
|
|
|
list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
|
2025-04-24 01:45:05 -04:00
|
|
|
if (!is_mounted(&mnt->mnt))
|
|
|
|
|
continue;
|
2011-11-25 00:57:42 -05:00
|
|
|
if (!xchg(&mnt->mnt_expiry_mark, 1) ||
|
2011-11-24 21:35:16 -05:00
|
|
|
propagate_mount_busy(mnt, 1))
|
2005-04-16 15:20:36 -07:00
|
|
|
continue;
|
2011-11-25 00:22:05 -05:00
|
|
|
list_move(&mnt->mnt_expire, &graveyard);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2008-03-22 00:21:53 -04:00
|
|
|
while (!list_empty(&graveyard)) {
|
2011-11-25 00:22:05 -05:00
|
|
|
mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
|
2011-11-25 00:46:35 -05:00
|
|
|
touch_mnt_namespace(mnt->mnt_ns);
|
2014-12-24 07:20:01 -06:00
|
|
|
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
|
2008-03-22 00:21:53 -04:00
|
|
|
}
|
2013-09-29 11:24:49 -04:00
|
|
|
unlock_mount_hash();
|
2013-03-16 14:42:19 -04:00
|
|
|
namespace_unlock();
|
2006-06-09 09:34:17 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Ripoff of 'select_parent()'
|
|
|
|
|
*
|
|
|
|
|
* search the list of submounts for a given mountpoint, and move any
|
|
|
|
|
* shrinkable submounts to the 'graveyard' list.
|
|
|
|
|
*/
|
2011-11-24 21:15:14 -05:00
|
|
|
static int select_submounts(struct mount *parent, struct list_head *graveyard)
|
2006-06-09 09:34:17 -04:00
|
|
|
{
|
2011-11-24 21:15:14 -05:00
|
|
|
struct mount *this_parent = parent;
|
2006-06-09 09:34:17 -04:00
|
|
|
struct list_head *next;
|
|
|
|
|
int found = 0;
|
|
|
|
|
|
|
|
|
|
repeat:
|
2011-11-24 23:24:33 -05:00
|
|
|
next = this_parent->mnt_mounts.next;
|
2006-06-09 09:34:17 -04:00
|
|
|
resume:
|
2011-11-24 23:24:33 -05:00
|
|
|
while (next != &this_parent->mnt_mounts) {
|
2006-06-09 09:34:17 -04:00
|
|
|
struct list_head *tmp = next;
|
2011-11-24 23:24:33 -05:00
|
|
|
struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
|
2006-06-09 09:34:17 -04:00
|
|
|
|
|
|
|
|
next = tmp->next;
|
2011-11-24 21:15:14 -05:00
|
|
|
if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
|
2005-04-16 15:20:36 -07:00
|
|
|
continue;
|
2006-06-09 09:34:17 -04:00
|
|
|
/*
|
|
|
|
|
* Descend a level if the d_mounts list is non-empty.
|
|
|
|
|
*/
|
2011-11-24 23:24:33 -05:00
|
|
|
if (!list_empty(&mnt->mnt_mounts)) {
|
2006-06-09 09:34:17 -04:00
|
|
|
this_parent = mnt;
|
|
|
|
|
goto repeat;
|
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2011-11-24 21:35:16 -05:00
|
|
|
if (!propagate_mount_busy(mnt, 1)) {
|
2011-11-25 00:22:05 -05:00
|
|
|
list_move_tail(&mnt->mnt_expire, graveyard);
|
2006-06-09 09:34:17 -04:00
|
|
|
found++;
|
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2006-06-09 09:34:17 -04:00
|
|
|
/*
|
|
|
|
|
* All done at this level ... ascend and resume the search
|
|
|
|
|
*/
|
|
|
|
|
if (this_parent != parent) {
|
2011-11-24 23:24:33 -05:00
|
|
|
next = this_parent->mnt_child.next;
|
2011-11-24 22:19:58 -05:00
|
|
|
this_parent = this_parent->mnt_parent;
|
2006-06-09 09:34:17 -04:00
|
|
|
goto resume;
|
|
|
|
|
}
|
|
|
|
|
return found;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* process a list of expirable mountpoints with the intent of discarding any
|
|
|
|
|
* submounts of a specific parent mountpoint
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability is not not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00
|
|
|
*
|
2013-09-29 22:06:07 -04:00
|
|
|
* mount_lock must be held for write
|
2006-06-09 09:34:17 -04:00
|
|
|
*/
|
2013-03-16 14:39:34 -04:00
|
|
|
static void shrink_submounts(struct mount *mnt)
|
2006-06-09 09:34:17 -04:00
|
|
|
{
|
|
|
|
|
LIST_HEAD(graveyard);
|
2011-11-24 21:07:43 -05:00
|
|
|
struct mount *m;
|
2006-06-09 09:34:17 -04:00
|
|
|
|
|
|
|
|
/* extract submounts of 'mountpoint' from the expiration list */
|
2008-03-22 00:46:23 -04:00
|
|
|
while (select_submounts(mnt, &graveyard)) {
|
2008-03-22 00:21:53 -04:00
|
|
|
while (!list_empty(&graveyard)) {
|
2011-11-24 21:07:43 -05:00
|
|
|
m = list_first_entry(&graveyard, struct mount,
|
2011-11-25 00:22:05 -05:00
|
|
|
mnt_expire);
|
2011-11-25 00:46:35 -05:00
|
|
|
touch_mnt_namespace(m->mnt_ns);
|
2014-12-24 07:20:01 -06:00
|
|
|
umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
|
2008-03-22 00:21:53 -04:00
|
|
|
}
|
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2020-09-17 10:22:34 +02:00
|
|
|
/*
 * Copy up to PAGE_SIZE bytes of mount options from user space.
 *
 * Returns NULL when @data is NULL, ERR_PTR(-ENOMEM) when the buffer cannot
 * be allocated and ERR_PTR(-EFAULT) when not a single byte could be read.
 * Otherwise returns a kmalloc()ed PAGE_SIZE buffer holding as many leading
 * bytes as were readable; the caller releases it with kfree().
 */
static void *copy_mount_options(const void __user * data)
{
	const char __user *src = data;
	char *page;
	unsigned remaining, pos;

	if (!data)
		return NULL;

	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!page)
		return ERR_PTR(-ENOMEM);

	remaining = copy_from_user(page, data, PAGE_SIZE);

	/*
	 * Not all architectures have an exact copy_from_user(). Resort to
	 * byte at a time for whatever it left uncopied, stopping at the
	 * first byte that faults.
	 */
	for (pos = PAGE_SIZE - remaining; remaining; pos++, remaining--) {
		char c;

		if (get_user(c, src + pos))
			break;
		page[pos] = c;
	}

	/* Nothing at all was readable. */
	if (remaining == PAGE_SIZE) {
		kfree(page);
		return ERR_PTR(-EFAULT);
	}

	return page;
}
|
|
|
|
|
|
2020-09-17 10:22:34 +02:00
|
|
|
/*
 * Duplicate a user-space string argument of mount(2) with strndup_user(),
 * bounded by PATH_MAX.  A NULL @data yields NULL; otherwise the result of
 * strndup_user() is returned unchanged.
 */
static char *copy_mount_string(const void __user *data)
{
	if (!data)
		return NULL;

	return strndup_user(data, PATH_MAX);
}
|
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
|
* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
|
|
|
|
|
* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
|
|
|
|
|
*
|
|
|
|
|
* data is a (void *) that can point to any structure up to
|
|
|
|
|
* PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
|
|
|
|
|
* information (or be NULL).
|
|
|
|
|
*
|
|
|
|
|
* Pre-0.97 versions of mount() didn't have a flags word.
|
|
|
|
|
* When the flags word was introduced its top half was required
|
|
|
|
|
* to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
|
|
|
|
|
* Therefore, if this magic number is present, it carries no information
|
|
|
|
|
* and must be discarded.
|
|
|
|
|
*/
|
2020-07-21 11:12:08 +02:00
|
|
|
int path_mount(const char *dev_name, struct path *path,
|
2012-10-11 11:42:01 -04:00
|
|
|
const char *type_page, unsigned long flags, void *data_page)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2017-07-17 08:45:35 +01:00
|
|
|
unsigned int mnt_flags = 0, sb_flags;
|
2020-06-04 09:41:08 +02:00
|
|
|
int ret;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
|
/* Discard magic */
|
|
|
|
|
if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
|
|
|
|
|
flags &= ~MS_MGC_MSK;
|
|
|
|
|
|
|
|
|
|
/* Basic sanity checks */
|
|
|
|
|
if (data_page)
|
|
|
|
|
((char *)data_page)[PAGE_SIZE - 1] = 0;
|
|
|
|
|
|
2017-07-17 08:45:35 +01:00
|
|
|
if (flags & MS_NOUSER)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
2020-06-04 09:41:08 +02:00
|
|
|
ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
if (!may_mount())
|
|
|
|
|
return -EPERM;
|
2021-08-19 14:56:38 -04:00
|
|
|
if (flags & SB_MANDLOCK)
|
|
|
|
|
warn_mandlock();
|
2009-10-04 21:49:49 +09:00
|
|
|
|
2009-04-19 18:40:43 +02:00
|
|
|
/* Default to relatime unless overriden */
|
|
|
|
|
if (!(flags & MS_NOATIME))
|
|
|
|
|
mnt_flags |= MNT_RELATIME;
|
2009-03-26 17:53:14 +00:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/* Separate the per-mountpoint flags */
|
|
|
|
|
if (flags & MS_NOSUID)
|
|
|
|
|
mnt_flags |= MNT_NOSUID;
|
|
|
|
|
if (flags & MS_NODEV)
|
|
|
|
|
mnt_flags |= MNT_NODEV;
|
|
|
|
|
if (flags & MS_NOEXEC)
|
|
|
|
|
mnt_flags |= MNT_NOEXEC;
|
2006-01-09 20:52:17 -08:00
|
|
|
if (flags & MS_NOATIME)
|
|
|
|
|
mnt_flags |= MNT_NOATIME;
|
|
|
|
|
if (flags & MS_NODIRATIME)
|
|
|
|
|
mnt_flags |= MNT_NODIRATIME;
|
2009-03-26 17:49:56 +00:00
|
|
|
if (flags & MS_STRICTATIME)
|
|
|
|
|
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
|
2018-04-20 13:35:02 +01:00
|
|
|
if (flags & MS_RDONLY)
|
2008-02-15 14:38:00 -08:00
|
|
|
mnt_flags |= MNT_READONLY;
|
2020-08-27 11:09:46 -06:00
|
|
|
if (flags & MS_NOSYMFOLLOW)
|
|
|
|
|
mnt_flags |= MNT_NOSYMFOLLOW;
|
2006-01-09 20:52:17 -08:00
|
|
|
|
2014-07-28 17:36:04 -07:00
|
|
|
/* The default atime for remount is preservation */
|
|
|
|
|
if ((flags & MS_REMOUNT) &&
|
|
|
|
|
((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
|
|
|
|
|
MS_STRICTATIME)) == 0)) {
|
|
|
|
|
mnt_flags &= ~MNT_ATIME_MASK;
|
2020-06-04 09:41:08 +02:00
|
|
|
mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
|
2014-07-28 17:36:04 -07:00
|
|
|
}
|
|
|
|
|
|
2017-07-17 08:45:35 +01:00
|
|
|
sb_flags = flags & (SB_RDONLY |
|
|
|
|
|
SB_SYNCHRONOUS |
|
|
|
|
|
SB_MANDLOCK |
|
|
|
|
|
SB_DIRSYNC |
|
|
|
|
|
SB_SILENT |
|
2017-10-08 00:28:21 -04:00
|
|
|
SB_POSIXACL |
|
2017-10-11 07:01:31 +02:00
|
|
|
SB_LAZYTIME |
|
2017-10-08 00:28:21 -04:00
|
|
|
SB_I_VERSION);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2018-11-01 23:07:25 +00:00
|
|
|
if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
|
2020-06-04 09:41:08 +02:00
|
|
|
return do_reconfigure_mnt(path, mnt_flags);
|
|
|
|
|
if (flags & MS_REMOUNT)
|
|
|
|
|
return do_remount(path, flags, sb_flags, mnt_flags, data_page);
|
|
|
|
|
if (flags & MS_BIND)
|
|
|
|
|
return do_loopback(path, dev_name, flags & MS_REC);
|
|
|
|
|
if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
|
|
|
|
|
return do_change_type(path, flags);
|
|
|
|
|
if (flags & MS_MOVE)
|
|
|
|
|
return do_move_mount_old(path, dev_name);
|
|
|
|
|
|
|
|
|
|
return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
|
|
|
|
|
data_page);
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-14 00:14:00 +09:00
|
|
|
/*
 * Resolve the user-supplied mountpoint @dir_name (following symlinks,
 * relative to AT_FDCWD) and hand the request to path_mount().
 *
 * Returns the lookup error if the path cannot be resolved, otherwise the
 * result of path_mount().  The path reference obtained by the lookup is
 * dropped before returning.
 */
int do_mount(const char *dev_name, const char __user *dir_name,
		const char *type_page, unsigned long flags, void *data_page)
{
	struct path where;
	int err;

	err = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &where);
	if (!err) {
		err = path_mount(dev_name, &where, type_page, flags, data_page);
		path_put(&where);
	}
	return err;
}
|
|
|
|
|
|
2016-08-08 14:37:37 -05:00
|
|
|
static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
|
|
|
|
|
{
|
|
|
|
|
return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void dec_mnt_namespaces(struct ucounts *ucounts)
|
|
|
|
|
{
|
|
|
|
|
dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
|
|
|
|
|
}
|
|
|
|
|
|
2012-07-26 21:08:32 -07:00
|
|
|
static void free_mnt_ns(struct mnt_namespace *ns)
|
|
|
|
|
{
|
2019-01-30 13:30:21 -05:00
|
|
|
if (!is_anon_ns(ns))
|
|
|
|
|
ns_free_inum(&ns->ns);
|
2016-08-08 14:37:37 -05:00
|
|
|
dec_mnt_namespaces(ns->ucounts);
|
2024-06-24 11:49:46 -04:00
|
|
|
mnt_ns_tree_remove(ns);
|
2012-07-26 21:08:32 -07:00
|
|
|
}
|
|
|
|
|
|
2010-03-07 18:49:36 -08:00
|
|
|
/*
|
|
|
|
|
* Assign a sequence number so we can detect when we attempt to bind
|
|
|
|
|
* mount a reference to an older mount namespace into the current
|
|
|
|
|
* mount namespace, preventing reference counting loops. A 64bit
|
|
|
|
|
* number incrementing at 10GHz would still take roughly 58 years to
|
|
|
|
|
* wrap, and namespaces are created at nowhere near that rate, so we can ignore the possibility.
|
|
|
|
|
*/
|
|
|
|
|
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
|
|
|
|
|
|
2019-01-30 13:30:21 -05:00
|
|
|
/*
 * Allocate and initialise an empty mount namespace owned by @user_ns.
 *
 * The namespace is charged against UCOUNT_MNT_NAMESPACES first.  Unless
 * @anon is set it also gets an inode number and a fresh sequence number
 * from mnt_ns_seq.
 *
 * Returns the new namespace, ERR_PTR(-ENOSPC) if the ucount charge fails,
 * ERR_PTR(-ENOMEM) if the allocation fails, or the error reported by
 * ns_alloc_inum().
 */
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
	struct mnt_namespace *ns;
	struct ucounts *uc;
	int err;

	uc = inc_mnt_namespaces(user_ns);
	if (!uc)
		return ERR_PTR(-ENOSPC);

	ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
	if (!ns) {
		err = -ENOMEM;
		goto out_uncharge;
	}

	if (!anon) {
		err = ns_alloc_inum(&ns->ns);
		if (err)
			goto out_free;
	}

	ns->ns.ops = &mntns_operations;
	if (!anon)
		ns->seq = atomic64_inc_return(&mnt_ns_seq);
	refcount_set(&ns->ns.count, 1);
	refcount_set(&ns->passive, 1);
	ns->mounts = RB_ROOT;
	INIT_LIST_HEAD(&ns->mnt_ns_list);
	RB_CLEAR_NODE(&ns->mnt_ns_tree_node);
	init_waitqueue_head(&ns->poll);
	ns->user_ns = get_user_ns(user_ns);
	ns->ucounts = uc;
	return ns;

out_free:
	kfree(ns);
out_uncharge:
	dec_mnt_namespaces(uc);
	return ERR_PTR(err);
}
|
|
|
|
|
|
2016-06-20 20:42:34 +02:00
|
|
|
__latent_entropy
|
2013-09-28 20:47:57 -04:00
|
|
|
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
|
|
|
|
|
struct user_namespace *user_ns, struct fs_struct *new_fs)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-12-08 02:37:56 -08:00
|
|
|
struct mnt_namespace *new_ns;
|
2008-05-10 20:44:54 -04:00
|
|
|
struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
|
2011-11-24 18:57:30 -05:00
|
|
|
struct mount *p, *q;
|
2013-09-28 20:47:57 -04:00
|
|
|
struct mount *old;
|
2011-11-24 20:55:08 -05:00
|
|
|
struct mount *new;
|
2012-07-31 13:13:04 -07:00
|
|
|
int copy_flags;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2013-09-28 20:47:57 -04:00
|
|
|
BUG_ON(!ns);
|
|
|
|
|
|
|
|
|
|
if (likely(!(flags & CLONE_NEWNS))) {
|
|
|
|
|
get_mnt_ns(ns);
|
|
|
|
|
return ns;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
old = ns->root;
|
|
|
|
|
|
2019-01-30 13:30:21 -05:00
|
|
|
new_ns = alloc_mnt_ns(user_ns, false);
|
2009-06-22 15:09:13 -04:00
|
|
|
if (IS_ERR(new_ns))
|
|
|
|
|
return new_ns;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2013-03-16 15:12:40 -04:00
|
|
|
namespace_lock();
|
2005-04-16 15:20:36 -07:00
|
|
|
/* First pass: copy the tree topology */
|
2013-03-30 01:35:18 -07:00
|
|
|
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
|
2013-09-28 20:47:57 -04:00
|
|
|
if (user_ns != ns->user_ns)
|
2019-01-30 13:15:45 -05:00
|
|
|
copy_flags |= CL_SHARED_TO_SLAVE;
|
2012-07-31 13:13:04 -07:00
|
|
|
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
|
2012-06-25 12:55:18 +01:00
|
|
|
if (IS_ERR(new)) {
|
2013-03-16 14:49:45 -04:00
|
|
|
namespace_unlock();
|
2024-10-16 19:49:48 +02:00
|
|
|
ns_free_inum(&new_ns->ns);
|
|
|
|
|
dec_mnt_namespaces(new_ns->ucounts);
|
|
|
|
|
mnt_ns_release(new_ns);
|
2012-06-25 12:55:18 +01:00
|
|
|
return ERR_CAST(new);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2019-01-30 13:15:45 -05:00
|
|
|
if (user_ns != ns->user_ns) {
|
|
|
|
|
lock_mount_hash();
|
|
|
|
|
lock_mnt_tree(new);
|
|
|
|
|
unlock_mount_hash();
|
|
|
|
|
}
|
2011-12-06 13:32:36 -05:00
|
|
|
new_ns->root = new;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
|
|
|
|
|
* as belonging to new namespace. We have already acquired a private
|
|
|
|
|
* fs_struct, so tsk->fs->lock is not needed.
|
|
|
|
|
*/
|
2011-11-25 03:06:56 -05:00
|
|
|
p = old;
|
2011-11-24 20:55:08 -05:00
|
|
|
q = new;
|
2005-04-16 15:20:36 -07:00
|
|
|
while (p) {
|
2023-10-25 16:02:00 +02:00
|
|
|
mnt_add_to_ns(new_ns, q);
|
|
|
|
|
new_ns->nr_mounts++;
|
2013-09-28 20:47:57 -04:00
|
|
|
if (new_fs) {
|
|
|
|
|
if (&p->mnt == new_fs->root.mnt) {
|
|
|
|
|
new_fs->root.mnt = mntget(&q->mnt);
|
2011-11-24 18:57:30 -05:00
|
|
|
rootmnt = &p->mnt;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2013-09-28 20:47:57 -04:00
|
|
|
if (&p->mnt == new_fs->pwd.mnt) {
|
|
|
|
|
new_fs->pwd.mnt = mntget(&q->mnt);
|
2011-11-24 18:57:30 -05:00
|
|
|
pwdmnt = &p->mnt;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
}
|
2011-11-25 03:06:56 -05:00
|
|
|
p = next_mnt(p, old);
|
|
|
|
|
q = next_mnt(q, new);
|
2013-03-30 01:35:18 -07:00
|
|
|
if (!q)
|
|
|
|
|
break;
|
2022-11-24 22:55:57 -05:00
|
|
|
// an mntns binding we'd skipped?
|
2013-03-30 01:35:18 -07:00
|
|
|
while (p->mnt.mnt_root != q->mnt.mnt_root)
|
2022-11-24 22:55:57 -05:00
|
|
|
p = next_mnt(skip_mnt_tree(p), old);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2013-03-16 14:49:45 -04:00
|
|
|
namespace_unlock();
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
|
if (rootmnt)
|
2011-01-14 22:30:21 -05:00
|
|
|
mntput(rootmnt);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (pwdmnt)
|
2011-01-14 22:30:21 -05:00
|
|
|
mntput(pwdmnt);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2024-12-13 00:03:41 +01:00
|
|
|
mnt_ns_tree_add(new_ns);
|
2006-02-07 12:59:00 -08:00
|
|
|
return new_ns;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
2019-01-30 13:30:21 -05:00
|
|
|
struct dentry *mount_subtree(struct vfsmount *m, const char *name)
|
2011-11-16 21:43:59 -05:00
|
|
|
{
|
2019-01-30 13:30:21 -05:00
|
|
|
struct mount *mnt = real_mount(m);
|
2011-11-16 21:43:59 -05:00
|
|
|
struct mnt_namespace *ns;
|
2011-11-22 12:31:21 -05:00
|
|
|
struct super_block *s;
|
2011-11-16 21:43:59 -05:00
|
|
|
struct path path;
|
|
|
|
|
int err;
|
|
|
|
|
|
2019-01-30 13:30:21 -05:00
|
|
|
ns = alloc_mnt_ns(&init_user_ns, true);
|
|
|
|
|
if (IS_ERR(ns)) {
|
|
|
|
|
mntput(m);
|
2011-11-16 21:43:59 -05:00
|
|
|
return ERR_CAST(ns);
|
2019-01-30 13:30:21 -05:00
|
|
|
}
|
|
|
|
|
ns->root = mnt;
|
2023-10-25 16:02:00 +02:00
|
|
|
ns->nr_mounts++;
|
|
|
|
|
mnt_add_to_ns(ns, mnt);
|
2011-11-16 21:43:59 -05:00
|
|
|
|
2019-01-30 13:30:21 -05:00
|
|
|
err = vfs_path_lookup(m->mnt_root, m,
|
2011-11-16 21:43:59 -05:00
|
|
|
name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
|
|
|
|
|
|
|
|
|
|
put_mnt_ns(ns);
|
|
|
|
|
|
|
|
|
|
if (err)
|
|
|
|
|
return ERR_PTR(err);
|
|
|
|
|
|
|
|
|
|
/* trade a vfsmount reference for active sb one */
|
2011-11-22 12:31:21 -05:00
|
|
|
s = path.mnt->mnt_sb;
|
|
|
|
|
atomic_inc(&s->s_active);
|
2011-11-16 21:43:59 -05:00
|
|
|
mntput(path.mnt);
|
|
|
|
|
/* lock the sucker */
|
2011-11-22 12:31:21 -05:00
|
|
|
down_write(&s->s_umount);
|
2011-11-16 21:43:59 -05:00
|
|
|
/* ... and return the root of (sub)tree on it */
|
|
|
|
|
return path.dentry;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(mount_subtree);
|
|
|
|
|
|
init: use do_mount() instead of ksys_mount()
In prepare_namespace(), do_mount() can be used instead of ksys_mount()
as the first and third argument are const strings in the kernel, the
second and fourth argument are passed through anyway, and the fifth
argument is NULL.
In do_mount_root(), ksys_mount() is called with the first and third
argument being already kernelspace strings, which do not need to be
copied over from userspace to kernelspace (again). The second and
fourth arguments are passed through to do_mount() anyway. The fifth
argument, while already residing in kernelspace, needs to be put into
a page of its own. Then, do_mount() can be used instead of
ksys_mount().
Once this is done, there are no in-kernel users to ksys_mount() left,
which can therefore be removed.
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
2018-10-23 22:41:09 +02:00
|
|
|
/*
 * mount(2): copy the type string, device name and option page in from user
 * space, then let do_mount() resolve @dir_name and perform the mount.
 *
 * Returns the result of do_mount(), or the error from whichever copy
 * failed.  All kernel copies are freed before returning.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	char *ktype;
	char *kdev;
	void *kopts;
	int ret;

	ktype = copy_mount_string(type);
	if (IS_ERR(ktype)) {
		ret = PTR_ERR(ktype);
		goto out;
	}

	kdev = copy_mount_string(dev_name);
	if (IS_ERR(kdev)) {
		ret = PTR_ERR(kdev);
		goto free_type;
	}

	kopts = copy_mount_options(data);
	if (IS_ERR(kopts)) {
		ret = PTR_ERR(kopts);
		goto free_dev;
	}

	ret = do_mount(kdev, dir_name, ktype, flags, kopts);

	kfree(kopts);
free_dev:
	kfree(kdev);
free_type:
	kfree(ktype);
out:
	return ret;
}
|
|
|
|
|
|
2021-06-01 11:33:59 +02:00
|
|
|
/*
 * Mount attribute bits that fsmount() accepts in its attr_flags argument;
 * fsmount() fails with -EINVAL if any bit outside this mask is set.
 */
#define FSMOUNT_VALID_FLAGS		\
	(MOUNT_ATTR_RDONLY	|	\
	 MOUNT_ATTR_NOSUID	|	\
	 MOUNT_ATTR_NODEV	|	\
	 MOUNT_ATTR_NOEXEC	|	\
	 MOUNT_ATTR__ATIME	|	\
	 MOUNT_ATTR_NODIRATIME	|	\
	 MOUNT_ATTR_NOSYMFOLLOW)
|
2021-01-21 14:19:52 +01:00
|
|
|
|
2021-01-21 14:19:54 +01:00
|
|
|
/*
 * FSMOUNT_VALID_FLAGS extended with MOUNT_ATTR_IDMAP.
 * NOTE(review): presumably the mask mount_setattr() validates against; the
 * user of this macro is not visible from here - confirm.
 */
#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets the user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
|
|
|
|
/* Union of the four MS_* mount propagation type bits. */
#define MOUNT_SETATTR_PROPAGATION_FLAGS \
	(MS_SHARED | MS_SLAVE | MS_PRIVATE | MS_UNBINDABLE)
|
|
|
|
|
|
2021-01-21 14:19:52 +01:00
|
|
|
/*
 * Translate the single-bit MOUNT_ATTR_* attributes in @attr_flags into the
 * corresponding MNT_* mount flags.
 *
 * Only RDONLY, NOSUID, NODEV, NOEXEC, NODIRATIME and NOSYMFOLLOW are
 * handled; the atime selection under MOUNT_ATTR__ATIME is not looked at
 * here and has to be decoded by the caller.
 *
 * Returns the resulting MNT_* bitmask (0 if none of the handled bits is set).
 */
static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
{
	unsigned int mnt_flags = 0;

	mnt_flags |= (attr_flags & MOUNT_ATTR_RDONLY) ? MNT_READONLY : 0;
	mnt_flags |= (attr_flags & MOUNT_ATTR_NOSUID) ? MNT_NOSUID : 0;
	mnt_flags |= (attr_flags & MOUNT_ATTR_NODEV) ? MNT_NODEV : 0;
	mnt_flags |= (attr_flags & MOUNT_ATTR_NOEXEC) ? MNT_NOEXEC : 0;
	mnt_flags |= (attr_flags & MOUNT_ATTR_NODIRATIME) ? MNT_NODIRATIME : 0;
	mnt_flags |= (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) ? MNT_NOSYMFOLLOW : 0;

	return mnt_flags;
}
|
|
|
|
|
|
2018-11-05 17:40:30 +00:00
|
|
|
/*
|
2018-11-01 23:36:14 +00:00
|
|
|
* Create a kernel mount representation for a new, prepared superblock
|
|
|
|
|
* (specified by fs_fd) and attach to an open_tree-like file descriptor.
|
|
|
|
|
*/
|
|
|
|
|
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
|
|
|
|
|
unsigned int, attr_flags)
|
|
|
|
|
{
|
|
|
|
|
struct mnt_namespace *ns;
|
|
|
|
|
struct fs_context *fc;
|
|
|
|
|
struct file *file;
|
|
|
|
|
struct path newmount;
|
|
|
|
|
struct mount *mnt;
|
|
|
|
|
unsigned int mnt_flags = 0;
|
|
|
|
|
long ret;
|
|
|
|
|
|
|
|
|
|
if (!may_mount())
|
|
|
|
|
return -EPERM;
|
|
|
|
|
|
|
|
|
|
if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
2021-01-21 14:19:52 +01:00
|
|
|
if (attr_flags & ~FSMOUNT_VALID_FLAGS)
|
2018-11-01 23:36:14 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
2021-01-21 14:19:52 +01:00
|
|
|
mnt_flags = attr_flags_to_mnt_flags(attr_flags);
|
2018-11-01 23:36:14 +00:00
|
|
|
|
|
|
|
|
switch (attr_flags & MOUNT_ATTR__ATIME) {
|
|
|
|
|
case MOUNT_ATTR_STRICTATIME:
|
|
|
|
|
break;
|
|
|
|
|
case MOUNT_ATTR_NOATIME:
|
|
|
|
|
mnt_flags |= MNT_NOATIME;
|
|
|
|
|
break;
|
|
|
|
|
case MOUNT_ATTR_RELATIME:
|
|
|
|
|
mnt_flags |= MNT_RELATIME;
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-19 21:19:02 -04:00
|
|
|
CLASS(fd, f)(fs_fd);
|
|
|
|
|
if (fd_empty(f))
|
2018-11-01 23:36:14 +00:00
|
|
|
return -EBADF;
|
|
|
|
|
|
2024-05-31 14:12:01 -04:00
|
|
|
if (fd_file(f)->f_op != &fscontext_fops)
|
2024-07-19 21:19:02 -04:00
|
|
|
return -EINVAL;
|
2018-11-01 23:36:14 +00:00
|
|
|
|
2024-05-31 14:12:01 -04:00
|
|
|
fc = fd_file(f)->private_data;
|
2018-11-01 23:36:14 +00:00
|
|
|
|
|
|
|
|
ret = mutex_lock_interruptible(&fc->uapi_mutex);
|
|
|
|
|
if (ret < 0)
|
2024-07-19 21:19:02 -04:00
|
|
|
return ret;
|
2018-11-01 23:36:14 +00:00
|
|
|
|
|
|
|
|
/* There must be a valid superblock or we can't mount it */
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
|
if (!fc->root)
|
|
|
|
|
goto err_unlock;
|
|
|
|
|
|
|
|
|
|
ret = -EPERM;
|
|
|
|
|
if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
|
|
|
|
|
pr_warn("VFS: Mount too revealing\n");
|
|
|
|
|
goto err_unlock;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ret = -EBUSY;
|
|
|
|
|
if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
|
|
|
|
|
goto err_unlock;
|
|
|
|
|
|
2021-08-19 14:56:38 -04:00
|
|
|
if (fc->sb_flags & SB_MANDLOCK)
|
|
|
|
|
warn_mandlock();
|
2018-11-01 23:36:14 +00:00
|
|
|
|
|
|
|
|
newmount.mnt = vfs_create_mount(fc);
|
|
|
|
|
if (IS_ERR(newmount.mnt)) {
|
|
|
|
|
ret = PTR_ERR(newmount.mnt);
|
|
|
|
|
goto err_unlock;
|
|
|
|
|
}
|
|
|
|
|
newmount.dentry = dget(fc->root);
|
|
|
|
|
newmount.mnt->mnt_flags = mnt_flags;
|
|
|
|
|
|
|
|
|
|
/* We've done the mount bit - now move the file context into more or
|
|
|
|
|
* less the same state as if we'd done an fspick(). We don't want to
|
|
|
|
|
* do any memory allocation or anything like that at this point as we
|
|
|
|
|
* don't want to have to handle any errors incurred.
|
|
|
|
|
*/
|
|
|
|
|
vfs_clean_context(fc);
|
|
|
|
|
|
|
|
|
|
ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
|
|
|
|
|
if (IS_ERR(ns)) {
|
|
|
|
|
ret = PTR_ERR(ns);
|
|
|
|
|
goto err_path;
|
|
|
|
|
}
|
|
|
|
|
mnt = real_mount(newmount.mnt);
|
|
|
|
|
ns->root = mnt;
|
2023-10-25 16:02:00 +02:00
|
|
|
ns->nr_mounts = 1;
|
|
|
|
|
mnt_add_to_ns(ns, mnt);
|
2019-06-12 11:43:13 -07:00
|
|
|
mntget(newmount.mnt);
|
2018-11-01 23:36:14 +00:00
|
|
|
|
|
|
|
|
/* Attach to an apparent O_PATH fd with a note that we need to unmount
|
|
|
|
|
* it, not just simply put it.
|
|
|
|
|
*/
|
|
|
|
|
file = dentry_open(&newmount, O_PATH, fc->cred);
|
|
|
|
|
if (IS_ERR(file)) {
|
|
|
|
|
dissolve_on_fput(newmount.mnt);
|
|
|
|
|
ret = PTR_ERR(file);
|
|
|
|
|
goto err_path;
|
|
|
|
|
}
|
|
|
|
|
file->f_mode |= FMODE_NEED_UNMOUNT;
|
|
|
|
|
|
|
|
|
|
ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
|
|
|
|
|
if (ret >= 0)
|
|
|
|
|
fd_install(ret, file);
|
|
|
|
|
else
|
|
|
|
|
fput(file);
|
|
|
|
|
|
|
|
|
|
err_path:
|
|
|
|
|
path_put(&newmount);
|
|
|
|
|
err_unlock:
|
|
|
|
|
mutex_unlock(&fc->uapi_mutex);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-21 14:13:07 +01:00
|
|
|
static inline int vfs_move_mount(struct path *from_path, struct path *to_path,
|
|
|
|
|
enum mnt_tree_flags_t mflags)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
ret = security_move_mount(from_path, to_path);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
|
|
if (mflags & MNT_TREE_PROPAGATION)
|
|
|
|
|
return do_set_group(from_path, to_path);
|
|
|
|
|
|
|
|
|
|
return do_move_mount(from_path, to_path, mflags);
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-01 23:36:14 +00:00
|
|
|
/*
|
|
|
|
|
* Move a mount from one place to another. In combination with
|
|
|
|
|
* fsopen()/fsmount() this is used to install a new mount and in combination
|
|
|
|
|
* with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
|
|
|
|
|
* a mount subtree.
|
2018-11-05 17:40:30 +00:00
|
|
|
*
|
|
|
|
|
* Note the flags value is a combination of MOVE_MOUNT_* flags.
|
|
|
|
|
*/
|
|
|
|
|
SYSCALL_DEFINE5(move_mount,
|
2019-10-15 11:35:02 +01:00
|
|
|
int, from_dfd, const char __user *, from_pathname,
|
|
|
|
|
int, to_dfd, const char __user *, to_pathname,
|
2018-11-05 17:40:30 +00:00
|
|
|
unsigned int, flags)
|
|
|
|
|
{
|
2025-02-21 14:13:07 +01:00
|
|
|
struct path to_path __free(path_put) = {};
|
|
|
|
|
struct path from_path __free(path_put) = {};
|
|
|
|
|
struct filename *to_name __free(putname) = NULL;
|
|
|
|
|
struct filename *from_name __free(putname) = NULL;
|
|
|
|
|
unsigned int lflags, uflags;
|
|
|
|
|
enum mnt_tree_flags_t mflags = 0;
|
2018-11-05 17:40:30 +00:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
|
|
if (!may_mount())
|
|
|
|
|
return -EPERM;
|
|
|
|
|
|
|
|
|
|
if (flags & ~MOVE_MOUNT__MASK)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slighly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs no mounts must
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared.
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / will be a slave to the peer group of the host rootfs /
mount's peer group. IOW, it will receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (As it can be confusing it should be spelled out that the
tmpfs mount on the /mnt dentry that was just created doesn't
propagate back to the host because the rootfs mount / of the mount
namespace isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back to into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry. And the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. So in other
words, the propagated mount has been mounted beneath the preexisting
mount in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is supposed to be mounted at the /opt dentry
then the /tmp mount first propagates to the root mount at the /opt
dentry. But there already is the /opt mount mounted at the /opt
dentry. So the old /opt mount at the /opt dentry will be mounted on
top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent
is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root
is /opt which is what shows up as /opt under SOURCE). So again, a
mount will be mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
The main point is that such mounts allows userspace to umount a top
mount and reveal an underlying mount. So for example, umounting the
tmpfs mount on /mnt that was created in example (1) using mount
namespaces reveals the /opt mount which was mounted beneath it.
In (2) where a mount was mounted beneath the top mount in the same mount
namespace unmounting the top mount would unmount both the top mount and
the mount beneath. In the process the original mount would be remounted
on top of the rootfs mount / at the /opt dentry again.
This again, is a result of mount propagation only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation has effects on current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to a xfs
filesystem without ever revealing the underlying mountpoint.
The crux is that the proposed mechanism already exists and that it is so
powerful as to cover cases where mounts are supposed to be updated with
new versions. Crucially, it offers an important flexibility. Namely that
updates to a system may either be forced or can be delayed and the
umount of the top mount be left to a service if it is a cooperative one.
This adds a new flag to move_mount() that allows to explicitly move a
beneath the top mount adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
allocate a new, unused peer group id. This isn't a required
restriction but a voluntary one. It avoids repeating a semantical
quirk that already exists today. If bind mounts which already have a
peer group id are inserted into mount trees that have the same peer
group id this can cause a lot of mount propagation events to be
generated (For example, consider running mount --bind /opt /opt in a
loop where the parent mount is a shared mount.).
* Avoid getting rid of the top mount in the kernel. Cooperative services
need to be able to unmount the top mount themselves.
This also avoids a good deal of additional complexity. The umount
would have to be propagated which would be another rather expensive
operation. So namespace_lock() and lock_mount_hash() would potentially
have to be held for a long time for both a mount and umount
propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
parent mount and the destination mount's mountpoint on the parent
mount. Of course, if the parent of the destination mount and the
destination mount are shared mounts in the same peer group and the
mountpoint of the new mount to be mounted is a subdir of their
->mnt_root then both will receive a mount of /opt. That's probably
easier to understand with an example. Assuming a standard shared
rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically
what most people probably mean in this scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Aferwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mount belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) the @mnt_from has been overmounted in between path resolution and
acquiring @namespace_sem when locking @mnt_to. This avoids the
proliferation of shadow mounts.
(2) if @to_mnt is moved to a different mountpoint while acquiring
@namespace_sem to lock @to_mnt.
(3) if @to_mnt is unmounted while acquiring @namespace_sem to lock
@to_mnt.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt->mnt_parent and remounted on
@mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantical quirks that would not be a bug but would cause weird mount
trees to be created. While they can already be created via other means
(mount --bind /opt /opt x n) there's no reason to repeat past mistakes
in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
|
|
|
if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) ==
|
|
|
|
|
(MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
2025-02-21 14:13:07 +01:00
|
|
|
if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION;
|
|
|
|
|
if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH;
|
|
|
|
|
|
2018-11-05 17:40:30 +00:00
|
|
|
lflags = 0;
|
|
|
|
|
if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
|
|
|
|
|
if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
|
2025-02-26 09:11:54 +01:00
|
|
|
uflags = 0;
|
2025-02-21 14:13:07 +01:00
|
|
|
if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH;
|
|
|
|
|
from_name = getname_maybe_null(from_pathname, uflags);
|
|
|
|
|
if (IS_ERR(from_name))
|
|
|
|
|
return PTR_ERR(from_name);
|
2018-11-05 17:40:30 +00:00
|
|
|
|
|
|
|
|
lflags = 0;
|
|
|
|
|
if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
|
|
|
|
|
if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
|
2025-02-26 09:11:54 +01:00
|
|
|
uflags = 0;
|
2025-02-21 14:13:07 +01:00
|
|
|
if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH;
|
|
|
|
|
to_name = getname_maybe_null(to_pathname, uflags);
|
|
|
|
|
if (IS_ERR(to_name))
|
|
|
|
|
return PTR_ERR(to_name);
|
|
|
|
|
|
|
|
|
|
if (!to_name && to_dfd >= 0) {
|
|
|
|
|
CLASS(fd_raw, f_to)(to_dfd);
|
|
|
|
|
if (fd_empty(f_to))
|
|
|
|
|
return -EBADF;
|
|
|
|
|
|
|
|
|
|
to_path = fd_file(f_to)->f_path;
|
|
|
|
|
path_get(&to_path);
|
|
|
|
|
} else {
|
|
|
|
|
ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
2018-11-05 17:40:30 +00:00
|
|
|
|
2025-02-21 14:13:07 +01:00
|
|
|
if (!from_name && from_dfd >= 0) {
|
|
|
|
|
CLASS(fd_raw, f_from)(from_dfd);
|
|
|
|
|
if (fd_empty(f_from))
|
|
|
|
|
return -EBADF;
|
2018-11-05 17:40:30 +00:00
|
|
|
|
2025-02-21 14:13:07 +01:00
|
|
|
return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags);
|
|
|
|
|
}
|
2018-11-05 17:40:30 +00:00
|
|
|
|
2025-02-21 14:13:07 +01:00
|
|
|
ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
2018-11-05 17:40:30 +00:00
|
|
|
|
2025-02-21 14:13:07 +01:00
|
|
|
return vfs_move_mount(&from_path, &to_path, mflags);
|
2018-11-05 17:40:30 +00:00
|
|
|
}
|
|
|
|
|
|
2011-11-23 19:34:49 -05:00
|
|
|
/*
|
|
|
|
|
* Return true if path is reachable from root
|
|
|
|
|
*
|
2013-09-29 22:06:07 -04:00
|
|
|
* namespace_sem or mount_lock is held
|
2011-11-23 19:34:49 -05:00
|
|
|
*/
|
2011-11-24 22:00:28 -05:00
|
|
|
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
|
2011-11-23 19:34:49 -05:00
|
|
|
const struct path *root)
|
|
|
|
|
{
|
2011-11-24 22:00:28 -05:00
|
|
|
while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
|
2011-11-24 22:25:07 -05:00
|
|
|
dentry = mnt->mnt_mountpoint;
|
2011-11-24 22:19:58 -05:00
|
|
|
mnt = mnt->mnt_parent;
|
2011-11-23 19:34:49 -05:00
|
|
|
}
|
2011-11-24 22:00:28 -05:00
|
|
|
return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
|
2011-11-23 19:34:49 -05:00
|
|
|
}
|
|
|
|
|
|
2016-11-14 22:14:35 +01:00
|
|
|
bool path_is_under(const struct path *path1, const struct path *path2)
|
2011-11-23 19:34:49 -05:00
|
|
|
{
|
2015-11-17 14:40:10 +08:00
|
|
|
bool res;
|
2013-09-29 22:06:07 -04:00
|
|
|
read_seqlock_excl(&mount_lock);
|
2011-11-24 22:00:28 -05:00
|
|
|
res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
|
2013-09-29 22:06:07 -04:00
|
|
|
read_sequnlock_excl(&mount_lock);
|
2011-11-23 19:34:49 -05:00
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(path_is_under);
|
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 *
 * Return values (as produced by the code below):
 *  0        success
 *  -EPERM   may_mount() refused the caller
 *  -EINVAL  one of the involved mounts is shared, check_mnt() failed for the
 *           current root or new_root, new_root is MNT_LOCKED, the current
 *           root or new_root is not a mountpoint or has no parent, put_old
 *           is not reachable from new_root, or new_root is not reachable
 *           from the current root
 *  -ENOENT  new_root's dentry has been unlinked
 *  -EBUSY   new_root or put_old is on the current root mount
 *  Errors from user_path_at(), security_sb_pivotroot() and lock_mount() are
 *  passed through unchanged.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
		const char __user *, put_old)
{
	struct path new, old, root;
	struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
	struct mountpoint *old_mp;
	int error;

	if (!may_mount())
		return -EPERM;

	/* Resolve both arguments; each must name a directory. */
	error = user_path_at(AT_FDCWD, new_root,
			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
	if (error)
		goto out0;

	error = user_path_at(AT_FDCWD, put_old,
			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
	if (error)
		goto out1;

	error = security_sb_pivotroot(&old, &new);
	if (error)
		goto out2;

	/* The caller's current root; released by path_put(&root) at out3. */
	get_fs_root(current->fs, &root);
	old_mp = lock_mount(&old);
	/*
	 * error is assigned before the IS_ERR() test; on success the value is
	 * meaningless and gets overwritten with -EINVAL right below.
	 */
	error = PTR_ERR(old_mp);
	if (IS_ERR(old_mp))
		goto out3;

	error = -EINVAL;
	new_mnt = real_mount(new.mnt);
	root_mnt = real_mount(root.mnt);
	old_mnt = real_mount(old.mnt);
	/*
	 * Sample both parents now: new_mnt and root_mnt are detached and
	 * re-attached elsewhere further down.
	 */
	ex_parent = new_mnt->mnt_parent;
	root_parent = root_mnt->mnt_parent;
	/* None of the mounts we are about to attach to may be shared. */
	if (IS_MNT_SHARED(old_mnt) ||
		IS_MNT_SHARED(ex_parent) ||
		IS_MNT_SHARED(root_parent))
		goto out4;
	/* NOTE(review): presumably "mount belongs to the caller's namespace" - confirm */
	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
		goto out4;
	/* new_root is about to be detached from its parent; refuse if locked */
	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
		goto out4;
	error = -ENOENT;
	if (d_unlinked(new.dentry))
		goto out4;
	error = -EBUSY;
	if (new_mnt == root_mnt || old_mnt == root_mnt)
		goto out4; /* loop, on the same file system */
	error = -EINVAL;
	if (!path_mounted(&root))
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(root_mnt))
		goto out4; /* absolute root */
	if (!path_mounted(&new))
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(new_mnt))
		goto out4; /* absolute root */
	/* make sure we can reach put_old from new_root */
	if (!is_path_reachable(old_mnt, old.dentry, &new))
		goto out4;
	/* make certain new is below the root */
	if (!is_path_reachable(new_mnt, new.dentry, &root))
		goto out4;
	/* All checks passed; the tree surgery below runs under mount_lock. */
	lock_mount_hash();
	umount_mnt(new_mnt);
	/* new_mnt takes the old root's place, so it inherits its MNT_LOCKED */
	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
	}
	/* mount new_root on / */
	attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp);
	umount_mnt(root_mnt);
	/*
	 * NOTE(review): presumably balances a reference on root_parent that is
	 * left over once root_mnt no longer hangs off it - confirm against
	 * attach_mnt()/umount_mnt().
	 */
	mnt_add_count(root_parent, -1);
	/* mount old root on put_old */
	attach_mnt(root_mnt, old_mnt, old_mp);
	touch_mnt_namespace(current->nsproxy->mnt_ns);
	/* A moved mount should not expire automatically */
	list_del_init(&new_mnt->mnt_expire);
	unlock_mount_hash();
	/* Notifications are sent only after mount_lock has been dropped. */
	mnt_notify_add(root_mnt);
	mnt_notify_add(new_mnt);
	/* Switch root/cwd references that pointed at the old root to new. */
	chroot_fs_refs(&root, &new);
	error = 0;
out4:
	unlock_mount(old_mp);
	/*
	 * Only on success was new_mnt detached from ex_parent; the put is done
	 * after unlock_mount().
	 * NOTE(review): presumably this drops the reference new_mnt held on its
	 * former parent - confirm.
	 */
	if (!error)
		mntput_no_expire(ex_parent);
out3:
	path_put(&root);
out2:
	path_put(&old);
out1:
	path_put(&new);
out0:
	return error;
}
|
|
|
|
|
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
|
|
|
|
|
{
|
|
|
|
|
unsigned int flags = mnt->mnt.mnt_flags;
|
|
|
|
|
|
|
|
|
|
/* flags to clear */
|
|
|
|
|
flags &= ~kattr->attr_clr;
|
|
|
|
|
/* flags to raise */
|
|
|
|
|
flags |= kattr->attr_set;
|
|
|
|
|
|
|
|
|
|
return flags;
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-21 14:19:54 +01:00
|
|
|
static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
|
|
|
|
|
{
|
|
|
|
|
struct vfsmount *m = &mnt->mnt;
|
2021-12-03 12:17:07 +01:00
|
|
|
struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
|
2021-01-21 14:19:54 +01:00
|
|
|
|
2022-10-26 12:51:27 +02:00
|
|
|
if (!kattr->mnt_idmap)
|
2021-01-21 14:19:54 +01:00
|
|
|
return 0;
|
|
|
|
|
|
2021-12-03 12:17:07 +01:00
|
|
|
/*
|
|
|
|
|
* Creating an idmapped mount with the filesystem wide idmapping
|
|
|
|
|
* doesn't make sense so block that. We don't allow mushy semantics.
|
|
|
|
|
*/
|
2023-11-22 13:44:37 +01:00
|
|
|
if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
|
2021-12-03 12:17:07 +01:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
2021-01-21 14:19:54 +01:00
|
|
|
/*
|
fs: allow changing idmappings
This patchset makes it possible to create a new idmapped mount from an
already idmapped mount and to clear idmappings.
// Create a first idmapped mount
struct mount_attr attr = {
.attr_set = MOUNT_ATTR_IDMAP
.userns_fd = fd_userns
};
fd_tree = open_tree(-EBADF, "/", OPEN_TREE_CLONE, &attr, sizeof(attr));
move_mount(fd_tree, "", -EBADF, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
// Create a second idmapped mount from the first idmapped mount
attr.attr_set = MOUNT_ATTR_IDMAP;
attr.userns_fd = fd_userns2;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
// Create a second non-idmapped mount from the first idmapped mount:
memset(&attr, 0, sizeof(attr));
attr.attr_clr = MOUNT_ATTR_IDMAP;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-5-c25feb0d2eb3@kernel.org
Reviewed-by: "Seth Forshee (DigitalOcean)" <sforshee@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-01-28 11:33:43 +01:00
|
|
|
* We only allow an mount to change it's idmapping if it has
|
|
|
|
|
* never been accessible to userspace.
|
2021-01-21 14:19:54 +01:00
|
|
|
*/
|
fs: allow changing idmappings
This patchset makes it possible to create a new idmapped mount from an
already idmapped mount and to clear idmappings.
// Create a first idmapped mount
struct mount_attr attr = {
.attr_set = MOUNT_ATTR_IDMAP
.userns_fd = fd_userns
};
fd_tree = open_tree(-EBADF, "/", OPEN_TREE_CLONE, &attr, sizeof(attr));
move_mount(fd_tree, "", -EBADF, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
// Create a second idmapped mount from the first idmapped mount
attr.attr_set = MOUNT_ATTR_IDMAP;
attr.userns_fd = fd_userns2;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
// Create a second non-idmapped mount from the first idmapped mount:
memset(&attr, 0, sizeof(attr));
attr.attr_clr = MOUNT_ATTR_IDMAP;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-5-c25feb0d2eb3@kernel.org
Reviewed-by: "Seth Forshee (DigitalOcean)" <sforshee@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-01-28 11:33:43 +01:00
|
|
|
if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m))
|
2021-01-21 14:19:54 +01:00
|
|
|
return -EPERM;
|
|
|
|
|
|
|
|
|
|
/* The underlying filesystem doesn't support idmapped mounts yet. */
|
|
|
|
|
if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
2024-09-03 17:16:12 +02:00
|
|
|
/* The filesystem has turned off idmapped mounts. */
|
|
|
|
|
if (m->mnt_sb->s_iflags & SB_I_NOIDMAP)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
2021-01-21 14:19:54 +01:00
|
|
|
/* We're not controlling the superblock. */
|
2021-12-03 12:17:07 +01:00
|
|
|
if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
|
2021-01-21 14:19:54 +01:00
|
|
|
return -EPERM;
|
|
|
|
|
|
|
|
|
|
/* Mount has already been visible in the filesystem hierarchy. */
|
|
|
|
|
if (!is_anon_ns(mnt->mnt_ns))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2022-02-03 14:14:08 +01:00
|
|
|
/**
|
|
|
|
|
* mnt_allow_writers() - check whether the attribute change allows writers
|
|
|
|
|
* @kattr: the new mount attributes
|
|
|
|
|
* @mnt: the mount to which @kattr will be applied
|
|
|
|
|
*
|
|
|
|
|
* Check whether thew new mount attributes in @kattr allow concurrent writers.
|
|
|
|
|
*
|
|
|
|
|
* Return: true if writers need to be held, false if not
|
|
|
|
|
*/
|
|
|
|
|
static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
|
|
|
|
|
const struct mount *mnt)
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
{
|
2022-05-10 11:58:40 +02:00
|
|
|
return (!(kattr->attr_set & MNT_READONLY) ||
|
|
|
|
|
(mnt->mnt.mnt_flags & MNT_READONLY)) &&
|
2022-10-26 12:51:27 +02:00
|
|
|
!kattr->mnt_idmap;
|
2022-02-03 14:14:08 +01:00
|
|
|
}
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
2022-02-03 14:14:11 +01:00
|
|
|
static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
{
|
2022-02-28 23:04:20 -05:00
|
|
|
struct mount *m;
|
|
|
|
|
int err;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
2022-02-28 23:04:20 -05:00
|
|
|
for (m = mnt; m; m = next_mnt(m, mnt)) {
|
|
|
|
|
if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
|
|
|
|
|
err = -EPERM;
|
|
|
|
|
break;
|
|
|
|
|
}
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
2022-02-03 14:14:11 +01:00
|
|
|
err = can_idmap_mount(kattr, m);
|
|
|
|
|
if (err)
|
2022-02-28 23:04:20 -05:00
|
|
|
break;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
2022-02-28 23:04:20 -05:00
|
|
|
if (!mnt_allow_writers(kattr, m)) {
|
|
|
|
|
err = mnt_hold_writers(m);
|
|
|
|
|
if (err)
|
|
|
|
|
break;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
}
|
|
|
|
|
|
2025-01-28 11:33:42 +01:00
|
|
|
if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
|
2022-02-28 23:04:20 -05:00
|
|
|
return 0;
|
|
|
|
|
}
|
2021-01-21 14:19:54 +01:00
|
|
|
|
2022-02-28 23:04:20 -05:00
|
|
|
if (err) {
|
|
|
|
|
struct mount *p;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
2022-04-20 15:19:25 +02:00
|
|
|
/*
|
|
|
|
|
* If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
|
|
|
|
|
* be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
|
|
|
|
|
* mounts and needs to take care to include the first mount.
|
|
|
|
|
*/
|
|
|
|
|
for (p = mnt; p; p = next_mnt(p, mnt)) {
|
2022-02-28 23:04:20 -05:00
|
|
|
/* If we had to hold writers unblock them. */
|
|
|
|
|
if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
|
|
|
|
|
mnt_unhold_writers(p);
|
2022-04-20 15:19:25 +02:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We're done once the first mount we changed got
|
|
|
|
|
* MNT_WRITE_HOLD unset.
|
|
|
|
|
*/
|
|
|
|
|
if (p == m)
|
|
|
|
|
break;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
}
|
2022-02-28 23:04:20 -05:00
|
|
|
}
|
|
|
|
|
return err;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
}
|
|
|
|
|
|
2021-01-21 14:19:54 +01:00
|
|
|
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
|
|
|
|
|
{
|
fs: allow changing idmappings
This patchset makes it possible to create a new idmapped mount from an
already idmapped mount and to clear idmappings.
// Create a first idmapped mount
struct mount_attr attr = {
.attr_set = MOUNT_ATTR_IDMAP
.userns_fd = fd_userns
};
fd_tree = open_tree(-EBADF, "/", OPEN_TREE_CLONE, &attr, sizeof(attr));
move_mount(fd_tree, "", -EBADF, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
// Create a second idmapped mount from the first idmapped mount
attr.attr_set = MOUNT_ATTR_IDMAP;
attr.userns_fd = fd_userns2;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
// Create a second non-idmapped mount from the first idmapped mount:
memset(&attr, 0, sizeof(attr));
attr.attr_clr = MOUNT_ATTR_IDMAP;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-5-c25feb0d2eb3@kernel.org
Reviewed-by: "Seth Forshee (DigitalOcean)" <sforshee@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-01-28 11:33:43 +01:00
|
|
|
struct mnt_idmap *old_idmap;
|
|
|
|
|
|
2022-10-26 12:51:27 +02:00
|
|
|
if (!kattr->mnt_idmap)
|
2021-01-21 14:19:54 +01:00
|
|
|
return;
|
|
|
|
|
|
fs: allow changing idmappings
This patchset makes it possible to create a new idmapped mount from an
already idmapped mount and to clear idmappings.
// Create a first idmapped mount
struct mount_attr attr = {
.attr_set = MOUNT_ATTR_IDMAP
.userns_fd = fd_userns
};
fd_tree = open_tree(-EBADF, "/", OPEN_TREE_CLONE, &attr, sizeof(attr));
move_mount(fd_tree, "", -EBADF, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
// Create a second idmapped mount from the first idmapped mount
attr.attr_set = MOUNT_ATTR_IDMAP;
attr.userns_fd = fd_userns2;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
// Create a second non-idmapped mount from the first idmapped mount:
memset(&attr, 0, sizeof(attr));
attr.attr_clr = MOUNT_ATTR_IDMAP;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-5-c25feb0d2eb3@kernel.org
Reviewed-by: "Seth Forshee (DigitalOcean)" <sforshee@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-01-28 11:33:43 +01:00
|
|
|
old_idmap = mnt_idmap(&mnt->mnt);
|
|
|
|
|
|
|
|
|
|
/* Pairs with smp_load_acquire() in mnt_idmap(). */
|
2022-10-26 12:51:27 +02:00
|
|
|
smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
|
fs: allow changing idmappings
This patchset makes it possible to create a new idmapped mount from an
already idmapped mount and to clear idmappings.
// Create a first idmapped mount
struct mount_attr attr = {
.attr_set = MOUNT_ATTR_IDMAP
.userns_fd = fd_userns
};
fd_tree = open_tree(-EBADF, "/", OPEN_TREE_CLONE, &attr, sizeof(attr));
move_mount(fd_tree, "", -EBADF, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
// Create a second idmapped mount from the first idmapped mount
attr.attr_set = MOUNT_ATTR_IDMAP;
attr.userns_fd = fd_userns2;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
// Create a second non-idmapped mount from the first idmapped mount:
memset(&attr, 0, sizeof(attr));
attr.attr_clr = MOUNT_ATTR_IDMAP;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-5-c25feb0d2eb3@kernel.org
Reviewed-by: "Seth Forshee (DigitalOcean)" <sforshee@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-01-28 11:33:43 +01:00
|
|
|
mnt_idmap_put(old_idmap);
|
2021-01-21 14:19:54 +01:00
|
|
|
}
|
|
|
|
|
|
2022-02-28 23:04:20 -05:00
|
|
|
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
{
|
2022-02-28 23:04:20 -05:00
|
|
|
struct mount *m;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
2022-02-28 23:04:20 -05:00
|
|
|
for (m = mnt; m; m = next_mnt(m, mnt)) {
|
|
|
|
|
unsigned int flags;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
2022-02-28 23:04:20 -05:00
|
|
|
do_idmap_mount(kattr, m);
|
|
|
|
|
flags = recalc_flags(kattr, m);
|
|
|
|
|
WRITE_ONCE(m->mnt.mnt_flags, flags);
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
2022-02-03 14:14:09 +01:00
|
|
|
/* If we had to hold writers unblock them. */
|
|
|
|
|
if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
mnt_unhold_writers(m);
|
|
|
|
|
|
2022-02-28 23:04:20 -05:00
|
|
|
if (kattr->propagation)
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
change_mnt_propagation(m, kattr->propagation);
|
2025-01-28 11:33:42 +01:00
|
|
|
if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
break;
|
2022-02-28 23:04:20 -05:00
|
|
|
}
|
|
|
|
|
touch_mnt_namespace(mnt->mnt_ns);
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
|
|
|
|
|
{
|
2022-02-03 14:14:11 +01:00
|
|
|
struct mount *mnt = real_mount(path->mnt);
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets users specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
int err = 0;
|
|
|
|
|
|
2023-05-03 13:18:39 +02:00
|
|
|
if (!path_mounted(path))
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets users specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
2022-10-26 12:51:27 +02:00
|
|
|
if (kattr->mnt_userns) {
|
|
|
|
|
struct mnt_idmap *mnt_idmap;
|
|
|
|
|
|
|
|
|
|
mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
|
|
|
|
|
if (IS_ERR(mnt_idmap))
|
|
|
|
|
return PTR_ERR(mnt_idmap);
|
|
|
|
|
kattr->mnt_idmap = mnt_idmap;
|
|
|
|
|
}
|
|
|
|
|
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets users specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
if (kattr->propagation) {
|
|
|
|
|
/*
|
|
|
|
|
* Only take namespace_lock() if we're actually changing
|
|
|
|
|
* propagation.
|
|
|
|
|
*/
|
|
|
|
|
namespace_lock();
|
|
|
|
|
if (kattr->propagation == MS_SHARED) {
|
2025-01-28 11:33:42 +01:00
|
|
|
err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE);
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets users specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
if (err) {
|
|
|
|
|
namespace_unlock();
|
|
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-02-03 14:14:11 +01:00
|
|
|
err = -EINVAL;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets users specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
lock_mount_hash();
|
|
|
|
|
|
2025-06-08 23:25:36 -04:00
|
|
|
if (!anon_ns_root(mnt) && !check_mnt(mnt))
|
2022-02-03 14:14:11 +01:00
|
|
|
goto out;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets users specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
2022-02-03 14:14:11 +01:00
|
|
|
/*
|
|
|
|
|
* First, we get the mount tree in a shape where we can change mount
|
|
|
|
|
* properties without failure. If we succeeded to do so we commit all
|
|
|
|
|
* changes and if we failed we clean up.
|
|
|
|
|
*/
|
|
|
|
|
err = mount_setattr_prepare(kattr, mnt);
|
2022-02-28 23:04:20 -05:00
|
|
|
if (!err)
|
|
|
|
|
mount_setattr_commit(kattr, mnt);
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets users specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
2022-02-03 14:14:11 +01:00
|
|
|
out:
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
unlock_mount_hash();
|
|
|
|
|
|
|
|
|
|
if (kattr->propagation) {
|
|
|
|
|
if (err)
|
|
|
|
|
cleanup_group_ids(mnt, NULL);
|
2023-03-30 09:13:16 +02:00
|
|
|
namespace_unlock();
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-21 14:19:54 +01:00
|
|
|
/*
 * build_mount_idmapped - translate the MOUNT_ATTR_IDMAP request into @kattr
 * @attr:  mount attributes already copied in from userspace
 * @usize: size userspace passed for @attr (not consulted here)
 * @kattr: kernel-side attribute state to fill in
 *
 * Does nothing unless MOUNT_ATTR_IDMAP is present in @attr->attr_set or
 * @attr->attr_clr. A pure clear request points @kattr->mnt_idmap at
 * nop_mnt_idmap. Otherwise @attr->userns_fd is resolved to a user
 * namespace and the result of get_user_ns() is stored in
 * @kattr->mnt_userns.
 *
 * Return: 0 on success, -EINVAL for a malformed request, -EBADF if
 * @attr->userns_fd is not an open file, -EPERM if the namespace is
 * init_user_ns or the caller lacks CAP_SYS_ADMIN in it.
 */
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
				struct mount_kattr *kattr)
{
	const bool set_idmap = attr->attr_set & MOUNT_ATTR_IDMAP;
	const bool clr_idmap = attr->attr_clr & MOUNT_ATTR_IDMAP;
	struct user_namespace *userns;
	struct ns_common *nsc;

	/* Nothing idmap-related was requested. */
	if (!set_idmap && !clr_idmap)
		return 0;

	if (clr_idmap) {
		/*
		 * An idmapping may only be dropped while it has never
		 * been exposed to userspace.
		 */
		if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
			return -EINVAL;

		/*
		 * Clearing without setting a replacement is the same as
		 * installing nop_mnt_idmap.
		 */
		if (!set_idmap) {
			kattr->mnt_idmap = &nop_mnt_idmap;
			return 0;
		}
	}

	/* userns_fd is a __u64 in the uapi struct; it must fit an int fd. */
	if (attr->userns_fd > INT_MAX)
		return -EINVAL;

	CLASS(fd, f)(attr->userns_fd);
	if (fd_empty(f))
		return -EBADF;

	/* The fd has to refer to a namespace file ... */
	if (!proc_ns_file(fd_file(f)))
		return -EINVAL;

	/* ... and that namespace has to be a user namespace. */
	nsc = get_proc_ns(file_inode(fd_file(f)));
	if (nsc->ops->type != CLONE_NEWUSER)
		return -EINVAL;

	userns = container_of(nsc, struct user_namespace, ns);

	/*
	 * The initial idmapping is reserved as the marker for "this mount
	 * is not idmapped" and can be handed to idmap-aware helpers as a
	 * shortcut, so it cannot back an idmapped mount. A dedicated
	 * identity mapping gives userspace the same result.
	 *
	 * Beyond that, the caller must be privileged over the target
	 * namespace; the capability check is skipped for init_user_ns.
	 */
	if (userns == &init_user_ns || !ns_capable(userns, CAP_SYS_ADMIN))
		return -EPERM;

	kattr->mnt_userns = get_user_ns(userns);
	return 0;
}
|
|
|
|
|
|
|
|
|
|
/*
 * build_mount_kattr - validate @attr and convert it into @kattr
 * @attr:  mount attributes already copied in from userspace
 * @usize: size userspace passed for @attr (forwarded unchanged)
 * @kattr: kernel-side attribute state to fill in
 *
 * Checks the propagation value and the set/clear flag masks, converts the
 * flags with attr_flags_to_mnt_flags(), resolves the atime setting and
 * finally hands over to build_mount_idmapped().
 *
 * Note that @kattr may already have been partially written when an error
 * is returned.
 *
 * Return: 0 on success, -EINVAL on an invalid combination, or whatever
 * build_mount_idmapped() returns.
 */
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
			     struct mount_kattr *kattr)
{
	u64 set_atime;

	/* Only known propagation values, and at most one of them. */
	if ((attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) ||
	    hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
		return -EINVAL;
	kattr->propagation = attr->propagation;

	/* Reject flag bits this syscall does not understand. */
	if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
		return -EINVAL;

	kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
	kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);

	/*
	 * The MOUNT_ATTR_<atime> values form an enum rather than a bitmap.
	 * Switching to another atime mode therefore requires the caller to
	 * put the whole MOUNT_ATTR__ATIME mask into @attr_clr in addition
	 * to naming the new mode in @attr_set. Without that mask in
	 * @attr_clr, no atime bits may appear in @attr_set at all.
	 */
	if (!(attr->attr_clr & MOUNT_ATTR__ATIME)) {
		if (attr->attr_set & MOUNT_ATTR__ATIME)
			return -EINVAL;
		return build_mount_idmapped(attr, usize, kattr);
	}

	/* A partially specified atime mask in @attr_clr is invalid. */
	if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
		return -EINVAL;

	/* The atime modes are mutually exclusive: drop the old ones first. */
	kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;

	set_atime = attr->attr_set & MOUNT_ATTR__ATIME;
	if (set_atime == MOUNT_ATTR_RELATIME)
		kattr->attr_set |= MNT_RELATIME;
	else if (set_atime == MOUNT_ATTR_NOATIME)
		kattr->attr_set |= MNT_NOATIME;
	else if (set_atime != MOUNT_ATTR_STRICTATIME)
		return -EINVAL;
	/* MOUNT_ATTR_STRICTATIME needs no mount flag to be raised. */

	return build_mount_idmapped(attr, usize, kattr);
}
|
|
|
|
|
|
|
|
|
|
static void finish_mount_kattr(struct mount_kattr *kattr)
|
|
|
|
|
{
|
fs: allow changing idmappings
This patchset makes it possible to create a new idmapped mount from an
already idmapped mount and to clear idmappings.
// Create a first idmapped mount
struct mount_attr attr = {
.attr_set = MOUNT_ATTR_IDMAP
.userns_fd = fd_userns
};
fd_tree = open_tree(-EBADF, "/", OPEN_TREE_CLONE, &attr, sizeof(attr));
move_mount(fd_tree, "", -EBADF, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
// Create a second idmapped mount from the first idmapped mount
attr.attr_set = MOUNT_ATTR_IDMAP;
attr.userns_fd = fd_userns2;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
// Create a second non-idmapped mount from the first idmapped mount:
memset(&attr, 0, sizeof(attr));
attr.attr_clr = MOUNT_ATTR_IDMAP;
fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr));
Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-5-c25feb0d2eb3@kernel.org
Reviewed-by: "Seth Forshee (DigitalOcean)" <sforshee@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-01-28 11:33:43 +01:00
|
|
|
if (kattr->mnt_userns) {
|
|
|
|
|
put_user_ns(kattr->mnt_userns);
|
|
|
|
|
kattr->mnt_userns = NULL;
|
|
|
|
|
}
|
2022-10-26 12:51:27 +02:00
|
|
|
|
|
|
|
|
if (kattr->mnt_idmap)
|
|
|
|
|
mnt_idmap_put(kattr->mnt_idmap);
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
}
|
|
|
|
|
|
2025-04-09 10:00:23 +02:00
|
|
|
static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize,
|
|
|
|
|
struct mount_kattr *kattr)
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
{
|
2025-01-28 11:33:40 +01:00
|
|
|
int ret;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
struct mount_attr attr;
|
|
|
|
|
|
|
|
|
|
BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
|
|
|
|
|
|
|
|
|
|
if (unlikely(usize > PAGE_SIZE))
|
|
|
|
|
return -E2BIG;
|
|
|
|
|
if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
if (!may_mount())
|
|
|
|
|
return -EPERM;
|
|
|
|
|
|
2025-01-28 11:33:40 +01:00
|
|
|
ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
|
|
|
|
/* Don't bother walking through the mounts if this is a nop. */
|
|
|
|
|
if (attr.attr_set == 0 &&
|
|
|
|
|
attr.attr_clr == 0 &&
|
|
|
|
|
attr.propagation == 0)
|
2025-04-09 10:00:23 +02:00
|
|
|
return 0; /* Tell caller to not bother. */
|
|
|
|
|
|
|
|
|
|
ret = build_mount_kattr(&attr, usize, kattr);
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
return ret;
|
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
|
|
|
|
2025-04-09 10:00:23 +02:00
|
|
|
return 1;
|
2025-01-28 11:33:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * mount_setattr() - change attributes of a mount or a whole mount tree.
 *
 * @dfd/@path:	location of the mount, resolved with user_path_at()
 * @flags:	AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW and
 *		AT_NO_AUTOMOUNT; any other bit yields -EINVAL
 * @uattr:	user pointer to the struct mount_attr describing the change
 * @usize:	size of the structure at @uattr as seen by the caller
 *
 * Returns whatever wants_mount_setattr() returned if that is zero or
 * negative (no path lookup is done in that case), otherwise the result of
 * the path lookup or of do_mount_setattr().
 */
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
		unsigned int, flags, struct mount_attr __user *, uattr,
		size_t, usize)
{
	const unsigned int valid_flags = AT_EMPTY_PATH | AT_RECURSIVE |
					 AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT;
	struct mount_kattr kattr;
	struct path target;
	int err;

	if (flags & ~valid_flags)
		return -EINVAL;

	/* Follow symlinks and trigger automounts unless told otherwise. */
	kattr = (struct mount_kattr) {
		.lookup_flags	= LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW,
	};
	if (flags & AT_NO_AUTOMOUNT)
		kattr.lookup_flags &= ~LOOKUP_AUTOMOUNT;
	if (flags & AT_SYMLINK_NOFOLLOW)
		kattr.lookup_flags &= ~LOOKUP_FOLLOW;
	if (flags & AT_EMPTY_PATH)
		kattr.lookup_flags |= LOOKUP_EMPTY;

	if (flags & AT_RECURSIVE)
		kattr.kflags |= MOUNT_KATTR_RECURSE;

	/* Zero or negative: hand the value straight back, nothing to undo. */
	err = wants_mount_setattr(uattr, usize, &kattr);
	if (err <= 0)
		return err;

	err = user_path_at(dfd, path, kattr.lookup_flags, &target);
	if (err == 0) {
		err = do_mount_setattr(&target, &kattr);
		path_put(&target);
	}

	/* Runs whether or not the lookup succeeded. */
	finish_mount_kattr(&kattr);
	return err;
}
|
|
|
|
|
|
2025-01-28 11:33:41 +01:00
|
|
|
/*
 * open_tree_attr() - open a mount tree and optionally change its attributes.
 *
 * Opens the tree via vfs_open_tree() and, when @uattr is non-NULL, applies
 * the requested mount attributes to it before a file descriptor is
 * allocated. A NULL @uattr with a non-zero @usize is rejected with -EINVAL.
 *
 * Returns the new file descriptor on success or a negative error code.
 */
SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
		unsigned, flags, struct mount_attr __user *, uattr,
		size_t, usize)
{
	struct file __free(fput) *file = NULL;
	int fd;

	if (!uattr && usize)
		return -EINVAL;

	file = vfs_open_tree(dfd, filename, flags);
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (uattr) {
		struct mount_kattr kattr = {
			.kflags = MOUNT_KATTR_IDMAP_REPLACE,
		};
		int ret;

		if (flags & AT_RECURSIVE)
			kattr.kflags |= MOUNT_KATTR_RECURSE;

		ret = wants_mount_setattr(uattr, usize, &kattr);
		if (ret < 0)
			return ret;

		/* A positive return means there is something to apply. */
		if (ret > 0) {
			ret = do_mount_setattr(&file->f_path, &kattr);
			finish_mount_kattr(&kattr);
			if (ret)
				return ret;
		}
	}

	fd = get_unused_fd_flags(flags & O_CLOEXEC);
	if (fd < 0)
		return fd;

	/* Ownership of the file moves to the fd table; disarm __free(fput). */
	fd_install(fd, no_free_ptr(file));
	return fd;
}
|
|
|
|
|
|
2023-10-25 16:02:01 +02:00
|
|
|
int show_path(struct seq_file *m, struct dentry *root)
|
|
|
|
|
{
|
|
|
|
|
if (root->d_sb->s_op->show_path)
|
|
|
|
|
return root->d_sb->s_op->show_path(m, root);
|
|
|
|
|
|
|
|
|
|
seq_dentry(m, root, " \t\n\\");
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-25 16:02:02 +02:00
|
|
|
/*
 * Find the mount with unique id @id in namespace @ns.
 *
 * The candidate returned by mnt_find_id_at() is accepted only when its
 * unique id matches @id exactly; otherwise NULL is returned.
 */
static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
{
	struct mount *found = mnt_find_id_at(ns, id);

	if (found && found->mnt_id_unique == id)
		return &found->mnt;

	return NULL;
}
|
|
|
|
|
|
|
|
|
|
struct kstatmount {
|
2023-11-19 20:58:49 +01:00
|
|
|
struct statmount __user *buf;
|
|
|
|
|
size_t bufsize;
|
|
|
|
|
struct vfsmount *mnt;
|
statmount: allow to retrieve idmappings
This adds the STATMOUNT_MNT_UIDMAP and STATMOUNT_MNT_GIDMAP options.
It allows the retrieval of idmappings via statmount().
Currently it isn't possible to figure out what idmappings are applied to
an idmapped mount. This information is often crucial. Before statmount()
the only realistic options for an interface like this would have been to
add it to /proc/<pid>/fdinfo/<nr> or to expose it in
/proc/<pid>/mountinfo. Both solution would have been pretty ugly and
would've shown information that is of strong interest to some
application but not all. statmount() is perfect for this.
The idmappings applied to an idmapped mount are shown relative to the
caller's user namespace. This is the most useful solution that doesn't
risk leaking information or confuse the caller.
For example, an idmapped mount might have been created with the
following idmappings:
mount --bind -o X-mount.idmap="0:10000:1000 2000:2000:1 3000:3000:1" /srv /opt
Listing the idmappings through statmount() in the same context shows:
mnt_id: 2147485088
mnt_parent_id: 2147484816
fs_type: btrfs
mnt_root: /srv
mnt_point: /opt
mnt_opts: ssd,discard=async,space_cache=v2,subvolid=5,subvol=/
mnt_uidmap[0]: 0 10000 1000
mnt_uidmap[1]: 2000 2000 1
mnt_uidmap[2]: 3000 3000 1
mnt_gidmap[0]: 0 10000 1000
mnt_gidmap[1]: 2000 2000 1
mnt_gidmap[2]: 3000 3000 1
But the idmappings might not always be resolvable in the caller's user
namespace. For example:
unshare --user --map-root
In this case statmount() will skip any mappings that fil to resolve in
the caller's idmapping:
mnt_id: 2147485087
mnt_parent_id: 2147484016
fs_type: btrfs
mnt_root: /srv
mnt_point: /opt
mnt_opts: ssd,discard=async,space_cache=v2,subvolid=5,subvol=/
The caller can differentiate between a mount not being idmapped and a
mount that is idmapped but where all mappings fail to resolve in the
caller's idmapping by check for the STATMOUNT_MNT_{G,U}IDMAP flag being
raised but the number of mappings in ->mnt_{g,u}idmap_num being zero.
Note that statmount() requires that the whole range must be resolvable
in the caller's user namespace. If a subrange fails to map it will still
list the map as not resolvable. This is a practical compromise to avoid
having to find which subranges are resovable and wich aren't.
Idmappings are listed as a string array with each mapping separated by
zero bytes. This allows to retrieve the idmappings and immediately use
them for writing to e.g., /proc/<pid>/{g,u}id_map and it also allow for
simple iteration like:
if (stmnt->mask & STATMOUNT_MNT_UIDMAP) {
const char *idmap = stmnt->str + stmnt->mnt_uidmap;
for (size_t idx = 0; idx < stmnt->mnt_uidmap_nr; idx++) {
printf("mnt_uidmap[%lu]: %s\n", idx, idmap);
idmap += strlen(idmap) + 1;
}
}
Link: https://lore.kernel.org/r/20250204-work-mnt_idmap-statmount-v2-2-007720f39f2e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-02-04 12:27:47 +01:00
|
|
|
struct mnt_idmap *idmap;
|
2023-11-19 20:58:49 +01:00
|
|
|
u64 mask;
|
2023-10-25 16:02:02 +02:00
|
|
|
struct path root;
|
2023-11-19 20:58:49 +01:00
|
|
|
struct seq_file seq;
|
2025-03-26 18:17:44 -06:00
|
|
|
|
|
|
|
|
/* Must be last --ends in a flexible-array member. */
|
|
|
|
|
struct statmount sm;
|
2023-10-25 16:02:02 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static u64 mnt_to_attr_flags(struct vfsmount *mnt)
|
|
|
|
|
{
|
|
|
|
|
unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
|
|
|
|
|
u64 attr_flags = 0;
|
|
|
|
|
|
|
|
|
|
if (mnt_flags & MNT_READONLY)
|
|
|
|
|
attr_flags |= MOUNT_ATTR_RDONLY;
|
|
|
|
|
if (mnt_flags & MNT_NOSUID)
|
|
|
|
|
attr_flags |= MOUNT_ATTR_NOSUID;
|
|
|
|
|
if (mnt_flags & MNT_NODEV)
|
|
|
|
|
attr_flags |= MOUNT_ATTR_NODEV;
|
|
|
|
|
if (mnt_flags & MNT_NOEXEC)
|
|
|
|
|
attr_flags |= MOUNT_ATTR_NOEXEC;
|
|
|
|
|
if (mnt_flags & MNT_NODIRATIME)
|
|
|
|
|
attr_flags |= MOUNT_ATTR_NODIRATIME;
|
|
|
|
|
if (mnt_flags & MNT_NOSYMFOLLOW)
|
|
|
|
|
attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
|
|
|
|
|
|
|
|
|
|
if (mnt_flags & MNT_NOATIME)
|
|
|
|
|
attr_flags |= MOUNT_ATTR_NOATIME;
|
|
|
|
|
else if (mnt_flags & MNT_RELATIME)
|
|
|
|
|
attr_flags |= MOUNT_ATTR_RELATIME;
|
|
|
|
|
else
|
|
|
|
|
attr_flags |= MOUNT_ATTR_STRICTATIME;
|
|
|
|
|
|
|
|
|
|
if (is_idmapped_mnt(mnt))
|
|
|
|
|
attr_flags |= MOUNT_ATTR_IDMAP;
|
|
|
|
|
|
|
|
|
|
return attr_flags;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static u64 mnt_to_propagation_flags(struct mount *m)
|
|
|
|
|
{
|
|
|
|
|
u64 propagation = 0;
|
|
|
|
|
|
|
|
|
|
if (IS_MNT_SHARED(m))
|
|
|
|
|
propagation |= MS_SHARED;
|
|
|
|
|
if (IS_MNT_SLAVE(m))
|
|
|
|
|
propagation |= MS_SLAVE;
|
|
|
|
|
if (IS_MNT_UNBINDABLE(m))
|
|
|
|
|
propagation |= MS_UNBINDABLE;
|
|
|
|
|
if (!propagation)
|
|
|
|
|
propagation |= MS_PRIVATE;
|
|
|
|
|
|
|
|
|
|
return propagation;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-19 19:03:33 +01:00
|
|
|
static void statmount_sb_basic(struct kstatmount *s)
|
2023-10-25 16:02:02 +02:00
|
|
|
{
|
|
|
|
|
struct super_block *sb = s->mnt->mnt_sb;
|
|
|
|
|
|
2023-11-19 19:03:33 +01:00
|
|
|
s->sm.mask |= STATMOUNT_SB_BASIC;
|
2023-10-25 16:02:02 +02:00
|
|
|
s->sm.sb_dev_major = MAJOR(sb->s_dev);
|
|
|
|
|
s->sm.sb_dev_minor = MINOR(sb->s_dev);
|
|
|
|
|
s->sm.sb_magic = sb->s_magic;
|
|
|
|
|
s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-19 19:03:33 +01:00
|
|
|
static void statmount_mnt_basic(struct kstatmount *s)
|
2023-10-25 16:02:02 +02:00
|
|
|
{
|
|
|
|
|
struct mount *m = real_mount(s->mnt);
|
|
|
|
|
|
2023-11-19 19:03:33 +01:00
|
|
|
s->sm.mask |= STATMOUNT_MNT_BASIC;
|
2023-10-25 16:02:02 +02:00
|
|
|
s->sm.mnt_id = m->mnt_id_unique;
|
|
|
|
|
s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
|
|
|
|
|
s->sm.mnt_id_old = m->mnt_id;
|
|
|
|
|
s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
|
|
|
|
|
s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
|
|
|
|
|
s->sm.mnt_propagation = mnt_to_propagation_flags(m);
|
|
|
|
|
s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
|
|
|
|
|
s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-19 19:03:33 +01:00
|
|
|
static void statmount_propagate_from(struct kstatmount *s)
|
2023-10-25 16:02:02 +02:00
|
|
|
{
|
|
|
|
|
struct mount *m = real_mount(s->mnt);
|
|
|
|
|
|
2023-11-19 19:03:33 +01:00
|
|
|
s->sm.mask |= STATMOUNT_PROPAGATE_FROM;
|
|
|
|
|
if (IS_MNT_SLAVE(m))
|
|
|
|
|
s->sm.propagate_from = get_dominating_id(m, ¤t->fs->root);
|
2023-10-25 16:02:02 +02:00
|
|
|
}
|
|
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq)
|
2023-10-25 16:02:02 +02:00
|
|
|
{
|
2023-11-19 20:58:49 +01:00
|
|
|
int ret;
|
|
|
|
|
size_t start = seq->count;
|
2023-10-25 16:02:02 +02:00
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
ret = show_path(seq, s->mnt->mnt_root);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
|
|
if (unlikely(seq_has_overflowed(seq)))
|
|
|
|
|
return -EAGAIN;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Unescape the result. It would be better if supplied string was not
|
|
|
|
|
* escaped in the first place, but that's a pretty invasive change.
|
|
|
|
|
*/
|
|
|
|
|
seq->buf[seq->count] = '\0';
|
|
|
|
|
seq->count = start;
|
|
|
|
|
seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
|
|
|
|
|
return 0;
|
2023-10-25 16:02:02 +02:00
|
|
|
}
|
|
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq)
|
2023-10-25 16:02:02 +02:00
|
|
|
{
|
|
|
|
|
struct vfsmount *mnt = s->mnt;
|
|
|
|
|
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
|
2023-11-19 20:58:49 +01:00
|
|
|
int err;
|
2023-10-25 16:02:02 +02:00
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
err = seq_path_root(seq, &mnt_path, &s->root, "");
|
2023-10-25 16:02:02 +02:00
|
|
|
return err == SEQ_SKIP ? 0 : err;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
|
2023-10-25 16:02:02 +02:00
|
|
|
{
|
|
|
|
|
struct super_block *sb = s->mnt->mnt_sb;
|
|
|
|
|
|
|
|
|
|
seq_puts(seq, sb->s_type->name);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2024-11-11 10:09:56 -05:00
|
|
|
static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq)
|
|
|
|
|
{
|
|
|
|
|
struct super_block *sb = s->mnt->mnt_sb;
|
|
|
|
|
|
|
|
|
|
if (sb->s_subtype)
|
|
|
|
|
seq_puts(seq, sb->s_subtype);
|
|
|
|
|
}
|
|
|
|
|
|
2024-11-11 10:09:57 -05:00
|
|
|
static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
|
|
|
|
|
{
|
|
|
|
|
struct super_block *sb = s->mnt->mnt_sb;
|
|
|
|
|
struct mount *r = real_mount(s->mnt);
|
|
|
|
|
|
|
|
|
|
if (sb->s_op->show_devname) {
|
|
|
|
|
size_t start = seq->count;
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
ret = sb->s_op->show_devname(seq, s->mnt->mnt_root);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
|
|
if (unlikely(seq_has_overflowed(seq)))
|
|
|
|
|
return -EAGAIN;
|
|
|
|
|
|
|
|
|
|
/* Unescape the result */
|
|
|
|
|
seq->buf[seq->count] = '\0';
|
|
|
|
|
seq->count = start;
|
|
|
|
|
seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
|
2025-04-21 04:35:09 +01:00
|
|
|
} else {
|
2024-11-11 10:09:57 -05:00
|
|
|
seq_puts(seq, r->mnt_devname);
|
|
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-24 11:49:49 -04:00
|
|
|
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
|
2024-06-24 11:49:47 -04:00
|
|
|
{
|
|
|
|
|
s->sm.mask |= STATMOUNT_MNT_NS_ID;
|
|
|
|
|
s->sm.mnt_ns_id = ns->seq;
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-24 15:40:52 -04:00
|
|
|
static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
|
|
|
|
|
{
|
|
|
|
|
struct vfsmount *mnt = s->mnt;
|
|
|
|
|
struct super_block *sb = mnt->mnt_sb;
|
2025-01-29 16:12:53 +01:00
|
|
|
size_t start = seq->count;
|
2024-06-24 15:40:52 -04:00
|
|
|
int err;
|
|
|
|
|
|
2025-01-29 16:12:53 +01:00
|
|
|
err = security_sb_show_options(seq, sb);
|
|
|
|
|
if (err)
|
|
|
|
|
return err;
|
2024-11-15 10:35:53 -05:00
|
|
|
|
2025-01-29 16:12:53 +01:00
|
|
|
if (sb->s_op->show_options) {
|
2024-06-24 15:40:52 -04:00
|
|
|
err = sb->s_op->show_options(seq, mnt->mnt_root);
|
|
|
|
|
if (err)
|
|
|
|
|
return err;
|
2025-01-29 16:12:53 +01:00
|
|
|
}
|
2024-06-24 15:40:52 -04:00
|
|
|
|
2025-01-29 16:12:53 +01:00
|
|
|
if (unlikely(seq_has_overflowed(seq)))
|
|
|
|
|
return -EAGAIN;
|
2024-06-24 15:40:52 -04:00
|
|
|
|
2025-01-29 16:12:53 +01:00
|
|
|
if (seq->count == start)
|
|
|
|
|
return 0;
|
2024-06-24 15:40:52 -04:00
|
|
|
|
2025-01-29 16:12:53 +01:00
|
|
|
/* skip leading comma */
|
|
|
|
|
memmove(seq->buf + start, seq->buf + start + 1,
|
|
|
|
|
seq->count - start - 1);
|
|
|
|
|
seq->count--;
|
2024-06-24 15:40:52 -04:00
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2024-11-20 15:27:23 +01:00
|
|
|
/*
 * Turn the comma separated, escaped option string that was written to @seq
 * starting at offset @start into a sequence of unescaped, NUL terminated
 * strings, rewriting the buffer in place.
 *
 * Returns the number of options, 0 when nothing but (at most) the initial
 * comma was written, -EAGAIN when the seq buffer overflowed, or -EOVERFLOW
 * if the option count would reach INT_MAX.
 */
static inline int statmount_opt_process(struct seq_file *seq, size_t start)
{
	char *end, *in, *out;
	int nr_opts = 0;

	if (unlikely(seq_has_overflowed(seq)))
		return -EAGAIN;

	end = seq->buf + seq->count;
	out = seq->buf + start;
	in = out + 1;	/* skip initial comma */

	if (in >= end) {
		seq->count = start;
		return 0;
	}

	*end = '\0';
	while (in < end) {
		char *sep = strchrnul(in, ',');

		/* Terminate this option, then unescape it towards the front. */
		*sep = '\0';
		out += string_unescape(in, out, 0, UNESCAPE_OCTAL) + 1;
		if (WARN_ON_ONCE(++nr_opts == INT_MAX))
			return -EOVERFLOW;

		in = sep + 1;
	}

	/* The final terminator is not counted in the committed length. */
	seq->count = out - 1 - seq->buf;
	return nr_opts;
}
|
|
|
|
|
|
statmount: add flag to retrieve unescaped options
Filesystem options can be retrieved with STATMOUNT_MNT_OPTS, which
returns a string of comma separated options, where some characters are
escaped using the \OOO notation.
Add a new flag, STATMOUNT_OPT_ARRAY, which instead returns the raw
option values separated with '\0' characters.
Since escaped characters are rare, this interface is preferable for
non-libmount users which likely don't want to deal with option
de-escaping.
Example code:
if (st->mask & STATMOUNT_OPT_ARRAY) {
const char *opt = st->str + st->opt_array;
for (unsigned int i = 0; i < st->opt_num; i++) {
printf("opt_array[%i]: <%s>\n", i, opt);
opt += strlen(opt) + 1;
}
}
Example output:
(1) mnt_opts: <lowerdir+=/l\054w\054r,lowerdir+=/l\054w\054r1,upperdir=/upp\054r,workdir=/w\054rk,redirect_dir=nofollow,uuid=null>
(2) opt_array[0]: <lowerdir+=/l,w,r>
opt_array[1]: <lowerdir+=/l,w,r1>
opt_array[2]: <upperdir=/upp,r>
opt_array[3]: <workdir=/w,rk>
opt_array[4]: <redirect_dir=nofollow>
opt_array[5]: <uuid=null>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20241112101006.30715-1-mszeredi@redhat.com
Acked-by: Jeff Layton <jlayton@kernel.org>
[brauner: tweak variable naming and parsing add example output]
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-11-12 11:10:04 +01:00
|
|
|
static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq)
|
|
|
|
|
{
|
|
|
|
|
struct vfsmount *mnt = s->mnt;
|
|
|
|
|
struct super_block *sb = mnt->mnt_sb;
|
|
|
|
|
size_t start = seq->count;
|
|
|
|
|
int err;
|
|
|
|
|
|
|
|
|
|
if (!sb->s_op->show_options)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
err = sb->s_op->show_options(seq, mnt->mnt_root);
|
|
|
|
|
if (err)
|
|
|
|
|
return err;
|
|
|
|
|
|
2024-11-20 15:27:23 +01:00
|
|
|
err = statmount_opt_process(seq, start);
|
2024-11-14 16:31:27 +01:00
|
|
|
if (err < 0)
|
|
|
|
|
return err;
|
|
|
|
|
|
|
|
|
|
s->sm.opt_num = err;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
|
|
|
|
|
{
|
|
|
|
|
struct vfsmount *mnt = s->mnt;
|
|
|
|
|
struct super_block *sb = mnt->mnt_sb;
|
|
|
|
|
size_t start = seq->count;
|
|
|
|
|
int err;
|
|
|
|
|
|
|
|
|
|
err = security_sb_show_options(seq, sb);
|
2024-11-20 09:17:25 +01:00
|
|
|
if (err)
|
2024-11-14 16:31:27 +01:00
|
|
|
return err;
|
|
|
|
|
|
2024-11-20 15:27:23 +01:00
|
|
|
err = statmount_opt_process(seq, start);
|
2024-11-14 16:31:27 +01:00
|
|
|
if (err < 0)
|
|
|
|
|
return err;
|
|
|
|
|
|
|
|
|
|
s->sm.opt_sec_num = err;
|
statmount: add flag to retrieve unescaped options
Filesystem options can be retrieved with STATMOUNT_MNT_OPTS, which
returns a string of comma separated options, where some characters are
escaped using the \OOO notation.
Add a new flag, STATMOUNT_OPT_ARRAY, which instead returns the raw
option values separated with '\0' charaters.
Since escaped charaters are rare, this inteface is preferable for
non-libmount users which likley don't want to deal with option
de-escaping.
Example code:
if (st->mask & STATMOUNT_OPT_ARRAY) {
const char *opt = st->str + st->opt_array;
for (unsigned int i = 0; i < st->opt_num; i++) {
printf("opt_array[%i]: <%s>\n", i, opt);
opt += strlen(opt) + 1;
}
}
Example ouput:
(1) mnt_opts: <lowerdir+=/l\054w\054r,lowerdir+=/l\054w\054r1,upperdir=/upp\054r,workdir=/w\054rk,redirect_dir=nofollow,uuid=null>
(2) opt_array[0]: <lowerdir+=/l,w,r>
opt_array[1]: <lowerdir+=/l,w,r1>
opt_array[2]: <upperdir=/upp,r>
opt_array[3]: <workdir=/w,rk>
opt_array[4]: <redirect_dir=nofollow>
opt_array[5]: <uuid=null>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20241112101006.30715-1-mszeredi@redhat.com
Acked-by: Jeff Layton <jlayton@kernel.org>
[brauner: tweak variable naming and parsing add example output]
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-11-12 11:10:04 +01:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
statmount: allow to retrieve idmappings
This adds the STATMOUNT_MNT_UIDMAP and STATMOUNT_MNT_GIDMAP options.
It allows the retrieval of idmappings via statmount().
Currently it isn't possible to figure out what idmappings are applied to
an idmapped mount. This information is often crucial. Before statmount()
the only realistic options for an interface like this would have been to
add it to /proc/<pid>/fdinfo/<nr> or to expose it in
/proc/<pid>/mountinfo. Both solutions would have been pretty ugly and
would've shown information that is of strong interest to some
application but not all. statmount() is perfect for this.
The idmappings applied to an idmapped mount are shown relative to the
caller's user namespace. This is the most useful solution that doesn't
risk leaking information or confuse the caller.
For example, an idmapped mount might have been created with the
following idmappings:
mount --bind -o X-mount.idmap="0:10000:1000 2000:2000:1 3000:3000:1" /srv /opt
Listing the idmappings through statmount() in the same context shows:
mnt_id: 2147485088
mnt_parent_id: 2147484816
fs_type: btrfs
mnt_root: /srv
mnt_point: /opt
mnt_opts: ssd,discard=async,space_cache=v2,subvolid=5,subvol=/
mnt_uidmap[0]: 0 10000 1000
mnt_uidmap[1]: 2000 2000 1
mnt_uidmap[2]: 3000 3000 1
mnt_gidmap[0]: 0 10000 1000
mnt_gidmap[1]: 2000 2000 1
mnt_gidmap[2]: 3000 3000 1
But the idmappings might not always be resolvable in the caller's user
namespace. For example:
unshare --user --map-root
In this case statmount() will skip any mappings that fail to resolve in
the caller's idmapping:
mnt_id: 2147485087
mnt_parent_id: 2147484016
fs_type: btrfs
mnt_root: /srv
mnt_point: /opt
mnt_opts: ssd,discard=async,space_cache=v2,subvolid=5,subvol=/
The caller can differentiate between a mount not being idmapped and a
mount that is idmapped but where all mappings fail to resolve in the
caller's idmapping by checking for the STATMOUNT_MNT_{G,U}IDMAP flag being
raised but the number of mappings in ->mnt_{g,u}idmap_num being zero.
Note that statmount() requires that the whole range must be resolvable
in the caller's user namespace. If a subrange fails to map it will still
list the map as not resolvable. This is a practical compromise to avoid
having to find which subranges are resolvable and which aren't.
Idmappings are listed as a string array with each mapping separated by
zero bytes. This allows to retrieve the idmappings and immediately use
them for writing to e.g., /proc/<pid>/{g,u}id_map and it also allows for
simple iteration like:
if (stmnt->mask & STATMOUNT_MNT_UIDMAP) {
const char *idmap = stmnt->str + stmnt->mnt_uidmap;
for (size_t idx = 0; idx < stmnt->mnt_uidmap_num; idx++) {
printf("mnt_uidmap[%zu]: %s\n", idx, idmap);
idmap += strlen(idmap) + 1;
}
}
Link: https://lore.kernel.org/r/20250204-work-mnt_idmap-statmount-v2-2-007720f39f2e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-02-04 12:27:47 +01:00
|
|
|
static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
ret = statmount_mnt_idmap(s->idmap, seq, true);
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
|
|
s->sm.mnt_uidmap_num = ret;
|
|
|
|
|
/*
|
|
|
|
|
* Always raise STATMOUNT_MNT_UIDMAP even if there are no valid
|
|
|
|
|
* mappings. This allows userspace to distinguish between a
|
|
|
|
|
* non-idmapped mount and an idmapped mount where none of the
|
|
|
|
|
* individual mappings are valid in the caller's idmapping.
|
|
|
|
|
*/
|
|
|
|
|
if (is_valid_mnt_idmap(s->idmap))
|
|
|
|
|
s->sm.mask |= STATMOUNT_MNT_UIDMAP;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
ret = statmount_mnt_idmap(s->idmap, seq, false);
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
|
|
s->sm.mnt_gidmap_num = ret;
|
|
|
|
|
/*
|
|
|
|
|
* Always raise STATMOUNT_MNT_GIDMAP even if there are no valid
|
|
|
|
|
* mappings. This allows userspace to distinguish between a
|
|
|
|
|
* non-idmapped mount and an idmapped mount where none of the
|
|
|
|
|
* individual mappings are valid in the caller's idmapping.
|
|
|
|
|
*/
|
|
|
|
|
if (is_valid_mnt_idmap(s->idmap))
|
|
|
|
|
s->sm.mask |= STATMOUNT_MNT_GIDMAP;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
/*
 * Emit one string-valued statmount field into the seq buffer.
 *
 * @s:    statmount request state (seq buffer, result struct, user buffer size)
 * @flag: exactly one STATMOUNT_* flag naming the string field to emit
 *
 * Each case only selects the offset slot for the field (@offp) and runs
 * the matching emitter. The slot is written, the string terminated and
 * @flag raised in sm->mask only after the emitter produced output and all
 * size checks passed. If nothing was emitted the slot is left untouched.
 *
 * Returns 0 on success, -EINVAL for an unknown @flag, -EOVERFLOW if the
 * result does not fit the user supplied buffer, -EAGAIN if the seq buffer
 * overflowed (signals a retry), or the error returned by the emitter.
 */
static int statmount_string(struct kstatmount *s, u64 flag)
{
	int ret = 0;
	size_t kbufsize;
	struct seq_file *seq = &s->seq;
	struct statmount *sm = &s->sm;
	u32 start, *offp;

	/* Reserve an empty string at the beginning for any unset offsets */
	if (!seq->count)
		seq_putc(seq, 0);

	start = seq->count;

	switch (flag) {
	case STATMOUNT_FS_TYPE:
		offp = &sm->fs_type;
		ret = statmount_fs_type(s, seq);
		break;
	case STATMOUNT_MNT_ROOT:
		offp = &sm->mnt_root;
		ret = statmount_mnt_root(s, seq);
		break;
	case STATMOUNT_MNT_POINT:
		offp = &sm->mnt_point;
		ret = statmount_mnt_point(s, seq);
		break;
	case STATMOUNT_MNT_OPTS:
		offp = &sm->mnt_opts;
		ret = statmount_mnt_opts(s, seq);
		break;
	case STATMOUNT_OPT_ARRAY:
		offp = &sm->opt_array;
		ret = statmount_opt_array(s, seq);
		break;
	case STATMOUNT_OPT_SEC_ARRAY:
		offp = &sm->opt_sec_array;
		ret = statmount_opt_sec_array(s, seq);
		break;
	case STATMOUNT_FS_SUBTYPE:
		offp = &sm->fs_subtype;
		/* Return value is not used here; ret stays 0. */
		statmount_fs_subtype(s, seq);
		break;
	case STATMOUNT_SB_SOURCE:
		offp = &sm->sb_source;
		ret = statmount_sb_source(s, seq);
		break;
	case STATMOUNT_MNT_UIDMAP:
		/*
		 * Route the offset through offp like every other field.
		 * Writing sm->mnt_uidmap directly left offp uninitialized
		 * for the "*offp = start" store below.
		 */
		offp = &sm->mnt_uidmap;
		ret = statmount_mnt_uidmap(s, seq);
		break;
	case STATMOUNT_MNT_GIDMAP:
		/* Same as STATMOUNT_MNT_UIDMAP: offp must always be valid. */
		offp = &sm->mnt_gidmap;
		ret = statmount_mnt_gidmap(s, seq);
		break;
	default:
		WARN_ON_ONCE(true);
		return -EINVAL;
	}

	/*
	 * If nothing was emitted, return to avoid setting the flag
	 * and terminating the buffer.
	 */
	if (seq->count == start)
		return ret;
	if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
		return -EOVERFLOW;
	if (kbufsize >= s->bufsize)
		return -EOVERFLOW;

	/* signal a retry */
	if (unlikely(seq_has_overflowed(seq)))
		return -EAGAIN;

	if (ret)
		return ret;

	/* Terminate the emitted string, then publish its flag and offset. */
	seq->buf[seq->count++] = '\0';
	sm->mask |= flag;
	*offp = start;
	return 0;
}
|
|
|
|
|
|
|
|
|
|
static int copy_statmount_to_user(struct kstatmount *s)
|
|
|
|
|
{
|
|
|
|
|
struct statmount *sm = &s->sm;
|
|
|
|
|
struct seq_file *seq = &s->seq;
|
|
|
|
|
char __user *str = ((char __user *)s->buf) + sizeof(*sm);
|
2023-10-25 16:02:02 +02:00
|
|
|
size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
|
2023-11-19 20:58:49 +01:00
|
|
|
|
|
|
|
|
if (seq->count && copy_to_user(str, seq->buf, seq->count))
|
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
|
|
/* Return the number of bytes copied to the buffer */
|
|
|
|
|
sm->size = copysize + seq->count;
|
|
|
|
|
if (copy_to_user(s->buf, sm, copysize))
|
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-25 14:33:45 +02:00
|
|
|
/*
 * Step from @curr to its neighbour in the rbtree that links mounts via
 * ->mnt_node: the previous node if @reverse is true, the next node
 * otherwise. The resulting rb_node is converted with node_to_mount().
 */
static struct mount *listmnt_next(struct mount *curr, bool reverse)
{
	struct rb_node *neighbour = reverse ? rb_prev(&curr->mnt_node)
					    : rb_next(&curr->mnt_node);

	return node_to_mount(neighbour);
}
|
|
|
|
|
|
|
|
|
|
static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
|
|
|
|
|
{
|
fs: find rootfs mount of the mount namespace
The method we used was predicated on the assumption that the mount
immediately following the root mount of the mount namespace would be the
rootfs mount of the namespace. That's not always the case though. For
example:
ID PARENT ID
408 412 0:60 /containers/overlay-containers/bc391117192b32071b22ef2083ebe7735d5c390f87a5779e02faf79ba0746ceb/userdata/hosts /etc/hosts rw,nosuid,nodev,relatime - tmpfs tmpfs rw,size=954664k,nr_inodes=238666,mode=700,uid=1000,gid=1000,inode64
409 414 0:61 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=64000k,uid=1000,gid=1000,inode64
410 412 0:60 /containers/overlay-containers/bc391117192b32071b22ef2083ebe7735d5c390f87a5779e02faf79ba0746ceb/userdata/.containerenv /run/.containerenv rw,nosuid,nodev,relatime - tmpfs tmpfs rw,size=954664k,nr_inodes=238666,mode=700,uid=1000,gid=1000,inode64
411 412 0:60 /containers/overlay-containers/bc391117192b32071b22ef2083ebe7735d5c390f87a5779e02faf79ba0746ceb/userdata/hostname /etc/hostname rw,nosuid,nodev,relatime - tmpfs tmpfs rw,size=954664k,nr_inodes=238666,mode=700,uid=1000,gid=1000,inode64
412 363 0:65 / / rw,relatime - overlay overlay rw,lowerdir=/home/user1/.local/share/containers/storage/overlay/l/JS65SUCGTPCP2EEBHLRP4UCFI5:/home/user1/.local/share/containers/storage/overlay/l/DLW22KVDWUNI4242D6SDJ5GKCL [...]
413 412 0:68 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
414 412 0:69 / /dev rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755,uid=1000,gid=1000,inode64
415 412 0:70 / /sys ro,nosuid,nodev,noexec,relatime - sysfs sysfs rw
416 414 0:71 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=100004,mode=620,ptmxmode=666
417 414 0:67 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw
418 415 0:27 / /sys/fs/cgroup ro,nosuid,nodev,noexec,relatime - cgroup2 cgroup2 rw,nsdelegate,memory_recursiveprot
419 414 0:6 /null /dev/null rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
420 414 0:6 /zero /dev/zero rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
422 414 0:6 /full /dev/full rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
423 414 0:6 /tty /dev/tty rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
430 414 0:6 /random /dev/random rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
431 414 0:6 /urandom /dev/urandom rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
433 413 0:72 / /proc/acpi ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
440 413 0:6 /null /proc/kcore ro,nosuid - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
441 413 0:6 /null /proc/keys ro,nosuid - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
442 413 0:6 /null /proc/timer_list ro,nosuid - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
443 413 0:73 / /proc/scsi ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
444 415 0:74 / /sys/firmware ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
445 415 0:75 / /sys/dev/block ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
446 413 0:68 /bus /proc/bus ro,nosuid,nodev,noexec,relatime - proc proc rw
447 413 0:68 /fs /proc/fs ro,nosuid,nodev,noexec,relatime - proc proc rw
448 413 0:68 /irq /proc/irq ro,nosuid,nodev,noexec,relatime - proc proc rw
449 413 0:68 /sys /proc/sys ro,nosuid,nodev,noexec,relatime - proc proc rw
450 413 0:68 /sysrq-trigger /proc/sysrq-trigger ro,nosuid,nodev,noexec,relatime - proc proc rw
364 414 0:71 /0 /dev/console rw,relatime - devpts devpts rw,gid=100004,mode=620,ptmxmode=666
In this mount table the root mount of the mount namespace is the mount
with id 363 (It isn't visible because it's literally just what the
rootfs mount is mounted upon and usually it's just a copy of the real
rootfs).
The rootfs mount that's mounted on the root mount of the mount namespace
is the mount with id 412. But the mount namespace contains mounts that
were created before the rootfs mount and thus have earlier mount ids. So
the first call to listmnt_next() would return the mount with the mount
id 408 and not the rootfs mount.
So we need to find the actual rootfs mount mounted on the root mount of
the mount namespace. This logic is also present in mntns_install() where
vfs_path_lookup() is used. We can't use this though as we're holding the
namespace semaphore. We could look at the children of the root mount of
the mount namespace directly but that also seems a bit out of place
while we have the rbtree. So let's just iterate through the rbtree
starting from the root mount of the mount namespace and find the mount
whose parent is the root mount of the mount namespace. That mount will
usually appear very early in the rbtree and afaik there can only be one.
IOW, it would be very strange if we ended up with a root mount of a
mount namespace that has shadow mounts.
Fixes: 0a3deb11858a ("fs: Allow listmount() in foreign mount namespace") # mainline only
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-07-05 13:08:54 +02:00
|
|
|
struct mount *first, *child;
|
2024-06-25 14:33:45 +02:00
|
|
|
|
|
|
|
|
rwsem_assert_held(&namespace_sem);
|
|
|
|
|
|
|
|
|
|
/* We're looking at our own ns, just use get_fs_root. */
|
|
|
|
|
if (ns == current->nsproxy->mnt_ns) {
|
|
|
|
|
get_fs_root(current->fs, root);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We have to find the first mount in our ns and use that, however it
|
|
|
|
|
* may not exist, so handle that properly.
|
|
|
|
|
*/
|
2025-02-21 14:13:01 +01:00
|
|
|
if (mnt_ns_empty(ns))
|
2024-06-25 14:33:45 +02:00
|
|
|
return -ENOENT;
|
|
|
|
|
|
fs: find rootfs mount of the mount namespace
The method we used was predicated on the assumption that the mount
immediately following the root mount of the mount namespace would be the
rootfs mount of the namespace. That's not always the case though. For
example:
ID PARENT ID
408 412 0:60 /containers/overlay-containers/bc391117192b32071b22ef2083ebe7735d5c390f87a5779e02faf79ba0746ceb/userdata/hosts /etc/hosts rw,nosuid,nodev,relatime - tmpfs tmpfs rw,size=954664k,nr_inodes=238666,mode=700,uid=1000,gid=1000,inode64
409 414 0:61 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=64000k,uid=1000,gid=1000,inode64
410 412 0:60 /containers/overlay-containers/bc391117192b32071b22ef2083ebe7735d5c390f87a5779e02faf79ba0746ceb/userdata/.containerenv /run/.containerenv rw,nosuid,nodev,relatime - tmpfs tmpfs rw,size=954664k,nr_inodes=238666,mode=700,uid=1000,gid=1000,inode64
411 412 0:60 /containers/overlay-containers/bc391117192b32071b22ef2083ebe7735d5c390f87a5779e02faf79ba0746ceb/userdata/hostname /etc/hostname rw,nosuid,nodev,relatime - tmpfs tmpfs rw,size=954664k,nr_inodes=238666,mode=700,uid=1000,gid=1000,inode64
412 363 0:65 / / rw,relatime - overlay overlay rw,lowerdir=/home/user1/.local/share/containers/storage/overlay/l/JS65SUCGTPCP2EEBHLRP4UCFI5:/home/user1/.local/share/containers/storage/overlay/l/DLW22KVDWUNI4242D6SDJ5GKCL [...]
413 412 0:68 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
414 412 0:69 / /dev rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755,uid=1000,gid=1000,inode64
415 412 0:70 / /sys ro,nosuid,nodev,noexec,relatime - sysfs sysfs rw
416 414 0:71 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=100004,mode=620,ptmxmode=666
417 414 0:67 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw
418 415 0:27 / /sys/fs/cgroup ro,nosuid,nodev,noexec,relatime - cgroup2 cgroup2 rw,nsdelegate,memory_recursiveprot
419 414 0:6 /null /dev/null rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
420 414 0:6 /zero /dev/zero rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
422 414 0:6 /full /dev/full rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
423 414 0:6 /tty /dev/tty rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
430 414 0:6 /random /dev/random rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
431 414 0:6 /urandom /dev/urandom rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
433 413 0:72 / /proc/acpi ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
440 413 0:6 /null /proc/kcore ro,nosuid - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
441 413 0:6 /null /proc/keys ro,nosuid - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
442 413 0:6 /null /proc/timer_list ro,nosuid - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
443 413 0:73 / /proc/scsi ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
444 415 0:74 / /sys/firmware ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
445 415 0:75 / /sys/dev/block ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
446 413 0:68 /bus /proc/bus ro,nosuid,nodev,noexec,relatime - proc proc rw
447 413 0:68 /fs /proc/fs ro,nosuid,nodev,noexec,relatime - proc proc rw
448 413 0:68 /irq /proc/irq ro,nosuid,nodev,noexec,relatime - proc proc rw
449 413 0:68 /sys /proc/sys ro,nosuid,nodev,noexec,relatime - proc proc rw
450 413 0:68 /sysrq-trigger /proc/sysrq-trigger ro,nosuid,nodev,noexec,relatime - proc proc rw
364 414 0:71 /0 /dev/console rw,relatime - devpts devpts rw,gid=100004,mode=620,ptmxmode=666
In this mount table the root mount of the mount namespace is the mount
with id 363 (It isn't visible because it's literally just what the
rootfs mount is mounted upon and usually it's just a copy of the real
rootfs).
The rootfs mount that's mounted on the root mount of the mount namespace
is the mount with id 412. But the mount namespace contains mounts that
were created before the rootfs mount and thus have earlier mount ids. So
the first call to listmnt_next() would return the mount with the mount
id 408 and not the rootfs mount.
So we need to find the actual rootfs mount mounted on the root mount of
the mount namespace. This logic is also present in mntns_install() where
vfs_path_lookup() is used. We can't use this though as we're holding the
namespace semaphore. We could look at the children of the root mount of
the mount namespace directly but that also seems a bit out of place
while we have the rbtree. So let's just iterate through the rbtree
starting from the root mount of the mount namespace and find the mount
whose parent is the root mount of the mount namespace. That mount will
usually appear very early in the rbtree and afaik there can only be one.
IOW, it would be very strange if we ended up with a root mount of a
mount namespace that has shadow mounts.
Fixes: 0a3deb11858a ("fs: Allow listmount() in foreign mount namespace") # mainline only
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-07-05 13:08:54 +02:00
|
|
|
first = child = ns->root;
|
|
|
|
|
for (;;) {
|
|
|
|
|
child = listmnt_next(child, false);
|
|
|
|
|
if (!child)
|
|
|
|
|
return -ENOENT;
|
|
|
|
|
if (child->mnt_parent == first)
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
root->mnt = mntget(&child->mnt);
|
2024-06-25 14:33:45 +02:00
|
|
|
root->dentry = dget(root->mnt->mnt_root);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-06 07:51:52 -05:00
|
|
|
/*
 * Mask of every STATMOUNT_* request flag listed as supported.
 * This must be updated whenever a new flag is added.
 */
#define STATMOUNT_SUPPORTED						\
	(STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC |			\
	 STATMOUNT_PROPAGATE_FROM |					\
	 STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT |			\
	 STATMOUNT_FS_TYPE | STATMOUNT_MNT_NS_ID |			\
	 STATMOUNT_MNT_OPTS |						\
	 STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE |			\
	 STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY |		\
	 STATMOUNT_SUPPORTED_MASK |					\
	 STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP)
|
2025-02-06 07:51:52 -05:00
|
|
|
|
2024-06-25 14:33:45 +02:00
|
|
|
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
|
|
|
|
|
struct mnt_namespace *ns)
|
|
|
|
|
{
|
|
|
|
|
struct path root __free(path_put) = {};
|
|
|
|
|
struct mount *m;
|
2023-10-25 16:02:02 +02:00
|
|
|
int err;
|
|
|
|
|
|
2024-06-25 14:33:45 +02:00
|
|
|
/* Has the namespace already been emptied? */
|
2025-02-21 14:13:01 +01:00
|
|
|
if (mnt_ns_id && mnt_ns_empty(ns))
|
2024-06-25 14:33:45 +02:00
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
|
|
s->mnt = lookup_mnt_in_ns(mnt_id, ns);
|
|
|
|
|
if (!s->mnt)
|
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
|
|
err = grab_requested_root(ns, &root);
|
|
|
|
|
if (err)
|
|
|
|
|
return err;
|
|
|
|
|
|
2023-10-25 16:02:02 +02:00
|
|
|
/*
|
|
|
|
|
* Don't trigger audit denials. We just want to determine what
|
|
|
|
|
* mounts to show users.
|
|
|
|
|
*/
|
2024-06-25 14:33:45 +02:00
|
|
|
m = real_mount(s->mnt);
|
|
|
|
|
if (!is_path_reachable(m, m->mnt.mnt_root, &root) &&
|
2024-06-24 11:49:45 -04:00
|
|
|
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
|
2023-10-25 16:02:02 +02:00
|
|
|
return -EPERM;
|
|
|
|
|
|
|
|
|
|
err = security_sb_statfs(s->mnt->mnt_root);
|
|
|
|
|
if (err)
|
|
|
|
|
return err;
|
|
|
|
|
|
2024-06-25 14:33:45 +02:00
|
|
|
s->root = root;
|
2023-11-19 19:03:33 +01:00
|
|
|
|
2025-04-16 10:19:05 +02:00
|
|
|
/*
|
|
|
|
|
* Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
|
|
|
|
|
* can change concurrently as we only hold the read-side of the
|
|
|
|
|
* namespace semaphore and mount properties may change with only
|
|
|
|
|
* the mount lock held.
|
|
|
|
|
*
|
|
|
|
|
* We could sample the mount lock sequence counter to detect
|
|
|
|
|
* those changes and retry. But it's not worth it. Worst that
|
|
|
|
|
* happens is that the mnt->mnt_idmap pointer is already changed
|
|
|
|
|
* while mnt->mnt_flags isn't or vica versa. So what.
|
|
|
|
|
*
|
|
|
|
|
* Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved
|
|
|
|
|
* via READ_ONCE()/WRITE_ONCE() and guard against theoretical
|
|
|
|
|
* torn read/write. That's all we care about right now.
|
|
|
|
|
*/
|
|
|
|
|
s->idmap = mnt_idmap(s->mnt);
|
2023-11-19 19:03:33 +01:00
|
|
|
if (s->mask & STATMOUNT_MNT_BASIC)
|
|
|
|
|
statmount_mnt_basic(s);
|
|
|
|
|
|
2025-04-16 10:19:05 +02:00
|
|
|
if (s->mask & STATMOUNT_SB_BASIC)
|
|
|
|
|
statmount_sb_basic(s);
|
|
|
|
|
|
2023-11-19 19:03:33 +01:00
|
|
|
if (s->mask & STATMOUNT_PROPAGATE_FROM)
|
|
|
|
|
statmount_propagate_from(s);
|
|
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
if (s->mask & STATMOUNT_FS_TYPE)
|
|
|
|
|
err = statmount_string(s, STATMOUNT_FS_TYPE);
|
2023-10-25 16:02:02 +02:00
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
if (!err && s->mask & STATMOUNT_MNT_ROOT)
|
|
|
|
|
err = statmount_string(s, STATMOUNT_MNT_ROOT);
|
2023-10-25 16:02:02 +02:00
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
if (!err && s->mask & STATMOUNT_MNT_POINT)
|
|
|
|
|
err = statmount_string(s, STATMOUNT_MNT_POINT);
|
2023-10-25 16:02:02 +02:00
|
|
|
|
2024-06-24 15:40:52 -04:00
|
|
|
if (!err && s->mask & STATMOUNT_MNT_OPTS)
|
|
|
|
|
err = statmount_string(s, STATMOUNT_MNT_OPTS);
|
|
|
|
|
|
statmount: add flag to retrieve unescaped options
Filesystem options can be retrieved with STATMOUNT_MNT_OPTS, which
returns a string of comma separated options, where some characters are
escaped using the \OOO notation.
Add a new flag, STATMOUNT_OPT_ARRAY, which instead returns the raw
option values separated with '\0' characters.
Since escaped characters are rare, this interface is preferable for
non-libmount users which likely don't want to deal with option
de-escaping.
Example code:
if (st->mask & STATMOUNT_OPT_ARRAY) {
const char *opt = st->str + st->opt_array;
for (unsigned int i = 0; i < st->opt_num; i++) {
printf("opt_array[%i]: <%s>\n", i, opt);
opt += strlen(opt) + 1;
}
}
Example output:
(1) mnt_opts: <lowerdir+=/l\054w\054r,lowerdir+=/l\054w\054r1,upperdir=/upp\054r,workdir=/w\054rk,redirect_dir=nofollow,uuid=null>
(2) opt_array[0]: <lowerdir+=/l,w,r>
opt_array[1]: <lowerdir+=/l,w,r1>
opt_array[2]: <upperdir=/upp,r>
opt_array[3]: <workdir=/w,rk>
opt_array[4]: <redirect_dir=nofollow>
opt_array[5]: <uuid=null>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20241112101006.30715-1-mszeredi@redhat.com
Acked-by: Jeff Layton <jlayton@kernel.org>
[brauner: tweak variable naming and parsing add example output]
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-11-12 11:10:04 +01:00
|
|
|
if (!err && s->mask & STATMOUNT_OPT_ARRAY)
|
|
|
|
|
err = statmount_string(s, STATMOUNT_OPT_ARRAY);
|
|
|
|
|
|
2024-11-14 16:31:27 +01:00
|
|
|
if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY)
|
|
|
|
|
err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY);
|
|
|
|
|
|
2024-11-11 10:09:56 -05:00
|
|
|
if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
|
|
|
|
|
err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
|
|
|
|
|
|
2024-11-11 10:09:57 -05:00
|
|
|
if (!err && s->mask & STATMOUNT_SB_SOURCE)
|
|
|
|
|
err = statmount_string(s, STATMOUNT_SB_SOURCE);
|
|
|
|
|
|
statmount: allow to retrieve idmappings
This adds the STATMOUNT_MNT_UIDMAP and STATMOUNT_MNT_GIDMAP options.
It allows the retrieval of idmappings via statmount().
Currently it isn't possible to figure out what idmappings are applied to
an idmapped mount. This information is often crucial. Before statmount()
the only realistic options for an interface like this would have been to
add it to /proc/<pid>/fdinfo/<nr> or to expose it in
/proc/<pid>/mountinfo. Both solutions would have been pretty ugly and
would've shown information that is of strong interest to some
application but not all. statmount() is perfect for this.
The idmappings applied to an idmapped mount are shown relative to the
caller's user namespace. This is the most useful solution that doesn't
risk leaking information or confuse the caller.
For example, an idmapped mount might have been created with the
following idmappings:
mount --bind -o X-mount.idmap="0:10000:1000 2000:2000:1 3000:3000:1" /srv /opt
Listing the idmappings through statmount() in the same context shows:
mnt_id: 2147485088
mnt_parent_id: 2147484816
fs_type: btrfs
mnt_root: /srv
mnt_point: /opt
mnt_opts: ssd,discard=async,space_cache=v2,subvolid=5,subvol=/
mnt_uidmap[0]: 0 10000 1000
mnt_uidmap[1]: 2000 2000 1
mnt_uidmap[2]: 3000 3000 1
mnt_gidmap[0]: 0 10000 1000
mnt_gidmap[1]: 2000 2000 1
mnt_gidmap[2]: 3000 3000 1
But the idmappings might not always be resolvable in the caller's user
namespace. For example:
unshare --user --map-root
In this case statmount() will skip any mappings that fail to resolve in
the caller's idmapping:
mnt_id: 2147485087
mnt_parent_id: 2147484016
fs_type: btrfs
mnt_root: /srv
mnt_point: /opt
mnt_opts: ssd,discard=async,space_cache=v2,subvolid=5,subvol=/
The caller can differentiate between a mount not being idmapped and a
mount that is idmapped but where all mappings fail to resolve in the
caller's idmapping by checking for the STATMOUNT_MNT_{G,U}IDMAP flag being
raised but the number of mappings in ->mnt_{g,u}idmap_num being zero.
Note that statmount() requires that the whole range must be resolvable
in the caller's user namespace. If a subrange fails to map it will still
list the map as not resolvable. This is a practical compromise to avoid
having to find which subranges are resolvable and which aren't.
Idmappings are listed as a string array with each mapping separated by
zero bytes. This allows to retrieve the idmappings and immediately use
them for writing to e.g., /proc/<pid>/{g,u}id_map and it also allows for
simple iteration like:
if (stmnt->mask & STATMOUNT_MNT_UIDMAP) {
const char *idmap = stmnt->str + stmnt->mnt_uidmap;
for (size_t idx = 0; idx < stmnt->mnt_uidmap_nr; idx++) {
printf("mnt_uidmap[%lu]: %s\n", idx, idmap);
idmap += strlen(idmap) + 1;
}
}
Link: https://lore.kernel.org/r/20250204-work-mnt_idmap-statmount-v2-2-007720f39f2e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-02-04 12:27:47 +01:00
|
|
|
if (!err && s->mask & STATMOUNT_MNT_UIDMAP)
|
|
|
|
|
err = statmount_string(s, STATMOUNT_MNT_UIDMAP);
|
|
|
|
|
|
|
|
|
|
if (!err && s->mask & STATMOUNT_MNT_GIDMAP)
|
|
|
|
|
err = statmount_string(s, STATMOUNT_MNT_GIDMAP);
|
|
|
|
|
|
2024-06-24 11:49:47 -04:00
|
|
|
if (!err && s->mask & STATMOUNT_MNT_NS_ID)
|
2024-06-24 11:49:49 -04:00
|
|
|
statmount_mnt_ns_id(s, ns);
|
2024-06-24 11:49:47 -04:00
|
|
|
|
2025-02-06 07:51:52 -05:00
|
|
|
if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) {
|
|
|
|
|
s->sm.mask |= STATMOUNT_SUPPORTED_MASK;
|
|
|
|
|
s->sm.supported_mask = STATMOUNT_SUPPORTED;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
if (err)
|
|
|
|
|
return err;
|
|
|
|
|
|
2025-02-06 07:51:52 -05:00
|
|
|
/* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */
|
|
|
|
|
WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);
|
|
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Decide whether a statmount attempt should be repeated with a larger
 * seq buffer.
 *
 * @ret:      result of the previous attempt
 * @seq_size: in/out; current seq buffer size, doubled in place
 *
 * Returns true only when @ret is -EAGAIN, doubling *@seq_size did not
 * overflow, and the doubled size does not exceed MAX_RW_COUNT. Note that
 * *@seq_size is written by check_mul_overflow() even when false is returned
 * from the overflow or limit checks.
 */
static inline bool retry_statmount(const long ret, size_t *seq_size)
{
	/* Any result other than -EAGAIN is final. */
	if (likely(ret != -EAGAIN))
		return false;

	/* Grow the buffer geometrically; give up if the size wraps. */
	if (unlikely(check_mul_overflow(*seq_size, 2, seq_size)))
		return false;

	/* Retry only while the new size stays within MAX_RW_COUNT. */
	return likely(*seq_size <= MAX_RW_COUNT);
}
|
|
|
|
|
|
2024-06-25 14:33:45 +02:00
|
|
|
/*
 * Request-mask bits for which prepare_kstatmount() allocates the seq
 * buffer: if any of these is set in the request, a seq buffer is needed.
 */
#define STATMOUNT_STRING_REQ		\
	(STATMOUNT_FS_TYPE	|	\
	 STATMOUNT_FS_SUBTYPE	|	\
	 STATMOUNT_MNT_ROOT	|	\
	 STATMOUNT_MNT_POINT	|	\
	 STATMOUNT_MNT_OPTS	|	\
	 STATMOUNT_OPT_ARRAY	|	\
	 STATMOUNT_OPT_SEC_ARRAY |	\
	 STATMOUNT_SB_SOURCE	|	\
	 STATMOUNT_MNT_UIDMAP	|	\
	 STATMOUNT_MNT_GIDMAP)
|
2024-06-25 14:33:45 +02:00
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
|
|
|
|
|
struct statmount __user *buf, size_t bufsize,
|
|
|
|
|
size_t seq_size)
|
|
|
|
|
{
|
|
|
|
|
if (!access_ok(buf, bufsize))
|
2023-10-25 16:02:02 +02:00
|
|
|
return -EFAULT;
|
|
|
|
|
|
2023-11-19 20:58:49 +01:00
|
|
|
memset(ks, 0, sizeof(*ks));
|
2023-10-25 16:02:03 +02:00
|
|
|
ks->mask = kreq->param;
|
2023-11-19 20:58:49 +01:00
|
|
|
ks->buf = buf;
|
|
|
|
|
ks->bufsize = bufsize;
|
2024-06-25 14:33:45 +02:00
|
|
|
|
|
|
|
|
if (ks->mask & STATMOUNT_STRING_REQ) {
|
|
|
|
|
if (bufsize == sizeof(ks->sm))
|
|
|
|
|
return -EOVERFLOW;
|
|
|
|
|
|
|
|
|
|
ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
|
|
|
|
|
if (!ks->seq.buf)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
|
|
ks->seq.size = seq_size;
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-25 16:02:02 +02:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-29 12:27:15 +01:00
|
|
|
/*
 * Copy a struct mnt_id_req in from userspace and validate it.
 *
 * @req:  userspace request; its ->size field gives the size userspace used
 * @kreq: kernel copy, zeroed before the copy
 *
 * Returns 0 on success, -EFAULT if ->size cannot be read, -E2BIG if the
 * user size exceeds PAGE_SIZE, -EINVAL if it is smaller than
 * MNT_ID_REQ_SIZE_VER0, if ->spare is non-zero, or if ->mnt_id is not above
 * MNT_UNIQUE_ID_OFFSET; otherwise whatever copy_struct_from_user() reports.
 */
static int copy_mnt_id_req(const struct mnt_id_req __user *req,
			   struct mnt_id_req *kreq)
{
	size_t usize;
	int err;

	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);

	if (get_user(usize, &req->size))
		return -EFAULT;

	if (unlikely(usize > PAGE_SIZE))
		return -E2BIG;
	if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
		return -EINVAL;

	memset(kreq, 0, sizeof(*kreq));
	err = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
	if (err)
		return err;

	/*
	 * ->spare must be zero, and the first valid unique mount id is
	 * MNT_UNIQUE_ID_OFFSET + 1.
	 */
	if (kreq->spare != 0 || kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
		return -EINVAL;

	return 0;
}
|
|
|
|
|
|
2024-06-24 11:49:48 -04:00
|
|
|
/*
|
|
|
|
|
* If the user requested a specific mount namespace id, look that up and return
|
|
|
|
|
* that, or if not simply grab a passive reference on our mount namespace and
|
|
|
|
|
* return that.
|
|
|
|
|
*/
|
2024-07-19 13:41:49 +02:00
|
|
|
static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq)
|
2024-06-24 11:49:48 -04:00
|
|
|
{
|
2024-07-19 13:41:49 +02:00
|
|
|
struct mnt_namespace *mnt_ns;
|
|
|
|
|
|
|
|
|
|
if (kreq->mnt_ns_id && kreq->spare)
|
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
|
|
|
|
if (kreq->mnt_ns_id)
|
|
|
|
|
return lookup_mnt_ns(kreq->mnt_ns_id);
|
|
|
|
|
|
|
|
|
|
if (kreq->spare) {
|
|
|
|
|
struct ns_common *ns;
|
|
|
|
|
|
|
|
|
|
CLASS(fd, f)(kreq->spare);
|
2024-09-23 09:35:36 -07:00
|
|
|
if (fd_empty(f))
|
2024-07-19 13:41:49 +02:00
|
|
|
return ERR_PTR(-EBADF);
|
|
|
|
|
|
2024-09-23 09:35:36 -07:00
|
|
|
if (!proc_ns_file(fd_file(f)))
|
2024-07-19 13:41:49 +02:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
2024-09-23 09:35:36 -07:00
|
|
|
ns = get_proc_ns(file_inode(fd_file(f)));
|
2024-07-19 13:41:49 +02:00
|
|
|
if (ns->ops->type != CLONE_NEWNS)
|
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
|
|
|
|
mnt_ns = to_mnt_ns(ns);
|
|
|
|
|
} else {
|
|
|
|
|
mnt_ns = current->nsproxy->mnt_ns;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
refcount_inc(&mnt_ns->passive);
|
|
|
|
|
return mnt_ns;
|
2024-06-24 11:49:48 -04:00
|
|
|
}
|
|
|
|
|
|
2023-10-25 16:02:02 +02:00
|
|
|
/*
 * statmount() - report information about one mount to userspace.
 *
 * @req:     userspace struct mnt_id_req; ->mnt_id names the mount, ->param
 *           carries the STATMOUNT_* request mask and ->mnt_ns_id optionally
 *           selects a mount namespace
 * @buf:     userspace struct statmount to fill in
 * @bufsize: size of @buf in bytes
 * @flags:   must be zero
 *
 * Returns 0 on success or a negative errno: -EINVAL for non-zero @flags,
 * any error from copy_mnt_id_req() or grab_requested_mnt_ns(), -ENOENT if
 * no namespace was found or the caller lacks CAP_SYS_ADMIN over a foreign
 * namespace selected by id, -ENOMEM on allocation failure, or the result of
 * prepare_kstatmount(), do_statmount() or copy_statmount_to_user().
 */
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
		struct statmount __user *, buf, size_t, bufsize,
		unsigned int, flags)
{
	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
	struct kstatmount *ks __free(kfree) = NULL;
	struct mnt_id_req kreq;
	/*
	 * Initial size of the seq buffer. This is only a starting point:
	 * retry_statmount() doubles it whenever an attempt ends in -EAGAIN.
	 */
	size_t seq_size = 3 * PATH_MAX;
	int ret;

	if (flags)
		return -EINVAL;

	ret = copy_mnt_id_req(req, &kreq);
	if (ret)
		return ret;

	ns = grab_requested_mnt_ns(&kreq);
	if (IS_ERR(ns)) {
		/*
		 * grab_requested_mnt_ns() reports some failures as ERR_PTR().
		 * Propagate the error and reset @ns so that neither the code
		 * below nor the __free() cleanup ever sees an error pointer.
		 */
		ret = PTR_ERR(ns);
		ns = NULL;
		return ret;
	}
	if (!ns)
		return -ENOENT;

	/* A foreign namespace selected by id requires CAP_SYS_ADMIN over it. */
	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
		return -ENOENT;

	ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
	if (!ks)
		return -ENOMEM;

retry:
	/* Re-initialises @ks and allocates a seq buffer of seq_size bytes. */
	ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size);
	if (ret)
		return ret;

	scoped_guard(rwsem_read, &namespace_sem)
		ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);

	if (!ret)
		ret = copy_statmount_to_user(ks);
	/* The seq buffer is per-attempt; prepare_kstatmount() allocates anew. */
	kvfree(ks->seq.buf);
	if (retry_statmount(ret, &seq_size))
		goto retry;
	return ret;
}
|
|
|
|
|
|
2024-06-24 11:49:48 -04:00
|
|
|
static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
|
|
|
|
|
u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
|
|
|
|
|
bool reverse)
|
2023-10-25 16:02:03 +02:00
|
|
|
{
|
2024-06-07 16:55:36 +02:00
|
|
|
struct path root __free(path_put) = {};
|
2024-06-10 14:49:54 +02:00
|
|
|
struct path orig;
|
|
|
|
|
struct mount *r, *first;
|
2024-01-12 09:09:14 +01:00
|
|
|
ssize_t ret;
|
2023-10-25 16:02:03 +02:00
|
|
|
|
2024-06-10 14:49:54 +02:00
|
|
|
rwsem_assert_held(&namespace_sem);
|
|
|
|
|
|
2024-06-24 11:49:48 -04:00
|
|
|
ret = grab_requested_root(ns, &root);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
|
2024-06-10 14:49:54 +02:00
|
|
|
if (mnt_parent_id == LSMT_ROOT) {
|
|
|
|
|
orig = root;
|
|
|
|
|
} else {
|
|
|
|
|
orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
|
2024-06-07 16:55:36 +02:00
|
|
|
if (!orig.mnt)
|
|
|
|
|
return -ENOENT;
|
2024-06-10 14:49:54 +02:00
|
|
|
orig.dentry = orig.mnt->mnt_root;
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-25 16:02:03 +02:00
|
|
|
/*
|
|
|
|
|
* Don't trigger audit denials. We just want to determine what
|
|
|
|
|
* mounts to show users.
|
|
|
|
|
*/
|
2024-06-10 14:49:54 +02:00
|
|
|
if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
|
2024-06-24 11:49:44 -04:00
|
|
|
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
|
2023-10-25 16:02:03 +02:00
|
|
|
return -EPERM;
|
|
|
|
|
|
2024-06-10 14:49:54 +02:00
|
|
|
ret = security_sb_statfs(orig.dentry);
|
2024-01-12 09:09:14 +01:00
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
2023-10-25 16:02:03 +02:00
|
|
|
|
2024-06-07 16:55:37 +02:00
|
|
|
if (!last_mnt_id) {
|
|
|
|
|
if (reverse)
|
2024-12-15 21:17:06 +01:00
|
|
|
first = node_to_mount(ns->mnt_last_node);
|
2024-06-07 16:55:37 +02:00
|
|
|
else
|
2024-12-15 21:17:06 +01:00
|
|
|
first = node_to_mount(ns->mnt_first_node);
|
2024-06-07 16:55:37 +02:00
|
|
|
} else {
|
|
|
|
|
if (reverse)
|
|
|
|
|
first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);
|
|
|
|
|
else
|
|
|
|
|
first = mnt_find_id_at(ns, last_mnt_id + 1);
|
|
|
|
|
}
|
2023-10-25 16:02:03 +02:00
|
|
|
|
2024-06-07 16:55:37 +02:00
|
|
|
for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) {
|
2024-01-12 09:09:14 +01:00
|
|
|
if (r->mnt_id_unique == mnt_parent_id)
|
2023-10-25 16:02:03 +02:00
|
|
|
continue;
|
2024-06-10 14:49:54 +02:00
|
|
|
if (!is_path_reachable(r, r->mnt.mnt_root, &orig))
|
2023-10-25 16:02:03 +02:00
|
|
|
continue;
|
2024-06-10 14:49:54 +02:00
|
|
|
*mnt_ids = r->mnt_id_unique;
|
2024-01-12 09:09:14 +01:00
|
|
|
mnt_ids++;
|
|
|
|
|
nr_mnt_ids--;
|
|
|
|
|
ret++;
|
2023-10-25 16:02:03 +02:00
|
|
|
}
|
2024-01-12 09:09:14 +01:00
|
|
|
return ret;
|
2023-10-25 16:02:03 +02:00
|
|
|
}
|
|
|
|
|
|
2024-06-10 14:49:54 +02:00
|
|
|
/*
 * listmount() - list the unique ids of mounts below a mount.
 *
 * @req:        userspace struct mnt_id_req; ->mnt_id is the mount to list
 *              below, ->param is the last mount id already seen (0 to start
 *              from the beginning) and ->mnt_ns_id optionally selects a
 *              mount namespace
 * @mnt_ids:    userspace array receiving the ids
 * @nr_mnt_ids: capacity of @mnt_ids, at most 1000000 entries
 * @flags:      0 or LISTMOUNT_REVERSE
 *
 * Returns the number of ids copied to @mnt_ids or a negative errno:
 * -EINVAL for unknown @flags or a non-zero ->param that is not above
 * MNT_UNIQUE_ID_OFFSET, -EOVERFLOW if @nr_mnt_ids is too large, -EFAULT if
 * @mnt_ids is not accessible, -ENOMEM on allocation failure, -ENOENT if no
 * namespace was found or the caller lacks CAP_SYS_ADMIN over a foreign
 * namespace selected by id, or any error from copy_mnt_id_req(),
 * grab_requested_mnt_ns() or do_listmount().
 */
SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
		u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
	u64 *kmnt_ids __free(kvfree) = NULL;
	const size_t maxcount = 1000000;
	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
	struct mnt_id_req kreq;
	u64 last_mnt_id;
	ssize_t ret;

	if (flags & ~LISTMOUNT_REVERSE)
		return -EINVAL;

	/*
	 * If the mount namespace really has more than 1 million mounts the
	 * caller must iterate over the mount namespace (and reconsider their
	 * system design...).
	 */
	if (unlikely(nr_mnt_ids > maxcount))
		return -EOVERFLOW;

	/* Cannot overflow: nr_mnt_ids is bounded by maxcount above. */
	if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
		return -EFAULT;

	ret = copy_mnt_id_req(req, &kreq);
	if (ret)
		return ret;

	last_mnt_id = kreq.param;
	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
	if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
		return -EINVAL;

	kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
				  GFP_KERNEL_ACCOUNT);
	if (!kmnt_ids)
		return -ENOMEM;

	ns = grab_requested_mnt_ns(&kreq);
	if (IS_ERR(ns)) {
		/*
		 * grab_requested_mnt_ns() reports some failures as ERR_PTR().
		 * Propagate the error and reset @ns so that neither the code
		 * below nor the __free() cleanup ever sees an error pointer.
		 */
		ret = PTR_ERR(ns);
		ns = NULL;
		return ret;
	}
	if (!ns)
		return -ENOENT;

	/* A foreign namespace selected by id requires CAP_SYS_ADMIN over it. */
	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
		return -ENOENT;

	/*
	 * We only need to guard against mount topology changes as
	 * listmount() doesn't care about any mount properties.
	 */
	scoped_guard(rwsem_read, &namespace_sem)
		ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
				   nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
	if (ret <= 0)
		return ret;

	/* Copy out only the ids actually collected. */
	if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
		return -EFAULT;

	return ret;
}
|
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * Build the initial mount tree at boot: mount rootfs, allocate the initial
 * mount namespace with that mount as its root, install the namespace in
 * init_task's nsproxy, point the current task's pwd and root at the rootfs
 * root, and finally add the namespace via mnt_ns_tree_add().
 *
 * Panics if rootfs cannot be mounted or the namespace cannot be allocated.
 */
static void __init init_mount_tree(void)
{
	struct mnt_namespace *ns;
	struct vfsmount *mnt;
	struct mount *m;
	struct path root;

	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
	if (IS_ERR(mnt))
		panic("Can't create rootfs");

	ns = alloc_mnt_ns(&init_user_ns, false);
	if (IS_ERR(ns))
		panic("Can't allocate initial namespace");

	/* The rootfs mount is the sole mount and the root of the namespace. */
	m = real_mount(mnt);
	ns->root = m;
	ns->nr_mounts = 1;
	mnt_add_to_ns(ns, m);

	init_task.nsproxy->mnt_ns = ns;
	get_mnt_ns(ns);

	root = (struct path){
		.mnt	= mnt,
		.dentry	= mnt->mnt_root,
	};
	set_fs_pwd(current->fs, &root);
	set_fs_root(current->fs, &root);

	mnt_ns_tree_add(ns);
}
|
|
|
|
|
|
2007-10-16 23:26:30 -07:00
|
|
|
/*
 * Boot-time initialisation of the mount layer: create the slab cache for
 * struct mount, allocate the mount and mountpoint hash tables, initialise
 * kernfs, sysfs and the "fs" kobject, then shmem and rootfs, and finally
 * build the initial mount tree.
 *
 * Panics if either hash table cannot be allocated. Failures of
 * sysfs_init() and of creating the "fs" kobject are only logged.
 */
void __init mnt_init(void)
{
	int err;

	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0,
				      SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				      SLAB_ACCOUNT,
				      NULL);

	mount_hashtable =
		alloc_large_system_hash("Mount-cache",
					sizeof(struct hlist_head),
					mhash_entries, 19, HASH_ZERO,
					&m_hash_shift, &m_hash_mask, 0, 0);
	mountpoint_hashtable =
		alloc_large_system_hash("Mountpoint-cache",
					sizeof(struct hlist_head),
					mphash_entries, 19, HASH_ZERO,
					&mp_hash_shift, &mp_hash_mask, 0, 0);

	if (!mount_hashtable || !mountpoint_hashtable)
		panic("Failed to allocate mount hash table\n");

	kernfs_init();

	/* A sysfs failure is logged but does not stop initialisation. */
	err = sysfs_init();
	if (err) {
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
		       __func__, err);
	}

	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj) {
		printk(KERN_WARNING "%s: kobj create error\n", __func__);
	}

	shmem_init();
	init_rootfs();
	init_mount_tree();
}
|
|
|
|
|
|
2009-06-22 15:09:13 -04:00
|
|
|
void put_mnt_ns(struct mnt_namespace *ns)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2020-08-03 13:16:42 +03:00
|
|
|
if (!refcount_dec_and_test(&ns->ns.count))
|
2009-06-22 15:09:13 -04:00
|
|
|
return;
|
replace collect_mounts()/drop_collected_mounts() with a safer variant
collect_mounts() has several problems - one can't iterate over the results
directly, so it has to be done with callback passed to iterate_mounts();
it has an oopsable race with d_invalidate(); it creates temporary clones
of mounts invisibly for sync umount (IOW, you can have non-lazy umount
succeed leaving filesystem not mounted anywhere and yet still busy).
A saner approach is to give caller an array of struct path that would pin
every mount in a subtree, without cloning any mounts.
* collect_mounts()/drop_collected_mounts()/iterate_mounts() is gone
* collect_paths(where, preallocated, size) gives either ERR_PTR(-E...) or
a pointer to array of struct path, one for each chunk of tree visible under
'where' (i.e. the first element is a copy of where, followed by (mount,root)
for everything mounted under it - the same set collect_mounts() would give).
Unlike collect_mounts(), the mounts are *not* cloned - we just get pinning
references to the roots of subtrees in the caller's namespace.
Array is terminated by {NULL, NULL} struct path. If it fits into
preallocated array (on-stack, normally), that's where it goes; otherwise
it's allocated by kmalloc_array(). Passing 0 as size means that 'preallocated'
is ignored (and expected to be NULL).
* drop_collected_paths(paths, preallocated) is given the array returned
by an earlier call of collect_paths() and the preallocated array passed to that
call. All mount/dentry references are dropped and array is kfree'd if it's not
equal to 'preallocated'.
* instead of iterate_mounts(), users should just iterate over array
of struct path - nothing exotic is needed for that. Existing users (all in
audit_tree.c) are converted.
[folded a fix for braino reported by Venkat Rao Bagalkote <venkat88@linux.ibm.com>]
Fixes: 80b5dce8c59b0 ("vfs: Add a function to lazily unmount all mounts from any dentry")
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-06-17 00:09:51 -04:00
|
|
|
namespace_lock();
|
|
|
|
|
lock_mount_hash();
|
|
|
|
|
umount_tree(ns->root, 0);
|
|
|
|
|
unlock_mount_hash();
|
|
|
|
|
namespace_unlock();
|
2012-07-26 21:08:32 -07:00
|
|
|
free_mnt_ns(ns);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2011-03-17 22:08:28 -04:00
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
struct vfsmount *kern_mount(struct file_system_type *type)
|
2011-03-17 22:08:28 -04:00
|
|
|
{
|
2011-07-19 09:32:38 -07:00
|
|
|
struct vfsmount *mnt;
|
2018-11-01 23:07:26 +00:00
|
|
|
mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
|
2011-07-19 09:32:38 -07:00
|
|
|
if (!IS_ERR(mnt)) {
|
|
|
|
|
/*
|
|
|
|
|
* it is a longterm mount, don't release mnt until
|
|
|
|
|
* we unmount before file sys is unregistered
|
|
|
|
|
*/
|
2012-06-09 00:59:08 -04:00
|
|
|
real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
|
2011-07-19 09:32:38 -07:00
|
|
|
}
|
|
|
|
|
return mnt;
|
2011-03-17 22:08:28 -04:00
|
|
|
}
|
2018-11-01 23:07:26 +00:00
|
|
|
EXPORT_SYMBOL_GPL(kern_mount);
|
2011-07-19 09:32:38 -07:00
|
|
|
|
|
|
|
|
/*
 * kern_unmount - release a long-term kernel mount
 * @mnt: mount to release; an ERR_PTR() value is silently ignored
 *
 * The mount is first turned back into a short-term one so that it can
 * actually go away, then one RCU grace period is waited out before the
 * reference is dropped with mntput().
 */
void kern_unmount(struct vfsmount *mnt)
{
	if (IS_ERR(mnt))
		return;

	mnt_make_shortterm(mnt);
	synchronize_rcu();	/* unpleasant, but required before the put */
	mntput(mnt);
}
EXPORT_SYMBOL(kern_unmount);
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_path().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it that it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 08:43:34 -05:00
|
|
|
|
2020-06-04 10:48:19 +02:00
|
|
|
/*
 * kern_unmount_array - release a batch of long-term kernel mounts
 * @mnt: array of mounts to release
 * @num: number of entries in @mnt
 *
 * Batched form of the single-mount release: every entry is first made
 * short-term, then a single expedited RCU grace period covers the whole
 * array, and only after that is each reference dropped.  The grace
 * period is waited for even when @num is 0.
 */
void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
	unsigned int idx;

	for (idx = 0; idx != num; ++idx)
		mnt_make_shortterm(mnt[idx]);

	synchronize_rcu_expedited();

	for (idx = 0; idx != num; ++idx)
		mntput(mnt[idx]);
}
EXPORT_SYMBOL(kern_unmount_array);
|
|
|
|
|
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_path().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it that it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 08:43:34 -05:00
|
|
|
/*
 * our_mnt - apply check_mnt() to a vfsmount
 * @mnt: mount to test
 *
 * Thin wrapper for callers that only hold a struct vfsmount: converts it
 * to the containing struct mount and returns whatever check_mnt() says.
 * NOTE(review): check_mnt() is defined elsewhere in this file; presumably
 * it tests membership in the caller's mount namespace - confirm there.
 */
bool our_mnt(struct vfsmount *mnt)
{
	struct mount *m = real_mount(mnt);

	return check_mnt(m);
}
|
2010-03-07 18:49:36 -08:00
|
|
|
|
2013-03-15 01:45:51 -07:00
|
|
|
bool current_chrooted(void)
|
|
|
|
|
{
|
|
|
|
|
/* Does the current process have a non-standard root */
|
|
|
|
|
struct path ns_root;
|
|
|
|
|
struct path fs_root;
|
|
|
|
|
bool chrooted;
|
|
|
|
|
|
|
|
|
|
/* Find the namespace root */
|
|
|
|
|
ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt;
|
|
|
|
|
ns_root.dentry = ns_root.mnt->mnt_root;
|
|
|
|
|
path_get(&ns_root);
|
|
|
|
|
while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
|
|
|
|
|
;
|
|
|
|
|
|
|
|
|
|
get_fs_root(current->fs, &fs_root);
|
|
|
|
|
|
|
|
|
|
chrooted = !path_equal(&fs_root, &ns_root);
|
|
|
|
|
|
|
|
|
|
path_put(&fs_root);
|
|
|
|
|
path_put(&ns_root);
|
|
|
|
|
|
|
|
|
|
return chrooted;
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-04 07:43:08 -05:00
|
|
|
static bool mnt_already_visible(struct mnt_namespace *ns,
|
|
|
|
|
const struct super_block *sb,
|
2016-06-09 16:06:06 -05:00
|
|
|
int *new_mnt_flags)
|
2013-03-24 14:28:27 -07:00
|
|
|
{
|
2015-05-08 23:49:47 -05:00
|
|
|
int new_flags = *new_mnt_flags;
|
2023-10-25 16:02:00 +02:00
|
|
|
struct mount *mnt, *n;
|
2013-03-30 19:57:41 -07:00
|
|
|
bool visible = false;
|
2013-03-24 14:28:27 -07:00
|
|
|
|
2013-09-16 21:37:36 -04:00
|
|
|
down_read(&namespace_sem);
|
2023-10-25 16:02:00 +02:00
|
|
|
rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
|
2013-03-30 19:57:41 -07:00
|
|
|
struct mount *child;
|
2015-06-04 09:43:11 -05:00
|
|
|
int mnt_flags;
|
|
|
|
|
|
2018-11-04 07:43:08 -05:00
|
|
|
if (mnt->mnt.mnt_sb->s_type != sb->s_type)
|
2013-03-30 19:57:41 -07:00
|
|
|
continue;
|
|
|
|
|
|
2015-05-08 16:36:50 -05:00
|
|
|
/* This mount is not fully visible if it's root directory
|
|
|
|
|
* is not the root directory of the filesystem.
|
|
|
|
|
*/
|
|
|
|
|
if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
|
|
|
|
|
continue;
|
|
|
|
|
|
2016-06-15 06:59:49 -05:00
|
|
|
/* A local view of the mount flags */
|
2015-06-04 09:43:11 -05:00
|
|
|
mnt_flags = mnt->mnt.mnt_flags;
|
|
|
|
|
|
2016-06-10 12:21:40 -05:00
|
|
|
/* Don't miss readonly hidden in the superblock flags */
|
2017-07-17 08:45:34 +01:00
|
|
|
if (sb_rdonly(mnt->mnt.mnt_sb))
|
2016-06-10 12:21:40 -05:00
|
|
|
mnt_flags |= MNT_LOCK_READONLY;
|
|
|
|
|
|
2015-05-08 23:49:47 -05:00
|
|
|
/* Verify the mount flags are equal to or more permissive
|
|
|
|
|
* than the proposed new mount.
|
|
|
|
|
*/
|
2015-06-04 09:43:11 -05:00
|
|
|
if ((mnt_flags & MNT_LOCK_READONLY) &&
|
2015-05-08 23:49:47 -05:00
|
|
|
!(new_flags & MNT_READONLY))
|
|
|
|
|
continue;
|
2015-06-04 09:43:11 -05:00
|
|
|
if ((mnt_flags & MNT_LOCK_ATIME) &&
|
|
|
|
|
((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
|
2015-05-08 23:49:47 -05:00
|
|
|
continue;
|
|
|
|
|
|
2015-01-07 08:10:09 -06:00
|
|
|
/* This mount is not fully visible if there are any
|
|
|
|
|
* locked child mounts that cover anything except for
|
|
|
|
|
* empty directories.
|
2013-03-30 19:57:41 -07:00
|
|
|
*/
|
|
|
|
|
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
|
|
|
|
|
struct inode *inode = child->mnt_mountpoint->d_inode;
|
2015-01-07 08:10:09 -06:00
|
|
|
/* Only worry about locked mounts */
|
2016-05-27 14:50:05 -05:00
|
|
|
if (!(child->mnt.mnt_flags & MNT_LOCKED))
|
2015-01-07 08:10:09 -06:00
|
|
|
continue;
|
2024-08-06 11:47:10 +08:00
|
|
|
/* Is the directory permanently empty? */
|
2015-05-13 20:51:09 -05:00
|
|
|
if (!is_empty_dir_inode(inode))
|
2013-03-30 19:57:41 -07:00
|
|
|
goto next;
|
2013-03-24 14:28:27 -07:00
|
|
|
}
|
2015-05-08 23:49:47 -05:00
|
|
|
/* Preserve the locked attributes */
|
2015-06-04 09:43:11 -05:00
|
|
|
*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
|
|
|
|
|
MNT_LOCK_ATIME);
|
2013-03-30 19:57:41 -07:00
|
|
|
visible = true;
|
|
|
|
|
goto found;
|
|
|
|
|
next: ;
|
2013-03-24 14:28:27 -07:00
|
|
|
}
|
2013-03-30 19:57:41 -07:00
|
|
|
found:
|
2013-09-16 21:37:36 -04:00
|
|
|
up_read(&namespace_sem);
|
2013-03-30 19:57:41 -07:00
|
|
|
return visible;
|
2013-03-24 14:28:27 -07:00
|
|
|
}
|
|
|
|
|
|
2018-11-04 07:43:08 -05:00
|
|
|
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
|
2016-06-09 16:06:06 -05:00
|
|
|
{
|
2016-06-15 06:59:49 -05:00
|
|
|
const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
|
2016-06-09 16:06:06 -05:00
|
|
|
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
|
|
|
|
|
unsigned long s_iflags;
|
|
|
|
|
|
|
|
|
|
if (ns->user_ns == &init_user_ns)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* Can this filesystem be too revealing? */
|
2018-11-04 07:43:08 -05:00
|
|
|
s_iflags = sb->s_iflags;
|
2016-06-09 16:06:06 -05:00
|
|
|
if (!(s_iflags & SB_I_USERNS_VISIBLE))
|
|
|
|
|
return false;
|
|
|
|
|
|
2016-06-15 06:59:49 -05:00
|
|
|
if ((s_iflags & required_iflags) != required_iflags) {
|
|
|
|
|
WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
|
|
|
|
|
required_iflags);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-04 07:43:08 -05:00
|
|
|
return !mnt_already_visible(ns, sb, new_mnt_flags);
|
2016-06-09 16:06:06 -05:00
|
|
|
}
|
|
|
|
|
|
fs: Treat foreign mounts as nosuid
If a process gets access to a mount from a different user
namespace, that process should not be able to take advantage of
setuid files or selinux entrypoints from that filesystem. Prevent
this by treating mounts from other mount namespaces and those not
owned by current_user_ns() or an ancestor as nosuid.
This will make it safer to allow more complex filesystems to be
mounted in non-root user namespaces.
This does not remove the need for MNT_LOCK_NOSUID. The setuid,
setgid, and file capability bits can no longer be abused if code in
a user namespace were to clear nosuid on an untrusted filesystem,
but this patch, by itself, is insufficient to protect the system
from abuse of files that, when execed, would increase MAC privilege.
As a more concrete explanation, any task that can manipulate a
vfsmount associated with a given user namespace already has
capabilities in that namespace and all of its descendants. If they
can cause a malicious setuid, setgid, or file-caps executable to
appear in that mount, then that executable will only allow them to
elevate privileges in exactly the set of namespaces in which they
are already privileged.
On the other hand, if they can cause a malicious executable to
appear with a dangerous MAC label, running it could change the
caller's security context in a way that should not have been
possible, even inside the namespace in which the task is confined.
As a hardening measure, this would have made CVE-2014-5207 much
more difficult to exploit.
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: James Morris <james.l.morris@oracle.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
2016-06-23 16:41:05 -05:00
|
|
|
bool mnt_may_suid(struct vfsmount *mnt)
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
* Foreign mounts (accessed via fchdir or through /proc
|
|
|
|
|
* symlinks) are always treated as if they are nosuid. This
|
|
|
|
|
* prevents namespaces from trusting potentially unsafe
|
|
|
|
|
* suid/sgid bits, file caps, or security labels that originate
|
|
|
|
|
* in other namespaces.
|
|
|
|
|
*/
|
|
|
|
|
return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
|
|
|
|
|
current_in_userns(mnt->mnt_sb->s_user_ns);
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-01 00:37:32 -04:00
|
|
|
static struct ns_common *mntns_get(struct task_struct *task)
|
2010-03-07 18:49:36 -08:00
|
|
|
{
|
2014-11-01 00:00:23 -04:00
|
|
|
struct ns_common *ns = NULL;
|
2010-03-07 18:49:36 -08:00
|
|
|
struct nsproxy *nsproxy;
|
|
|
|
|
|
2014-02-03 19:13:49 -08:00
|
|
|
task_lock(task);
|
|
|
|
|
nsproxy = task->nsproxy;
|
2010-03-07 18:49:36 -08:00
|
|
|
if (nsproxy) {
|
2014-11-01 00:00:23 -04:00
|
|
|
ns = &nsproxy->mnt_ns->ns;
|
|
|
|
|
get_mnt_ns(to_mnt_ns(ns));
|
2010-03-07 18:49:36 -08:00
|
|
|
}
|
2014-02-03 19:13:49 -08:00
|
|
|
task_unlock(task);
|
2010-03-07 18:49:36 -08:00
|
|
|
|
|
|
|
|
return ns;
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-01 00:37:32 -04:00
|
|
|
/*
 * mntns_put - ->put() for the "mnt" proc_ns_operations
 * @ns: ns_common embedded in a mount namespace
 *
 * Converts @ns back to its mount namespace and drops one reference to it.
 */
static void mntns_put(struct ns_common *ns)
{
	struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

	put_mnt_ns(mnt_ns);
}
|
|
|
|
|
|
2020-05-05 16:04:30 +02:00
|
|
|
static int mntns_install(struct nsset *nsset, struct ns_common *ns)
|
2010-03-07 18:49:36 -08:00
|
|
|
{
|
2020-05-05 16:04:30 +02:00
|
|
|
struct nsproxy *nsproxy = nsset->nsproxy;
|
|
|
|
|
struct fs_struct *fs = nsset->fs;
|
2017-04-15 17:31:22 -04:00
|
|
|
struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
|
2020-05-05 16:04:30 +02:00
|
|
|
struct user_namespace *user_ns = nsset->cred->user_ns;
|
2010-03-07 18:49:36 -08:00
|
|
|
struct path root;
|
2017-04-15 17:31:22 -04:00
|
|
|
int err;
|
2010-03-07 18:49:36 -08:00
|
|
|
|
2012-07-26 21:42:03 -07:00
|
|
|
if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
|
2020-05-05 16:04:30 +02:00
|
|
|
!ns_capable(user_ns, CAP_SYS_CHROOT) ||
|
|
|
|
|
!ns_capable(user_ns, CAP_SYS_ADMIN))
|
2012-09-13 16:38:03 +08:00
|
|
|
return -EPERM;
|
2010-03-07 18:49:36 -08:00
|
|
|
|
2019-01-30 13:30:21 -05:00
|
|
|
if (is_anon_ns(mnt_ns))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
2010-03-07 18:49:36 -08:00
|
|
|
if (fs->users != 1)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
get_mnt_ns(mnt_ns);
|
2017-04-15 17:31:22 -04:00
|
|
|
old_mnt_ns = nsproxy->mnt_ns;
|
2010-03-07 18:49:36 -08:00
|
|
|
nsproxy->mnt_ns = mnt_ns;
|
|
|
|
|
|
|
|
|
|
/* Find the root */
|
2017-04-15 17:31:22 -04:00
|
|
|
err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
|
|
|
|
|
"/", LOOKUP_DOWN, &root);
|
|
|
|
|
if (err) {
|
|
|
|
|
/* revert to old namespace */
|
|
|
|
|
nsproxy->mnt_ns = old_mnt_ns;
|
|
|
|
|
put_mnt_ns(mnt_ns);
|
|
|
|
|
return err;
|
|
|
|
|
}
|
2010-03-07 18:49:36 -08:00
|
|
|
|
2017-06-08 17:32:29 -07:00
|
|
|
put_mnt_ns(old_mnt_ns);
|
|
|
|
|
|
2010-03-07 18:49:36 -08:00
|
|
|
/* Update the pwd and root */
|
|
|
|
|
set_fs_pwd(fs, &root);
|
|
|
|
|
set_fs_root(fs, &root);
|
|
|
|
|
|
|
|
|
|
path_put(&root);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-06 00:47:13 -07:00
|
|
|
static struct user_namespace *mntns_owner(struct ns_common *ns)
|
|
|
|
|
{
|
|
|
|
|
return to_mnt_ns(ns)->user_ns;
|
|
|
|
|
}
|
|
|
|
|
|
2010-03-07 18:49:36 -08:00
|
|
|
const struct proc_ns_operations mntns_operations = {
|
|
|
|
|
.name = "mnt",
|
|
|
|
|
.type = CLONE_NEWNS,
|
|
|
|
|
.get = mntns_get,
|
|
|
|
|
.put = mntns_put,
|
|
|
|
|
.install = mntns_install,
|
2016-09-06 00:47:13 -07:00
|
|
|
.owner = mntns_owner,
|
2010-03-07 18:49:36 -08:00
|
|
|
};
|
2022-01-21 22:13:27 -08:00
|
|
|
|
|
|
|
|
#ifdef CONFIG_SYSCTL
|
2025-01-28 13:48:37 +01:00
|
|
|
static const struct ctl_table fs_namespace_sysctls[] = {
|
2022-01-21 22:13:27 -08:00
|
|
|
{
|
|
|
|
|
.procname = "mount-max",
|
|
|
|
|
.data = &sysctl_mount_max,
|
|
|
|
|
.maxlen = sizeof(unsigned int),
|
|
|
|
|
.mode = 0644,
|
|
|
|
|
.proc_handler = proc_dointvec_minmax,
|
|
|
|
|
.extra1 = SYSCTL_ONE,
|
|
|
|
|
},
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/*
 * Register fs_namespace_sysctls under the "fs" sysctl directory.
 * Runs as an fs_initcall and always returns 0.
 */
static int __init init_fs_namespace_sysctls(void)
{
	register_sysctl_init("fs", fs_namespace_sysctls);

	return 0;
}
fs_initcall(init_fs_namespace_sysctls);
|
|
|
|
|
|
|
|
|
|
#endif /* CONFIG_SYSCTL */
|