// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/pnode.c
 *
 * (C) Copyright IBM Corporation 2005.
 * Author : Ram Pai (linuxram@us.ibm.com)
 */
#include <linux/mnt_namespace.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/nsproxy.h>
#include <uapi/linux/mount.h>
#include "internal.h"
#include "pnode.h"

/* return the next shared peer mount of @p */
static inline struct mount *next_peer(struct mount *p)
{
        return list_entry(p->mnt_share.next, struct mount, mnt_share);
}
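
/*
 * first_slave(p) is the first mount slaved to @p; next_slave(p) is
 * the next entry on whichever ->mnt_slave_list @p itself is on.
 */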
static inline struct mount *first_slave(struct mount *p)
{
        return hlist_entry(p->mnt_slave_list.first, struct mount, mnt_slave);
}

static inline struct mount *next_slave(struct mount *p)
{
        return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave);
}

/* locks: namespace_shared && is_mounted(mnt) */
static struct mount *get_peer_under_root(struct mount *mnt,
                                         struct mnt_namespace *ns,
                                         const struct path *root)
{
        struct mount *m = mnt;

        do {
                /* Check the namespace first for optimization */
                if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root))
                        return m;

                m = next_peer(m);
        } while (m != mnt);

        return NULL;
}

/*
 * Get ID of closest dominating peer group having a representative
 * under the given root.
 *
 * locks: namespace_shared
 */
int get_dominating_id(struct mount *mnt, const struct path *root)
{
        struct mount *m;

        for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) {
                struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root);

                if (d)
                        return d->mnt_group_id;
        }

        return 0;
}

static inline bool will_be_unmounted(struct mount *m)
{
        return m->mnt.mnt_flags & MNT_UMOUNT;
}
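
/*
 * Hand all slaves of @mnt over to @to, so they keep receiving events
 * once @mnt stops propagating; with a NULL @to they simply lose their
 * master.
 */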
static void transfer_propagation(struct mount *mnt, struct mount *to)
{
        struct hlist_node *p = NULL, *n;
        struct mount *m;

        hlist_for_each_entry_safe(m, n, &mnt->mnt_slave_list, mnt_slave) {
                m->mnt_master = to;
                if (!to)
                        hlist_del_init(&m->mnt_slave);
                else
                        p = &m->mnt_slave;
        }
        if (p)
                hlist_splice_init(&mnt->mnt_slave_list, p, &to->mnt_slave_list);
}

/*
 * change the propagation type of @mnt
 *
 * EXCL[namespace_sem]
 */
void change_mnt_propagation(struct mount *mnt, int type)
{
        struct mount *m = mnt->mnt_master;

        if (type == MS_SHARED) {
                set_mnt_shared(mnt);
                return;
        }
        if (IS_MNT_SHARED(mnt)) {
                if (list_empty(&mnt->mnt_share)) {
                        mnt_release_group_id(mnt);
                } else {
                        m = next_peer(mnt);
                        list_del_init(&mnt->mnt_share);
                        mnt->mnt_group_id = 0;
                }
                CLEAR_MNT_SHARED(mnt);
                transfer_propagation(mnt, m);
        }
        hlist_del_init(&mnt->mnt_slave);
        if (type == MS_SLAVE) {
                mnt->mnt_master = m;
                if (m)
                        hlist_add_head(&mnt->mnt_slave, &m->mnt_slave_list);
        } else {
                mnt->mnt_master = NULL;
                if (type == MS_UNBINDABLE)
                        mnt->mnt_t_flags |= T_UNBINDABLE;
                else
                        mnt->mnt_t_flags &= ~T_UNBINDABLE;
        }
}
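
/*
 * Find a surviving mount to take over propagation to the slaves of @m
 * once @m goes away: walk towards the root of the propagation graph,
 * skipping candidates that will be unmounted themselves.  Every mount
 * visited is detached from its peer group and marked, so that
 * set_destinations() can later redirect the whole chain to the
 * survivor in one pass.
 */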
static struct mount *trace_transfers(struct mount *m)
{
        while (1) {
                struct mount *next = next_peer(m);

                if (next != m) {
                        list_del_init(&m->mnt_share);
                        m->mnt_group_id = 0;
                        m->mnt_master = next;
                } else {
                        if (IS_MNT_SHARED(m))
                                mnt_release_group_id(m);
                        next = m->mnt_master;
                }
                hlist_del_init(&m->mnt_slave);
                CLEAR_MNT_SHARED(m);
                SET_MNT_MARK(m);

                if (!next || !will_be_unmounted(next))
                        return next;
                if (IS_MNT_MARKED(next))
                        return next->mnt_master;
                m = next;
        }
}
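
/* point every mount on the ->mnt_master chain from @m at @master directly */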
static void set_destinations(struct mount *m, struct mount *master)
{
        struct mount *next;

        while ((next = m->mnt_master) != master) {
                m->mnt_master = master;
                m = next;
        }
}
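
/*
 * Make all mounts in @set private in one go.  Compared with calling
 * change_mnt_propagation() on each of them separately, this avoids
 * shuffling ->mnt_slave_list contents over and over when a replacement
 * master is itself part of the set being taken out.
 */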
void bulk_make_private(struct list_head *set)
{
        struct mount *m;

        list_for_each_entry(m, set, mnt_list)
                if (!IS_MNT_MARKED(m))
                        set_destinations(m, trace_transfers(m));

        list_for_each_entry(m, set, mnt_list) {
                transfer_propagation(m, m->mnt_master);
                m->mnt_master = NULL;
                CLEAR_MNT_MARK(m);
        }
}
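
/*
 * One step of a propagation walk rooted at @origin, not descending
 * into the slaves of @m: next peer once we are back at @origin's level
 * (same ->mnt_master), next slave otherwise, climbing back up to the
 * master whenever a slave list is exhausted.
 */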
static struct mount *__propagation_next(struct mount *m,
                                        struct mount *origin)
{
        while (1) {
                struct mount *master = m->mnt_master;

                if (master == origin->mnt_master) {
                        struct mount *next = next_peer(m);
                        return (next == origin) ? NULL : next;
                } else if (m->mnt_slave.next)
                        return next_slave(m);

                /* back at master */
                m = master;
        }
}

/*
 * get the next mount in the propagation tree.
 * @m: the mount seen last
 * @origin: the original mount from where the tree walk initiated
 *
 * Note that peer groups form contiguous segments of slave lists.
 * We rely on that in get_source() to be able to find out if
 * vfsmount found while iterating with propagation_next() is
 * a peer of one we'd found earlier.
 */
static struct mount *propagation_next(struct mount *m,
                                      struct mount *origin)
{
        /* are there any slaves of this mount? */
        if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list))
                return first_slave(m);

        return __propagation_next(m, origin);
}

static struct mount *skip_propagation_subtree(struct mount *m,
                                              struct mount *origin)
{
        /*
         * Advance m past everything that gets propagation from it.
         */
        struct mount *p = __propagation_next(m, origin);

        while (p && peers(m, p))
                p = __propagation_next(p, origin);

        return p;
}
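
/*
 * Peer-group iterator for propagate_mnt(): return a mount in the group
 * to visit after @m's, in an order that guarantees the eventual master
 * of every copy is encountered before its slaves.  Peers within the
 * returned group are then walked by the caller via next_peer().
 */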
static struct mount *next_group(struct mount *m, struct mount *origin)
{
        while (1) {
                while (1) {
                        struct mount *next;
                        if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list))
                                return first_slave(m);
                        next = next_peer(m);
                        if (m->mnt_group_id == origin->mnt_group_id) {
                                if (next == origin)
                                        return NULL;
                        } else if (m->mnt_slave.next != &next->mnt_slave)
                                break;
                        m = next;
                }
                /* m is the last peer */
                while (1) {
                        struct mount *master = m->mnt_master;
                        if (m->mnt_slave.next)
                                return next_slave(m);
                        m = next_peer(master);
                        if (master->mnt_group_id == origin->mnt_group_id)
                                break;
                        if (master->mnt_slave.next == &m->mnt_slave)
                                break;
                        m = master;
                }
                if (m == origin)
                        return NULL;
        }
}
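
/* should propagation to @m result in a copy being mounted at @dest_mp? */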
static bool need_secondary(struct mount *m, struct mountpoint *dest_mp)
{
        /* skip ones added by this propagate_mnt() */
        if (IS_MNT_NEW(m))
                return false;
        /* skip if mountpoint isn't visible in m */
        if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
                return false;
        /* skip if m is in the anon_ns */
        if (is_anon_ns(m->mnt_ns))
                return false;
        return true;
}
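
/*
 * Find the earlier copy that the one about to be created for
 * mountpoint @m should be slaved to: climb the chain of masters of @m
 * until the next one is marked (masters of already-handled mountpoints
 * are marked by propagate_mnt()), then climb the chain of masters of
 * @last_copy until we reach a copy attached in the matching peer group.
 */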
static struct mount *find_master(struct mount *m,
                                 struct mount *last_copy,
                                 struct mount *original)
{
        struct mount *p;

        // ascend until there's a copy for something with the same master
        for (;;) {
                p = m->mnt_master;
                if (!p || IS_MNT_MARKED(p))
                        break;
                m = p;
        }
        while (!peers(last_copy, original)) {
                struct mount *parent = last_copy->mnt_parent;
                if (parent->mnt_master == p) {
                        if (!peers(parent, m))
                                last_copy = last_copy->mnt_master;
                        break;
                }
                last_copy = last_copy->mnt_master;
        }
        return last_copy;
}

/**
 * propagate_mnt() - create secondary copies for tree attachment
 * @dest_mnt: destination mount.
 * @dest_mp: destination mountpoint.
 * @source_mnt: source mount.
 * @tree_list: list of secondaries to be attached.
 *
 * Create secondary copies for attaching a tree with root @source_mnt
 * at mount @dest_mnt with mountpoint @dest_mp. Link all new mounts
 * into a propagation graph. Set mountpoints for all secondaries,
 * link their roots into @tree_list via ->mnt_hash.
 */
int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
                  struct mount *source_mnt, struct hlist_head *tree_list)
{
        struct mount *m, *n, *copy, *this;
        int err = 0, type;

        if (dest_mnt->mnt_master)
                SET_MNT_MARK(dest_mnt->mnt_master);

        /* iterate over peer groups, depth first */
        for (m = dest_mnt; m && !err; m = next_group(m, dest_mnt)) {
                if (m == dest_mnt) { // have one for dest_mnt itself
                        copy = source_mnt;
                        type = CL_MAKE_SHARED;
                        n = next_peer(m);
                        if (n == m)
                                continue;
                } else {
                        type = CL_SLAVE;
                        /* beginning of peer group among the slaves? */
                        if (IS_MNT_SHARED(m))
                                type |= CL_MAKE_SHARED;
                        n = m;
                }
                do {
                        if (!need_secondary(n, dest_mp))
                                continue;
                        if (type & CL_SLAVE) // first in this peer group
                                copy = find_master(n, copy, source_mnt);
                        this = copy_tree(copy, copy->mnt.mnt_root, type);
                        if (IS_ERR(this)) {
                                err = PTR_ERR(this);
                                break;
                        }
                        scoped_guard(mount_locked_reader)
                                mnt_set_mountpoint(n, dest_mp, this);
                        if (n->mnt_master)
                                SET_MNT_MARK(n->mnt_master);
                        copy = this;
                        hlist_add_head(&this->mnt_hash, tree_list);
                        err = count_mounts(n->mnt_ns, this);
                        if (err)
                                break;
                        type = CL_MAKE_SHARED;
                } while ((n = next_peer(n)) != m);
        }

        hlist_for_each_entry(n, tree_list, mnt_hash) {
                m = n->mnt_parent;
                if (m->mnt_master)
                        CLEAR_MNT_MARK(m->mnt_master);
        }
        if (dest_mnt->mnt_master)
                CLEAR_MNT_MARK(dest_mnt->mnt_master);
        return err;
}

/*
 * return true if the refcount is greater than count
 */
static inline int do_refcount_check(struct mount *mnt, int count)
{
        return mnt_get_count(mnt) > count;
}
|
|
|
|
|
|
fs: allow to mount beneath top mount
Various distributions are adding or are in the process of adding support
for system extensions and in the future configuration extensions through
various tools. A more detailed explanation on system and configuration
extensions can be found on the manpage which is listed below at [1].
System extension images may – dynamically at runtime — extend the /usr/
and /opt/ directory hierarchies with additional files. This is
particularly useful on immutable system images where a /usr/ and/or
/opt/ hierarchy residing on a read-only file system shall be extended
temporarily at runtime without making any persistent modifications.
When one or more system extension images are activated, their /usr/ and
/opt/ hierarchies are combined via overlayfs with the same hierarchies
of the host OS, and the host /usr/ and /opt/ overmounted with it
("merging"). When they are deactivated, the mount point is disassembled
— again revealing the unmodified original host version of the hierarchy
("unmerging"). Merging thus makes the extension's resources suddenly
appear below the /usr/ and /opt/ hierarchies as if they were included in
the base OS image itself. Unmerging makes them disappear again, leaving
in place only the files that were shipped with the base OS image itself.
System configuration images are similar but operate on directories
containing system or service configuration.
On nearly all modern distributions mount propagation plays a crucial
role and the rootfs of the OS is a shared mount in a peer group (usually
with peer group id 1):
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:1 29 1
On such systems all services and containers run in a separate mount
namespace and are pivot_root()ed into their rootfs. A separate mount
namespace is almost always used as it is the minimal isolation mechanism
services have. But usually they are even much more isolated up to the
point where they almost become indistinguishable from containers.
Mount propagation again plays a crucial role here. The rootfs of all
these services is a slave mount to the peer group of the host rootfs.
This is done so the service will receive mount propagation events from
the host when certain files or directories are updated.
In addition, the rootfs of each service, container, and sandbox is also
a shared mount in its separate peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/ / ext4 shared:24 master:1 71 47
For people not too familiar with mount propagation, the master:1 means
that this is a slave mount to peer group 1. Which as one can see is the
host rootfs as indicated by shared:1 above. The shared:24 indicates that
the service rootfs is a shared mount in a separate peer group with peer
group id 24.
A service may run other services. Such nested services will also have a
rootfs mount that is a slave to the peer group of the outer service
rootfs mount.
For containers things are just slighly different. A container's rootfs
isn't a slave to the service's or host rootfs' peer group. The rootfs
mount of a container is simply a shared mount in its own peer group:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/home/ubuntu/debian-tree / ext4 shared:99 61 60
So whereas services are isolated OS components a container is treated
like a separate world and mount propagation into it is restricted to a
single well known mount that is a slave to the peer group of the shared
mount /run on the host:
TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID
/propagate/debian-tree /run/host/incoming tmpfs master:5 71 68
Here, the master:5 indicates that this mount is a slave to the peer
group with peer group id 5. This allows to propagate mounts into the
container and served as a workaround for not being able to insert mounts
into mount namespaces directly. But the new mount api does support
inserting mounts directly. For the interested reader the blogpost in [2]
might be worth reading where I explain the old and the new approach to
inserting mounts into mount namespaces.
Containers of course, can themselves be run as services. They often run
full systems themselves which means they again run services and
containers with the exact same propagation settings explained above.
The whole system is designed so that it can be easily updated, including
all services in various fine-grained ways without having to enter every
single service's mount namespace which would be prohibitively expensive.
The mount propagation layout has been carefully chosen so it is possible
to propagate updates for system extensions and configurations from the
host into all services.
The simplest model to update the whole system is to mount on top of
/usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc
will then propagate into every service. This works cleanly the first
time. However, when the system is updated multiple times it becomes
necessary to unmount the first update on /opt, /usr, /etc and then
propagate the new update. But this means, there's an interval where the
old base system is accessible. This has to be avoided to protect against
downgrade attacks.
The vfs already exposes a mechanism to userspace whereby mounts can be
mounted beneath an existing mount. Such mounts are internally referred
to as "tucked". The patch series exposes the ability to mount beneath a
top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount()
system call. This allows userspace to seamlessly upgrade mounts. After
this series the only thing that will have changed is that mounting
beneath an existing mount can be done explicitly instead of just
implicitly.
Today, there are two scenarios where a mount can be mounted beneath an
existing mount instead of on top of it:
(1) When a service or container is started in a new mount namespace and
pivot_root()s into its new rootfs. The way this is done is by
mounting the new rootfs beneath the old rootfs:
fd_newroot = open("/var/lib/machines/fedora", ...);
fd_oldroot = open("/", ...);
fchdir(fd_newroot);
pivot_root(".", ".");
After the pivot_root(".", ".") call the new rootfs is mounted
beneath the old rootfs which can then be unmounted to reveal the
underlying mount:
fchdir(fd_oldroot);
umount2(".", MNT_DETACH);
Since pivot_root() moves the caller into a new rootfs, no mounts may
be propagated out of the new rootfs as a consequence of the
pivot_root() call. Thus, the mounts cannot be shared. (A complete C
sketch of this sequence is shown right after this list.)
(2) When a mount is propagated to a mount that already has another mount
mounted on the same dentry.
The easiest example for this is to create a new mount namespace. The
following commands will create a mount namespace where the rootfs
mount / is a slave to the host rootfs mount's peer group. IOW, it will
receive propagation from the host:
mount --make-shared /
unshare --mount --propagation=slave
Now a new mount on the /mnt dentry in that mount namespace is
created. (Since this can be confusing it should be spelled out that
the tmpfs mount just created on the /mnt dentry doesn't propagate
back to the host, because the rootfs mount / of the mount namespace
isn't a peer of the host rootfs.):
mount -t tmpfs tmpfs /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt tmpfs tmpfs
Now another terminal in the host mount namespace can observe that
the mount indeed hasn't propagated back into the host mount
namespace. A new mount can now be created on top of the /mnt dentry
with the rootfs mount / as its parent:
mount --bind /opt /mnt
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 shared:1
The mount namespace that was created earlier can now observe that
the bind mount created on the host has propagated into it:
TARGET SOURCE FSTYPE PROPAGATION
└─/mnt /dev/sda2[/opt] ext4 master:1
└─/mnt tmpfs tmpfs
But instead of having been mounted on top of the tmpfs mount at the
/mnt dentry, the /opt mount has been mounted on top of the rootfs
mount at the /mnt dentry, and the tmpfs mount has been remounted on
top of the propagated /opt mount at the /opt dentry. In other words,
the propagated mount has been mounted beneath the preexisting mount
in that mount namespace.
Mount namespaces make this easy to illustrate but it's also easy to
mount beneath an existing mount in the same mount namespace
(The following example assumes a shared rootfs mount / with peer
group id 1):
mount --bind /opt /opt
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/opt] ext4 188 29 shared:1
If another mount is mounted on top of the /opt mount at the /opt
dentry:
mount --bind /tmp /opt
The following clunky mount tree will result:
TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION
└─/opt /dev/sda2[/tmp] ext4 405 29 shared:1
└─/opt /dev/sda2[/opt] ext4 188 405 shared:1
└─/opt /dev/sda2[/tmp] ext4 404 188 shared:1
The /tmp mount is mounted beneath the /opt mount and another copy is
mounted on top of the /opt mount. This happens because the rootfs /
and the /opt mount are shared mounts in the same peer group.
When the new /tmp mount is to be mounted at the /opt dentry, the
/tmp mount first propagates to the root mount at the /opt dentry.
But there already is the /opt mount mounted at the /opt dentry. So
the old /opt mount at the /opt dentry is mounted on top of the new
/tmp mount at the /tmp dentry, i.e. @opt->mnt_parent is @tmp and
@opt->mnt_mountpoint is /tmp (note that @opt->mnt_root is /opt,
which is what shows up as /opt under SOURCE). So again, a mount is
mounted beneath a preexisting mount.
(Fwiw, a few iterations of mount --bind /opt /opt in a loop on a
shared rootfs is a good example of what could be referred to as
mount explosion.)
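As promised under (1), here is a minimal C sketch of the pivot_root()-based
rootfs swap. This is an illustration under stated assumptions rather than
actual service-manager code: the /var/lib/machines/fedora path is
hypothetical, the new root is assumed to already be a mount point, and the
process is assumed to have CAP_SYS_ADMIN in its (user) namespace, e.g. when
run under unshare -Urm:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sched.h>
    #include <sys/mount.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            int fd_newroot, fd_oldroot;

            /* Work in a private mount namespace so nothing propagates back out. */
            if (unshare(CLONE_NEWNS) < 0)
                    return 1;
            if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0)
                    return 1;

            /* Hypothetical new rootfs; it must itself be a mount point. */
            fd_newroot = open("/var/lib/machines/fedora", O_PATH | O_DIRECTORY);
            fd_oldroot = open("/", O_PATH | O_DIRECTORY);
            if (fd_newroot < 0 || fd_oldroot < 0)
                    return 1;

            /* pivot_root(".", ".") mounts the new rootfs beneath the old one. */
            if (fchdir(fd_newroot) < 0)
                    return 1;
            if (syscall(SYS_pivot_root, ".", ".") < 0)
                    return 1;

            /* Detach the old rootfs stacked on top, revealing the new one. */
            if (fchdir(fd_oldroot) < 0)
                    return 1;
            if (umount2(".", MNT_DETACH) < 0)
                    return 1;
            return chdir("/") ? 1 : 0;
    }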
The main point is that such mounts allow userspace to umount a top
mount and reveal an underlying mount. So, for example, umounting the
tmpfs mount on /mnt that was created in the mount namespace example
under (2) reveals the /opt mount which was mounted beneath it.
In the second example under (2), where a mount was mounted beneath the
top mount in the same mount namespace, unmounting the top mount would
unmount both the top mount and the mount beneath. In the process the
original mount would be remounted on top of the rootfs mount / at the
/opt dentry again.
This again is a result of mount propagation, only this time it's umount
propagation. However, this can be avoided by simply making the parent
mount / of the @opt mount a private or slave mount. Then the top mount
and the original mount can be unmounted to reveal the mount beneath.
These two examples are fairly arcane and are merely added to make it
clear how mount propagation affects current and future features.
More common use-cases will just be things like:
mount -t btrfs /dev/sdA /mnt
mount -t xfs /dev/sdB --beneath /mnt
umount /mnt
after which we'll have updated from a btrfs filesystem to an xfs
filesystem without ever revealing the underlying mountpoint.
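Using the new mount API, that btrfs-to-xfs update could look roughly like
the following C sketch. It assumes a glibc (>= 2.36) that exposes the
fsopen()/fsconfig()/fsmount()/move_mount() wrappers; /dev/sdB and /mnt are
taken from the example above, and MOVE_MOUNT_BENEATH is defined locally in
case the system headers predate this series:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sys/mount.h>
    #include <unistd.h>

    #ifndef MOVE_MOUNT_BENEATH
    #define MOVE_MOUNT_BENEATH 0x00000200   /* added by this series */
    #endif

    int main(void)
    {
            int fs_fd, mnt_fd;

            /* Create and configure an xfs superblock for the new filesystem. */
            fs_fd = fsopen("xfs", FSOPEN_CLOEXEC);
            if (fs_fd < 0)
                    return 1;
            if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "source", "/dev/sdB", 0) < 0)
                    return 1;
            if (fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
                    return 1;

            /* Turn the configured superblock into a detached mount. */
            mnt_fd = fsmount(fs_fd, FSMOUNT_CLOEXEC, 0);
            if (mnt_fd < 0)
                    return 1;

            /* Mount it beneath the current top mount on /mnt ... */
            if (move_mount(mnt_fd, "", AT_FDCWD, "/mnt",
                           MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH) < 0)
                    return 1;

            /* ... and reveal it by detaching the old btrfs mount on top. */
            return umount2("/mnt", MNT_DETACH) ? 1 : 0;
    }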
The crux is that the proposed mechanism already exists and that it is
powerful enough to cover cases where mounts are supposed to be updated
with new versions. Crucially, it offers an important flexibility:
updates to a system may either be forced, or they can be delayed and
the umount of the top mount left to a cooperative service.
This adds a new flag to move_mount() that allows userspace to explicitly
move a mount beneath the top mount, adhering to the following semantics:
* Mounts cannot be mounted beneath the rootfs. This restriction
encompasses the rootfs but also chroots via chroot() and pivot_root().
To mount a mount beneath the rootfs or a chroot, pivot_root() can be
used as illustrated above.
* The source mount must be a private mount to force the kernel to
  allocate a new, unused peer group id. This isn't a required
  restriction but a voluntary one. It avoids repeating a semantic
  quirk that already exists today: if bind mounts which already have a
  peer group id are inserted into mount trees that have the same peer
  group id, a lot of mount propagation events can be generated (for
  example, consider running mount --bind /opt /opt in a loop where the
  parent mount is a shared mount).
* Avoid getting rid of the top mount in the kernel. Cooperative services
  need to be able to unmount the top mount themselves.
  This also avoids a good deal of additional complexity. The umount
  would have to be propagated, which would be another rather expensive
  operation. So namespace_lock() and lock_mount_hash() would potentially
  have to be held for a long time for both a mount and an umount
  propagation. That should be avoided.
* The path to mount beneath must be mounted and attached.
* The top mount and its parent must be in the caller's mount namespace
and the caller must be able to mount in that mount namespace.
* The caller must be able to unmount the top mount to prove that they
could reveal the underlying mount.
* The propagation tree is calculated based on the destination mount's
  parent mount and the destination mount's mountpoint on the parent
  mount. Of course, if the parent of the destination mount and the
  destination mount are shared mounts in the same peer group and the
  mountpoint of the new mount to be mounted is a subdir of their
  ->mnt_root then both will receive a propagated copy of the new
  mount. That's probably easier to understand with an example.
  Assuming a standard shared rootfs /:
mount --bind /opt /opt
mount --bind /tmp /opt
will cause the same mount tree as:
mount --bind /opt /opt
mount --beneath /tmp /opt
because both / and /opt are shared mounts/peers in the same peer
group and the /opt dentry is a subdirectory of both the parent's and
the child's ->mnt_root. If a mount tree like that is created it almost
always is an accident or abuse of mount propagation. Realistically,
what most people probably mean in these scenarios is:
mount --bind /opt /opt
mount --make-private /opt
mount --make-shared /opt
This forces the allocation of a new separate peer group for the /opt
mount. Afterwards a mount --bind or mount --beneath actually makes
sense as the / and /opt mounts belong to different peer groups. Before
that it's likely just confusion about what the user wanted to achieve.
* Refuse MOVE_MOUNT_BENEATH if:
(1) @mnt_from has been overmounted in between path resolution and
    acquiring @namespace_sem when locking @mnt_to. This avoids the
    proliferation of shadow mounts.
(2) @mnt_to is moved to a different mountpoint while acquiring
    @namespace_sem to lock @mnt_to.
(3) @mnt_to is unmounted while acquiring @namespace_sem to lock
    @mnt_to.
(4) if the parent of the target mount propagates to the target mount
at the same mountpoint.
This would mean mounting @mnt_from on @mnt_to->mnt_parent and then
propagating a copy @c of @mnt_from onto @mnt_to. This defeats the
whole purpose of mounting @mnt_from beneath @mnt_to.
(5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at
the same mountpoint.
If @mnt_to->mnt_parent propagates to @mnt_from this would mean
propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards
@mnt_from would be mounted on top of @mnt_to->mnt_parent and
@mnt_to would be unmounted from @mnt_to->mnt_parent and remounted
on @mnt_from. But since @c is already mounted on @mnt_from, @mnt_to
would ultimately be remounted on top of @c. Afterwards, @mnt_from
would be covered by a copy @c of @mnt_from and @c would be covered
by @mnt_from itself. This defeats the whole purpose of mounting
@mnt_from beneath @mnt_to.
Cases (1) to (3) are required as they deal with races that would cause
bugs or unexpected behavior for users. Cases (4) and (5) refuse
semantic quirks that would not be a bug but would cause weird mount
trees to be created. While such trees can already be created via other
means (mount --bind /opt /opt x n) there's no reason to repeat past
mistakes in new features.
Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1]
Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2]
Link: https://github.com/flatcar/sysext-bakery
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1
Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2
Link: https://github.com/systemd/systemd/pull/26013
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-05-03 13:18:42 +02:00
/**
 * propagation_would_overmount - check whether propagation from @from
 *                               would overmount @to
 * @from: shared mount
 * @to: mount to check
 * @mp: future mountpoint of @to on @from
 *
 * If @from propagates mounts to @to, @from and @to must either be peers
 * or one of the masters in the hierarchy of masters of @to must be a
 * peer of @from.
 *
 * If the root of the @to mount is equal to the future mountpoint @mp of
 * the @to mount on @from then @to will be overmounted by whatever is
 * propagated to it.
 *
 * Context: This function expects namespace_lock() to be held and that
 *          @mp is stable.
 * Return: If @from overmounts @to, true is returned, false if not.
 */
bool propagation_would_overmount(const struct mount *from,
				 const struct mount *to,
				 const struct mountpoint *mp)
{
	if (!IS_MNT_SHARED(from))
		return false;

	if (to->mnt.mnt_root != mp->m_dentry)
		return false;

	for (const struct mount *m = to; m; m = m->mnt_master) {
		if (peers(from, m))
			return true;
	}

	return false;
}
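For context, refusals (4) and (5) from the list above boil down to two calls
of this helper from the move_mount() path. A condensed sketch of that caller
(the real can_move_mount_beneath() in fs/namespace.c performs several
additional sanity checks that are elided here):

static int can_move_mount_beneath(const struct path *from,
				  const struct path *to,
				  const struct mountpoint *mp)
{
	struct mount *mnt_from = real_mount(from->mnt);
	struct mount *mnt_to = real_mount(to->mnt);
	struct mount *parent_mnt_to = mnt_to->mnt_parent;

	/* ... validity and locking checks elided ... */

	/* Case (4): the parent would propagate a copy onto @mnt_to. */
	if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
		return -EINVAL;

	/* Case (5): the parent would propagate a copy onto @mnt_from. */
	if (propagation_would_overmount(parent_mnt_to, mnt_from, mp))
		return -EINVAL;

	return 0;
}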
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability may not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However, path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 04:37:39 +10:00

fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
and these often go to the same mount point.
The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.
We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.
- check the global sum once every interval (this will delay zero detection
for some interval, so it's probably a showstopper for vfsmounts).
- keep a local count and only take the global sum when local reaches 0 (this
is difficult for vfsmounts, because we can't hold preempt off for the life of
a reference, so a counter would need to be per-thread or tied strongly to a
particular CPU which requires more locking).
- keep a local difference of increments and decrements, which allows us to sum
the total difference and hence find the refcount when summing all CPUs. Then,
keep a single integer "long" refcount for slow and long lasting references,
and only take the global sum of local counters when the long refcount is 0.
This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.
This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.
This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However, code is otherwise bigger
and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:50:11 +11:00

don't have mounts pin their parents
Simplify the rules for mount refcounts. Current rules include:
* being a namespace root => +1
* being someone's child => +1
* being someone's child => +1 to parent's refcount, unless you've
already been through umount_tree().
The last part is not needed at all. It makes for more places where we
need to decrement refcounts and it creates an asymmetry between the situations
for something that has never been a part of a namespace and something that
left one, both for no good reason.
If a mount's refcount has additions from its children, we know that
* it's either someone's child itself (and will remain so
until umount_tree(), at which point contributions from children
will disappear), or
* it is the root of a namespace (and will remain such until
it either becomes someone's child in another namespace or goes through
umount_tree()), or
* it is the root of some tree copy, and is currently pinned
by the caller of copy_tree() (and remains such until it either gets
into a namespace, or goes to umount_tree()).
In all cases we already have contribution(s) to the refcount that will last
as long as the contribution from children remains. In other words, the
lifetime is not affected by refcount contributions from children.
It might be useful for "is it busy" checks, but those are actually
no harder to express without it.
NB: the propagate_mount_busy() part is an equivalent transformation, ugly as it
is; the current logic is actually wrong and may give false negatives,
but fixing that is for a separate patch (probably earlier in the queue).
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-04-27 22:53:13 -04:00

/*
 * check if the mount 'mnt' can be unmounted successfully.
 * @mnt: the mount to be checked for unmount
 * NOTE: unmounting 'mnt' would naturally propagate to all
 * other mounts its parent propagates to.
 * Check if any of these mounts that **do not have submounts**
 * have more references than 'refcnt'. If so return busy.
 *
 * vfsmount lock must be held for write
 */
int propagate_mount_busy(struct mount *mnt, int refcnt)
{
	struct mount *parent = mnt->mnt_parent;

	/*
	 * quickly check if the current mount can be unmounted.
	 * If not, we don't have to go checking for all other
	 * mounts
	 */
	if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt))
		return 1;

	if (mnt == parent)
		return 0;

	for (struct mount *m = propagation_next(parent, parent); m;
			m = propagation_next(m, parent)) {
		struct list_head *head;
		struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);

		if (!child)
			continue;

		head = &child->mnt_mounts;
		if (!list_empty(head)) {
			/*
			 * a mount that covers child completely wouldn't prevent
			 * it being pulled out; any other would.
			 */
			if (!list_is_singular(head) || !child->overmount)
				continue;
		}
		if (do_refcount_check(child, 1))
			return 1;
	}
	return 0;
}
/*
 * Clear MNT_LOCKED when it can be shown to be safe.
 *
 * mount_lock lock must be held for write
 */
void propagate_mount_unlock(struct mount *mnt)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m, *child;

	BUG_ON(parent == mnt);

	for (m = propagation_next(parent, parent); m;
			m = propagation_next(m, parent)) {
		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
		if (child)
			child->mnt.mnt_flags &= ~MNT_LOCKED;
	}
}
static inline bool is_candidate(struct mount *m)
{
	return m->mnt_t_flags & T_UMOUNT_CANDIDATE;
}

static void umount_one(struct mount *m, struct list_head *to_umount)
{
	m->mnt.mnt_flags |= MNT_UMOUNT;
	list_del_init(&m->mnt_child);
	move_from_ns(m);
	list_add_tail(&m->mnt_list, to_umount);
}

static void remove_from_candidate_list(struct mount *m)
{
	m->mnt_t_flags &= ~(T_MARKED | T_UMOUNT_CANDIDATE);
	list_del_init(&m->mnt_list);
}
static void gather_candidates(struct list_head *set,
			      struct list_head *candidates)
{
	struct mount *m, *p, *q;

	list_for_each_entry(m, set, mnt_list) {
		if (is_candidate(m))
			continue;
		m->mnt_t_flags |= T_UMOUNT_CANDIDATE;
		p = m->mnt_parent;
		q = propagation_next(p, p);
		while (q) {
			struct mount *child = __lookup_mnt(&q->mnt,
							   m->mnt_mountpoint);
			if (child) {
				/*
				 * We might've already run into this one. That
				 * must've happened on earlier iteration of the
				 * outer loop; in that case we can skip those
				 * parents that get propagation from q - there
				 * will be nothing new on those as well.
				 */
				if (is_candidate(child)) {
					q = skip_propagation_subtree(q, p);
					continue;
				}
				child->mnt_t_flags |= T_UMOUNT_CANDIDATE;
				if (!will_be_unmounted(child))
					list_add(&child->mnt_list, candidates);
			}
			q = propagation_next(q, p);
		}
	}
	list_for_each_entry(m, set, mnt_list)
		m->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
}
/*
 * We know that some child of @m can't be unmounted. In all places where the
 * chain of descent of @m has a child not overmounting the root of its
 * parent, the parent can't be unmounted either.
 */
static void trim_ancestors(struct mount *m)
{
	struct mount *p;

	for (p = m->mnt_parent; is_candidate(p); m = p, p = p->mnt_parent) {
		if (IS_MNT_MARKED(m))	// all candidates beneath are overmounts
			return;
		SET_MNT_MARK(m);
		if (m != p->overmount)
			p->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
	}
}
mnt: In umount propagation reparent in a separate pass
It was observed that in some pathological cases the current code
does not unmount everything it should. After investigation it
was determined that the issue is that mnt_change_mntpoint can
change which mounts are available to be unmounted during mount
propagation, which is wrong.
The trivial reproducer is:
$ cat ./pathological.sh
mount -t tmpfs test-base /mnt
cd /mnt
mkdir 1 2 1/1
mount --bind 1 1
mount --make-shared 1
mount --bind 1 2
mount --bind 1/1 1/1
mount --bind 1/1 1/1
echo
grep test-base /proc/self/mountinfo
umount 1/1
echo
grep test-base /proc/self/mountinfo
$ unshare -Urm ./pathological.sh
The expected output looks like:
46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
The output without the fix looks like:
46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
That last mount in the output was in the propagation tree to be unmounted but
was missed because mnt_change_mountpoint changed its parent before the walk
through the mount propagation tree observed it.
Cc: stable@vger.kernel.org
Fixes: 1064f874abc0 ("mnt: Tuck mounts under others instead of creating shadow/side mounts.")
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
2017-05-15 14:42:07 -05:00

/*
 * Find and exclude all umount candidates forbidden by @m
 * (see Documentation/filesystems/propagate_umount.txt)
 * If we can immediately tell that @m is OK to unmount (unlocked
 * and all children are already committed to unmounting) commit
 * to unmounting it.
 * Only @m itself might be taken from the candidates list;
 * anything found by trim_ancestors() is marked non-candidate
 * and left on the list.
 */
static void trim_one(struct mount *m, struct list_head *to_umount)
{
	bool remove_this = false, found = false, umount_this = false;
	struct mount *n;

	if (!is_candidate(m)) { // trim_ancestors() left it on list
		remove_from_candidate_list(m);
		return;
	}

	list_for_each_entry(n, &m->mnt_mounts, mnt_child) {
		if (!is_candidate(n)) {
			found = true;
			if (n != m->overmount) {
				remove_this = true;
				break;
			}
		}
	}
	if (found) {
		trim_ancestors(m);
	} else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) {
		remove_this = true;
		umount_this = true;
	}
	if (remove_this) {
		remove_from_candidate_list(m);
		if (umount_this)
			umount_one(m, to_umount);
	}
}
static void handle_locked(struct mount *m, struct list_head *to_umount)
{
	struct mount *cutoff = m, *p;

	if (!is_candidate(m)) { // trim_ancestors() left it on list
		remove_from_candidate_list(m);
		return;
	}
	for (p = m; is_candidate(p); p = p->mnt_parent) {
		remove_from_candidate_list(p);
		if (!IS_MNT_LOCKED(p))
			cutoff = p->mnt_parent;
	}
	if (will_be_unmounted(p))
		cutoff = p;
	while (m != cutoff) {
		umount_one(m, to_umount);
		m = m->mnt_parent;
	}
}
/*
 * @m is not going away, and it overmounts the top of a stack of mounts
 * that are going away. We know that all of those are fully overmounted
 * by the one above (@m being the topmost of the chain), so @m can be slid
 * in place where the bottom of the stack is attached.
 *
 * NOTE: here we temporarily violate a constraint - two mounts end up with
 * the same parent and mountpoint; that will be remedied as soon as we
 * return from propagate_umount() - its caller (umount_tree()) will detach
 * the stack from the parent it (and now @m) is attached to. umount_tree()
 * might choose to keep unmounted pieces stuck to each other, but it always
 * detaches them from the mounts that remain in the tree.
 */
static void reparent(struct mount *m)
{
	struct mount *p = m;
	struct mountpoint *mp;

	do {
		mp = p->mnt_mp;
		p = p->mnt_parent;
	} while (will_be_unmounted(p));

	mnt_change_mountpoint(p, mp, m);
	mnt_notify_add(m);
}
/**
 * propagate_umount - apply propagation rules to the set of mounts for umount()
 * @set: the list of mounts to be unmounted.
 *
 * Collect all mounts that receive propagation from the mount in @set and have
 * no obstacles to being unmounted. Add these additional mounts to the set.
 *
 * See Documentation/filesystems/propagate_umount.txt if you do anything in
 * this area.
 *
 * Locks held:
 * mount_lock (write_seqlock), namespace_sem (exclusive).
 */
void propagate_umount(struct list_head *set)
{
	struct mount *m, *p;
	LIST_HEAD(to_umount);	// committed to unmounting
	LIST_HEAD(candidates);	// undecided umount candidates

	// collect all candidates
	gather_candidates(set, &candidates);

	// reduce the set until it's non-shifting
	list_for_each_entry_safe(m, p, &candidates, mnt_list)
		trim_one(m, &to_umount);

	// ... and non-revealing
	while (!list_empty(&candidates)) {
		m = list_first_entry(&candidates, struct mount, mnt_list);
		handle_locked(m, &to_umount);
	}

	// now to_umount consists of all acceptable candidates
	// deal with reparenting of surviving overmounts on those
	list_for_each_entry(m, &to_umount, mnt_list) {
		struct mount *over = m->overmount;

		if (over && !will_be_unmounted(over))
			reparent(over);
	}

	// and fold them into the set
	list_splice_tail_init(&to_umount, set);
}