mirror of
https://github.com/torvalds/linux.git
synced 2025-11-30 23:16:01 +07:00
Pull namespace updates from Christian Brauner: "This contains a larger set of changes around the generic namespace infrastructure of the kernel. Each specific namespace type (net, cgroup, mnt, ...) embeds a struct ns_common which carries the reference count of the namespace and so on. We open-coded and cargo-culted so many quirks for each namespace type that it just wasn't scalable anymore. So given there's a bunch of new changes coming in that area I've started cleaning all of this up. The core change is to make it possible to correctly initialize every namespace uniformly and derive the correct initialization settings from the type of the namespace such as namespace operations, namespace type and so on. This leaves the new ns_common_init() function with a single parameter which is the specific namespace type which derives the correct parameters statically. This also means the compiler will yell as soon as someone does something remotely fishy. The ns_common_init() addition also allows us to remove ns_alloc_inum() and drops any special-casing of the initial network namespace in the network namespace initialization code that Linus complained about. Another part is reworking the reference counting. The reference counting was open-coded and copy-pasted for each namespace type even though they all followed the same rules. This also removes all open accesses to the reference count and makes it private and only uses a very small set of dedicated helpers to manipulate them just like we do for e.g., files. In addition this generalizes the mount namespace iteration infrastructure introduced a few cycles ago. As a reminder, the vfs makes it possible to iterate sequentially and bidirectionally through all mount namespaces on the system or all mount namespaces that the caller holds privilege over. This allows userspace to iterate over all mounts in all mount namespaces using the listmount() and statmount() system calls. 
Each mount namespace has a unique identifier for the lifetime of the system that is exposed to userspace. The network namespace also has a unique identifier working exactly the same way. This extends the concept to all other namespace types. The new nstree type makes it possible to lookup namespaces purely by their identifier and to walk the namespace list sequentially and bidirectionally for all namespace types, allowing userspace to iterate through all namespaces. Looking up namespaces in the namespace tree works completely locklessly. This also means we can move the mount namespace onto the generic infrastructure and remove a bunch of code and members from struct mnt_namespace itself. There's a bunch of stuff coming on top of this in the future but for now this uses the generic namespace tree to extend a concept introduced first for pidfs a few cycles ago. For a while now we have supported pidfs file handles for pidfds. This has proven to be very useful. This extends the concept to cover namespaces as well. It is possible to encode and decode namespace file handles using the common name_to_handle_at() and open_by_handle_at() apis. As with pidfs file handles, namespace file handles are exhaustive, meaning it is not required to actually hold a reference to nsfs in order to decode (aka open_by_handle_at()) a namespace file handle. Instead the FD_NSFS_ROOT constant can be passed which will let the kernel grab a reference to the root of nsfs internally and thus decode the file handle. Namespace file descriptors can already be derived from pidfds which means they aren't subject to overmount protection bugs. IOW, it's irrelevant if the caller would not have access to an appropriate /proc/<pid>/ns/ directory as they could always just derive the namespace based on a pidfd already. It has the same advantage as pidfds. It's possible to reliably and for the lifetime of the system refer to a namespace without pinning any resources and to compare them trivially. 
Permission checking is kept simple. If the caller is located in the namespace the file handle refers to they are able to open it otherwise they must hold privilege over the owning namespace of the relevant namespace. The namespace file handle layout is exposed as uapi and has a stable and extensible format. For now it simply contains the namespace identifier, the namespace type, and the inode number. The stable format means that userspace may construct its own namespace file handles without going through name_to_handle_at() as they are already allowed for pidfs and cgroup file handles" * tag 'namespace-6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (65 commits) ns: drop assert ns: move ns type into struct ns_common nstree: make struct ns_tree private ns: add ns_debug() ns: simplify ns_common_init() further cgroup: add missing ns_common include ns: use inode initializer for initial namespaces selftests/namespaces: verify initial namespace inode numbers ns: rename to __ns_ref nsfs: port to ns_ref_*() helpers net: port to ns_ref_*() helpers uts: port to ns_ref_*() helpers ipv4: use check_net() net: use check_net() net-sysfs: use check_net() user: port to ns_ref_*() helpers time: port to ns_ref_*() helpers pid: port to ns_ref_*() helpers ipc: port to ns_ref_*() helpers cgroup: port to ns_ref_*() helpers ...
219 lines
5.6 KiB
C
219 lines
5.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef __IPC_NAMESPACE_H__
|
|
#define __IPC_NAMESPACE_H__
|
|
|
|
#include <linux/err.h>
|
|
#include <linux/idr.h>
|
|
#include <linux/rwsem.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/nsproxy.h>
|
|
#include <linux/ns_common.h>
|
|
#include <linux/refcount.h>
|
|
#include <linux/rhashtable-types.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/percpu_counter.h>
|
|
|
|
/* Forward declaration: this header only uses pointers to struct user_namespace. */
struct user_namespace;
|
|
|
|
struct ipc_ids {
|
|
int in_use;
|
|
unsigned short seq;
|
|
struct rw_semaphore rwsem;
|
|
struct idr ipcs_idr;
|
|
int max_idx;
|
|
int last_idx; /* For wrap around detection */
|
|
#ifdef CONFIG_CHECKPOINT_RESTORE
|
|
int next_id;
|
|
#endif
|
|
struct rhashtable key_ht;
|
|
};
|
|
|
|
struct ipc_namespace {
|
|
struct ipc_ids ids[3];
|
|
|
|
int sem_ctls[4];
|
|
int used_sems;
|
|
|
|
unsigned int msg_ctlmax;
|
|
unsigned int msg_ctlmnb;
|
|
unsigned int msg_ctlmni;
|
|
struct percpu_counter percpu_msg_bytes;
|
|
struct percpu_counter percpu_msg_hdrs;
|
|
|
|
size_t shm_ctlmax;
|
|
size_t shm_ctlall;
|
|
unsigned long shm_tot;
|
|
int shm_ctlmni;
|
|
/*
|
|
* Defines whether IPC_RMID is forced for _all_ shm segments regardless
|
|
* of shmctl()
|
|
*/
|
|
int shm_rmid_forced;
|
|
|
|
struct notifier_block ipcns_nb;
|
|
|
|
/* The kern_mount of the mqueuefs sb. We take a ref on it */
|
|
struct vfsmount *mq_mnt;
|
|
|
|
/* # queues in this ns, protected by mq_lock */
|
|
unsigned int mq_queues_count;
|
|
|
|
/* next fields are set through sysctl */
|
|
unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */
|
|
unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */
|
|
unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */
|
|
unsigned int mq_msg_default;
|
|
unsigned int mq_msgsize_default;
|
|
|
|
struct ctl_table_set mq_set;
|
|
struct ctl_table_header *mq_sysctls;
|
|
|
|
struct ctl_table_set ipc_set;
|
|
struct ctl_table_header *ipc_sysctls;
|
|
|
|
/* user_ns which owns the ipc ns */
|
|
struct user_namespace *user_ns;
|
|
struct ucounts *ucounts;
|
|
|
|
struct llist_node mnt_llist;
|
|
|
|
struct ns_common ns;
|
|
} __randomize_layout;
|
|
|
|
extern struct ipc_namespace init_ipc_ns;
|
|
extern spinlock_t mq_lock;
|
|
|
|
#ifdef CONFIG_SYSVIPC
/* Real implementation is provided elsewhere when SysV IPC is built in. */
void shm_destroy_orphaned(struct ipc_namespace *ns);
#else /* CONFIG_SYSVIPC */
/* Without CONFIG_SYSVIPC this is an empty stub. */
static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}
#endif /* CONFIG_SYSVIPC */
|
|
|
|
#ifdef CONFIG_POSIX_MQUEUE
/* Defined elsewhere; returns an int status for the given namespace. */
int mq_init_ns(struct ipc_namespace *ns);
/*
 * POSIX Message Queue default values:
 *
 * MIN_*: Lowest value an admin can set the maximum unprivileged limit to
 * DFLT_*MAX: Default values for the maximum unprivileged limits
 * DFLT_{MSG,MSGSIZE}: Default values used when the user doesn't supply
 *   an attribute to the open call and the queue must be created
 * HARD_*: Highest value the maximums can be set to.  These are enforced
 *   on CAP_SYS_RESOURCE apps as well making them inviolate (so make them
 *   suitably high)
 *
 * POSIX Requirements:
 *   Per app minimum openable message queues - 8.  This does not map well
 *     to the fact that we limit the number of queues on a per namespace
 *     basis instead of a per app basis.  So, make the default high enough
 *     that no given app should have a hard time opening 8 queues.
 *   Minimum maximum for HARD_MSGMAX - 32767.  I bumped this to 65536.
 *   Minimum maximum for HARD_MSGSIZEMAX - POSIX is silent on this.  However,
 *     we have run into a situation where running applications in the wild
 *     require this to be at least 5MB, and preferably 10MB, so I set the
 *     value to 16MB in hopes that this user is the worst of the bunch and
 *     the new maximum will handle anyone else.  I may have to revisit this
 *     in the future.
 */
#define DFLT_QUEUESMAX		      256
#define MIN_MSGMAX			1
#define DFLT_MSG		       10U
#define DFLT_MSGMAX		       10
#define HARD_MSGMAX		    65536
#define MIN_MSGSIZEMAX		      128
#define DFLT_MSGSIZE		     8192U
#define DFLT_MSGSIZEMAX		     8192
#define HARD_MSGSIZEMAX	    (16*1024*1024)
#else
/* Without CONFIG_POSIX_MQUEUE there is nothing to set up: always returns 0. */
static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
#endif
|
|
|
|
#if defined(CONFIG_IPC_NS)
|
|
static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns)
|
|
{
|
|
return container_of(ns, struct ipc_namespace, ns);
|
|
}
|
|
|
|
/*
 * CONFIG_IPC_NS variant, defined outside this header.  May return an
 * ERR_PTR()-style value, as the !CONFIG_IPC_NS inline variant does.
 */
struct ipc_namespace *copy_ipcs(u64 flags,
	struct user_namespace *user_ns, struct ipc_namespace *ns);
|
|
|
|
/*
 * Take a reference on @ns via ns_ref_inc() unless @ns is NULL.
 * Returns @ns unchanged in either case.
 */
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
	if (ns)
		ns_ref_inc(ns);
	return ns;
}
|
|
|
|
/*
 * Try to take a reference on @ns.
 *
 * Returns @ns when it is non-NULL and ns_ref_get() reports success,
 * NULL otherwise.
 * NOTE(review): presumably ns_ref_get() fails once the count has already
 * dropped to zero -- confirm against its definition.
 */
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
	if (ns && ns_ref_get(ns))
		return ns;

	return NULL;
}
|
|
|
|
/* CONFIG_IPC_NS variant; the definition lives outside this header. */
void put_ipc_ns(struct ipc_namespace *ns);
|
|
#else
|
|
/*
 * !CONFIG_IPC_NS variant.
 *
 * A request for a new IPC namespace (CLONE_NEWIPC set in @flags) cannot be
 * honoured and yields ERR_PTR(-EINVAL); any other request hands @ns back
 * untouched.  @user_ns is not used.
 */
static inline struct ipc_namespace *copy_ipcs(u64 flags,
		struct user_namespace *user_ns, struct ipc_namespace *ns)
{
	if (flags & CLONE_NEWIPC)
		return ERR_PTR(-EINVAL);

	return ns;
}
|
|
|
|
/* !CONFIG_IPC_NS variant: no reference counting, @ns is returned as is. */
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
	return ns;
}
|
|
|
|
/* !CONFIG_IPC_NS variant: no reference counting, @ns is returned as is. */
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
	return ns;
}
|
|
|
|
/* !CONFIG_IPC_NS variant: nothing to release, so this is an empty stub. */
static inline void put_ipc_ns(struct ipc_namespace *ns)
{
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
|
|
|
|
/* CONFIG_POSIX_MQUEUE_SYSCTL variants; defined outside this header. */
void retire_mq_sysctls(struct ipc_namespace *ns);
bool setup_mq_sysctls(struct ipc_namespace *ns);
|
|
|
|
#else /* CONFIG_POSIX_MQUEUE_SYSCTL */
|
|
|
|
/* !CONFIG_POSIX_MQUEUE_SYSCTL variant: empty stub. */
static inline void retire_mq_sysctls(struct ipc_namespace *ns)
{
}
|
|
|
|
/* !CONFIG_POSIX_MQUEUE_SYSCTL variant: does nothing and always returns true. */
static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
{
	return true;
}
|
|
|
|
#endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
|
|
|
|
#ifdef CONFIG_SYSVIPC_SYSCTL
|
|
|
|
/* CONFIG_SYSVIPC_SYSCTL variants; defined outside this header. */
bool setup_ipc_sysctls(struct ipc_namespace *ns);
void retire_ipc_sysctls(struct ipc_namespace *ns);
|
|
|
|
#else /* CONFIG_SYSVIPC_SYSCTL */
|
|
|
|
/* !CONFIG_SYSVIPC_SYSCTL variant: empty stub. */
static inline void retire_ipc_sysctls(struct ipc_namespace *ns)
{
}
|
|
|
|
/* !CONFIG_SYSVIPC_SYSCTL variant: does nothing and always returns true. */
static inline bool setup_ipc_sysctls(struct ipc_namespace *ns)
{
	return true;
}
|
|
|
|
#endif /* CONFIG_SYSVIPC_SYSCTL */
|
|
#endif
|