procfs: add "pidns" mount option

Since the introduction of pid namespaces, their interaction with procfs
has been entirely implicit in ways that require a lot of dancing around
by programs that need to construct sandboxes with different PID
namespaces.

Being able to explicitly specify the pid namespace to use when
constructing a procfs super block will allow programs to no longer need
to fork off a process which then does unshare(2) / setns(2) and
forks again in order to construct a procfs in a pidns.

So, provide a "pidns" mount option which allows such users to just
explicitly state which pid namespace they want that procfs instance to
use. This interface can be used with fsconfig(2) either with a file
descriptor or a path:

  fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd);
  fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0);

or with classic mount(2) / mount(8):

  // mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc
  mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid");

As this new API is effectively shorthand for setns(2) followed by
mount(2), the permission model for this mirrors pidns_install() to avoid
opening up new attack surfaces by loosening the existing permission
model.

In order to avoid having to RCU-protect all users of proc_pid_ns() (to
avoid UAFs), attempting to reconfigure an existing procfs instance's pid
namespace will error out with -EBUSY. Creating new procfs instances is
quite cheap, so this should not be an impediment to most users, and lets
us avoid a lot of churn in fs/proc/* for a feature that it seems
unlikely userspace would use.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Link: https://lore.kernel.org/20250805-procfs-pidns-api-v4-2-705f984940e7@cyphar.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Aleksa Sarai
2025-08-05 15:45:09 +10:00
committed by Christian Brauner
parent 7df8782012
commit fe49652e36
2 changed files with 100 additions and 6 deletions

View File

@@ -2362,6 +2362,7 @@ The following mount options are supported:
hidepid= Set /proc/<pid>/ access mode.
gid= Set the group authorized to learn processes information.
subset= Show only the specified subset of procfs.
pidns= Specify the pid namespace used by this procfs.
========= ========================================================
hidepid=off or hidepid=0 means classic mode - everybody may access all
@@ -2394,6 +2395,13 @@ information about processes information, just add identd to this group.
subset=pid hides all top level files and directories in the procfs that
are not related to tasks.
pidns= specifies a pid namespace (either as a string path to something like
`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that
will be used by the procfs instance when translating pids. By default, procfs
will use the calling process's active pid namespace. Note that the pid
namespace of an existing procfs instance cannot be modified (attempting to do
so will give an `-EBUSY` error).
Chapter 5: Filesystem behavior
==============================

View File

@@ -38,12 +38,14 @@ enum proc_param {
Opt_gid,
Opt_hidepid,
Opt_subset,
Opt_pidns,
};
/*
 * Mount parameters accepted by procfs. Each entry pairs an option name with
 * the Opt_* value that identifies it after parsing. "pidns" is declared with
 * fsparam_file_or_string, so it can be supplied either as a file or as a
 * string.
 */
static const struct fs_parameter_spec proc_fs_parameters[] = {
/*
 * NOTE(review): "gid" is listed twice with identical contents. This looks
 * like the removed/added pair of a whitespace-only change in the diff this
 * text was taken from -- confirm that only one entry is intended.
 */
fsparam_u32("gid", Opt_gid),
fsparam_u32("gid", Opt_gid),
fsparam_string("hidepid", Opt_hidepid),
fsparam_string("subset", Opt_subset),
fsparam_file_or_string("pidns", Opt_pidns),
{}
};
@@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
return 0;
}
#ifdef CONFIG_PID_NS
/*
 * Handle the "pidns" mount parameter: resolve the argument to a pid namespace
 * and record it in the procfs filesystem context.
 *
 * @fc:     filesystem context being configured. fc->fs_private is the
 *          proc_fs_context whose pid_ns is replaced; fc->user_ns is replaced
 *          with the user namespace of the chosen pid namespace.
 * @param:  the parameter value, either a file (fs_value_is_file) or a path
 *          string (fs_value_is_string) that is opened read-only here.
 * @result: not used by this function.
 *
 * Returns 0 on success. On failure returns the error from obtaining the file
 * (-EBADF if no file was obtained at all), -EPERM if the caller lacks
 * CAP_SYS_ADMIN in the target's user namespace, or whatever invalfc() returns
 * when the file is not an nsfs file, is not a pid namespace file, or fails the
 * pidns_is_ancestor() check.
 */
static int proc_parse_pidns_param(struct fs_context *fc,
struct fs_parameter *param,
struct fs_parse_result *result)
{
struct proc_fs_context *ctx = fc->fs_private;
struct pid_namespace *target, *active = task_active_pid_ns(current);
struct ns_common *ns;
/*
 * Annotated with __free(fput): the file reference held here is dropped
 * automatically when the function returns, on every path below.
 */
struct file *ns_filp __free(fput) = NULL;
switch (param->type) {
case fs_value_is_file:
/* came through fsconfig, steal the file reference */
ns_filp = no_free_ptr(param->file);
break;
case fs_value_is_string:
ns_filp = filp_open(param->string, O_RDONLY, 0);
break;
default:
/* Unexpected parameter type: warn once, ns_filp stays NULL. */
WARN_ON_ONCE(true);
break;
}
/* Fold the "no file at all" case into the common error path. */
if (!ns_filp)
ns_filp = ERR_PTR(-EBADF);
if (IS_ERR(ns_filp)) {
errorfc(fc, "could not get file from pidns argument");
return PTR_ERR(ns_filp);
}
/* The file must be a namespace file, and specifically a pid namespace. */
if (!proc_ns_file(ns_filp))
return invalfc(fc, "pidns argument is not an nsfs file");
ns = get_proc_ns(file_inode(ns_filp));
if (ns->ops->type != CLONE_NEWPID)
return invalfc(fc, "pidns argument is not a pidns file");
target = container_of(ns, struct pid_namespace, ns);
/*
 * pidns= is shorthand for joining the pidns to get a fsopen fd, so the
 * permission model should be the same as pidns_install().
 */
if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
errorfc(fc, "insufficient permissions to set pidns");
return -EPERM;
}
/*
 * Relative to the caller's active pid namespace, only a descendant may be
 * selected (per the error message below).
 * NOTE(review): presumably the active namespace itself also passes this
 * check -- confirm against pidns_is_ancestor().
 */
if (!pidns_is_ancestor(target, active))
return invalfc(fc, "cannot set pidns to non-descendant pidns");
/*
 * Swap the context over to the target: drop the reference on the
 * previously configured pid namespace and take one on the target, then do
 * the same for the context's user namespace.
 */
put_pid_ns(ctx->pid_ns);
ctx->pid_ns = get_pid_ns(target);
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
return 0;
}
#endif /* CONFIG_PID_NS */
static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct proc_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
int opt;
int opt, err;
opt = fs_parse(fc, proc_fs_parameters, param, &result);
if (opt < 0)
@@ -125,15 +182,39 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_hidepid:
if (proc_parse_hidepid_param(fc, param))
return -EINVAL;
err = proc_parse_hidepid_param(fc, param);
if (err)
return err;
break;
case Opt_subset:
if (proc_parse_subset_param(fc, param->string) < 0)
return -EINVAL;
err = proc_parse_subset_param(fc, param->string);
if (err)
return err;
break;
case Opt_pidns:
#ifdef CONFIG_PID_NS
/*
* We would have to RCU-protect every proc_pid_ns() or
* proc_sb_info() access if we allowed this to be reconfigured
* for an existing procfs instance. Luckily, procfs instances
* are cheap to create, and mount-beneath would let you
* atomically replace an instance even with overmounts.
*/
if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
errorfc(fc, "cannot reconfigure pidns for existing procfs");
return -EBUSY;
}
err = proc_parse_pidns_param(fc, param, &result);
if (err)
return err;
break;
#else
errorfc(fc, "pidns mount flag not supported on this system");
return -EOPNOTSUPP;
#endif
default:
return -EINVAL;
}
@@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
fs_info->hide_pid = ctx->hidepid;
if (ctx->mask & (1 << Opt_subset))
fs_info->pidonly = ctx->pidonly;
if (ctx->mask & (1 << Opt_pidns) &&
!WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) {
put_pid_ns(fs_info->pid_ns);
fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
}
}
static int proc_fill_super(struct super_block *s, struct fs_context *fc)