sched: Make migrate_{en,dis}able() inline

For now, migrate_enable and migrate_disable are global functions, which
makes them hotspots in some cases. Take BPF for example: the calls to
migrate_enable and migrate_disable in the BPF trampoline can introduce
significant overhead, and the following is the 'perf top' output of FENTRY's
benchmark (./tools/testing/selftests/bpf/bench trig-fentry):

  54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k]
                 bpf_prog_2dcccf652aac1793_bench_trigger_fentry
  10.43% [kernel] [k] migrate_enable
  10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
  8.06% [kernel] [k] __bpf_prog_exit_recur
  4.11% libc.so.6 [.] syscall
  2.15% [kernel] [k] entry_SYSCALL_64
  1.48% [kernel] [k] memchr_inv
  1.32% [kernel] [k] fput
  1.16% [kernel] [k] _copy_to_user
  0.73% [kernel] [k] bpf_prog_test_run_raw_tp

So in this commit, we make migrate_enable/migrate_disable inline to obtain
better performance. The struct rq is defined internally in
kernel/sched/sched.h, and the field "nr_pinned" is accessed in
migrate_enable/migrate_disable, which makes it hard to make them inline.

Alexei Starovoitov suggests to generate the offset of "nr_pinned" in [1],
so we can define the migrate_enable/migrate_disable in
include/linux/sched.h and access "this_rq()->nr_pinned" with
"(void *)this_rq() + RQ_nr_pinned".

The offset of "nr_pinned" is generated in include/generated/rq-offsets.h
by kernel/sched/rq-offsets.c.

Generally speaking, we move the definition of migrate_enable and
migrate_disable to include/linux/sched.h from kernel/sched/core.c. The
call to __set_cpus_allowed_ptr() is left in ___migrate_enable().

The "struct rq" is not available in include/linux/sched.h, so we can't
access the "runqueues" with this_cpu_ptr(), as the compilation will fail
in this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr():
  typeof((ptr) + 0)

So we introduce the this_rq_raw() and access the runqueues with
arch_raw_cpu_ptr/PERCPU_PTR directly.

The variable "runqueues" is not visible to kernel modules, and exporting
it is not a good idea. As Peter Zijlstra advised in [2], we define and
export migrate_enable/migrate_disable in kernel/sched/core.c too, and use
those for the modules.

Before this patch, the performance of BPF FENTRY is:

  fentry         :  113.030 ± 0.149M/s
  fentry         :  112.501 ± 0.187M/s
  fentry         :  112.828 ± 0.267M/s
  fentry         :  115.287 ± 0.241M/s

After this patch, the performance of BPF FENTRY increases to:

  fentry         :  143.644 ± 0.670M/s
  fentry         :  149.764 ± 0.362M/s
  fentry         :  149.642 ± 0.156M/s
  fentry         :  145.263 ± 0.221M/s

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/bpf/CAADnVQ+5sEDKHdsJY5ZsfGDO_1SEhhQWHrt2SMBG5SYyQ+jt7w@mail.gmail.com/ [1]
Link: https://lore.kernel.org/all/20250819123214.GH4067720@noisy.programming.kicks-ass.net/ [2]
This commit is contained in:
Menglong Dong
2025-09-17 14:09:15 +08:00
committed by Peter Zijlstra
parent 88a90315a9
commit 378b770819
6 changed files with 152 additions and 53 deletions

13
Kbuild
View File

@@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s: $(timeconst-file) $(bounds-file)
$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
$(call filechk,offsets,__ASM_OFFSETS_H__)
# Generate rq-offsets.h
rq-offsets-file := include/generated/rq-offsets.h
targets += kernel/sched/rq-offsets.s
kernel/sched/rq-offsets.s: $(offsets-file)
$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE
$(call filechk,offsets,__RQ_OFFSETS_H__)
# Check for missing system calls
quiet_cmd_syscalls = CALL $<
cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags) $(missing_syscalls_flags)
PHONY += missing-syscalls
missing-syscalls: scripts/checksyscalls.sh $(offsets-file)
missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file)
$(call cmd,syscalls)
# Check the manual modification of atomic headers

View File

@@ -424,8 +424,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
* work-conserving schedulers.
*
*/
extern void migrate_disable(void);
extern void migrate_enable(void);
/**
* preempt_disable_nested - Disable preemption inside a normally preempt disabled section
@@ -471,7 +469,6 @@ static __always_inline void preempt_enable_nested(void)
DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace())
DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
#ifdef CONFIG_PREEMPT_DYNAMIC

View File

@@ -49,6 +49,9 @@
#include <linux/tracepoint-defs.h>
#include <linux/unwind_deferred_types.h>
#include <asm/kmap_size.h>
#ifndef COMPILE_OFFSETS
#include <generated/rq-offsets.h>
#endif
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -2317,4 +2320,114 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
#define alloc_tag_restore(_tag, _old) do {} while (0)
#endif
#ifndef MODULE
#ifndef COMPILE_OFFSETS
extern void ___migrate_enable(void);
struct rq;
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
/*
* The "struct rq" is not available here, so we can't access the
* "runqueues" with this_cpu_ptr(), as the compilation will fail in
* this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr():
* typeof((ptr) + 0)
*
* So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here.
*/
#ifdef CONFIG_SMP
#define this_rq_raw() arch_raw_cpu_ptr(&runqueues)
#else
#define this_rq_raw() PERCPU_PTR(&runqueues)
#endif
#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned))
static inline void __migrate_enable(void)
{
struct task_struct *p = current;
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Check both overflow from migrate_disable() and superfluous
* migrate_enable().
*/
if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
return;
#endif
if (p->migration_disabled > 1) {
p->migration_disabled--;
return;
}
/*
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
*/
guard(preempt)();
if (unlikely(p->cpus_ptr != &p->cpus_mask))
___migrate_enable();
/*
* Mustn't clear migration_disabled() until cpus_ptr points back at the
* regular cpus_mask, otherwise things that race (eg.
* select_fallback_rq) get confused.
*/
barrier();
p->migration_disabled = 0;
this_rq_pinned()--;
}
static inline void __migrate_disable(void)
{
struct task_struct *p = current;
if (p->migration_disabled) {
#ifdef CONFIG_DEBUG_PREEMPT
/*
*Warn about overflow half-way through the range.
*/
WARN_ON_ONCE((s16)p->migration_disabled < 0);
#endif
p->migration_disabled++;
return;
}
guard(preempt)();
this_rq_pinned()++;
p->migration_disabled = 1;
}
#else /* !COMPILE_OFFSETS */
static inline void __migrate_disable(void) { }
static inline void __migrate_enable(void) { }
#endif /* !COMPILE_OFFSETS */
/*
 * To avoid exporting the runqueues variable, migrate_enable and
 * migrate_disable are also defined and exported in kernel/sched/core.c,
 * and modules use those out-of-line versions. The macro
 * "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" is defined in kernel/sched/core.c.
 */
#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE
static inline void migrate_disable(void)
{
__migrate_disable();
}
static inline void migrate_enable(void)
{
__migrate_enable();
}
#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */
extern void migrate_disable(void);
extern void migrate_enable(void);
#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */
#else /* MODULE */
extern void migrate_disable(void);
extern void migrate_enable(void);
#endif /* MODULE */
DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
#endif

View File

@@ -23855,6 +23855,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
BTF_ID(func, ___migrate_enable)
BTF_ID(func, migrate_disable)
BTF_ID(func, migrate_enable)
#endif

View File

@@ -7,6 +7,8 @@
* Copyright (C) 1991-2002 Linus Torvalds
* Copyright (C) 1998-2024 Ingo Molnar, Red Hat
*/
#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
#include <linux/ktime_api.h>
@@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
__do_set_cpus_allowed(p, &ac);
}
void migrate_disable(void)
{
struct task_struct *p = current;
if (p->migration_disabled) {
#ifdef CONFIG_DEBUG_PREEMPT
/*
*Warn about overflow half-way through the range.
*/
WARN_ON_ONCE((s16)p->migration_disabled < 0);
#endif
p->migration_disabled++;
return;
}
guard(preempt)();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
}
EXPORT_SYMBOL_GPL(migrate_disable);
void migrate_enable(void)
void ___migrate_enable(void)
{
struct task_struct *p = current;
struct affinity_context ac = {
@@ -2410,35 +2391,19 @@ void migrate_enable(void)
.flags = SCA_MIGRATE_ENABLE,
};
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Check both overflow from migrate_disable() and superfluous
* migrate_enable().
*/
if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
return;
#endif
if (p->migration_disabled > 1) {
p->migration_disabled--;
return;
}
/*
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
*/
guard(preempt)();
if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &ac);
/*
* Mustn't clear migration_disabled() until cpus_ptr points back at the
* regular cpus_mask, otherwise things that race (eg.
* select_fallback_rq) get confused.
*/
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
}
EXPORT_SYMBOL_GPL(___migrate_enable);
void migrate_disable(void)
{
__migrate_disable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
void migrate_enable(void)
{
__migrate_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);

12
kernel/sched/rq-offsets.c Normal file
View File

@@ -0,0 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
#define COMPILE_OFFSETS
#include <linux/kbuild.h>
#include <linux/types.h>
#include "sched.h"
int main(void)
{
DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
return 0;
}