Merge tag 'arm64-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux

Pull arm64 fixes from Will Deacon:
 "There's more here than I would ideally like at this stage, but there's
  been a steady trickle of fixes and some of them took a few rounds of
  review.

  The bulk of the changes are fixing some fallout from the recent BBM
  level two support which allows the linear map to be split from block
  to page mappings at runtime, but inadvertently led to sleeping in
  atomic context on some paths where the linear map was already mapped
  with page granularity. The fix is simply to avoid splitting in those
  cases but the implementation of that is a little involved.

  The other interesting fix is addressing a catastrophic performance
  issue with our per-cpu atomics discovered by Paul in the SRCU locking
  code but which took some interactions with the hardware folks to
  resolve.

  Summary:

   - Avoid sleeping in atomic context when changing linear map
     permissions for DEBUG_PAGEALLOC or KFENCE

   - Rework printing of Spectre mitigation status to avoid hardlockup
     when enabling per-task mitigations on the context-switch path

   - Reject kernel modules when instruction patching fails either due to
     the DWARF-based SCS patching or because of an alternatives callback
     residing outside of the core kernel text

   - Propagate error when updating kernel memory permissions in kprobes

   - Drop pointless, incorrect message when enabling the ACPI SPCR
     console

   - Use value-returning LSE instructions for per-cpu atomics to reduce
     latency in SRCU locking routines"

* tag 'arm64-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux:
  arm64: Reject modules with internal alternative callbacks
  arm64: Fail module loading if dynamic SCS patching fails
  arm64: proton-pack: Fix hard lockup due to print in scheduler context
  arm64: proton-pack: Drop print when !CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY
  arm64: mm: Tidy up force_pte_mapping()
  arm64: mm: Optimize range_split_to_ptes()
  arm64: mm: Don't sleep in split_kernel_leaf_mapping() when in atomic context
  arm64: kprobes: check the return value of set_memory_rox()
  arm64: acpi: Drop message logging SPCR default console
  Revert "ACPI: Suppress misleading SPCR console message when SPCR table is absent"
  arm64: Use load LSE atomics for the non-return per-CPU atomic operations
This commit is contained in:
Linus Torvalds
2025-11-11 10:31:17 -08:00
15 changed files with 165 additions and 82 deletions

View File

@@ -26,9 +26,12 @@ void __init apply_alternatives_all(void);
bool alternative_is_applied(u16 cpucap);
#ifdef CONFIG_MODULES
void apply_alternatives_module(void *start, size_t length);
int apply_alternatives_module(void *start, size_t length);
#else
static inline void apply_alternatives_module(void *start, size_t length) { }
static inline int apply_alternatives_module(void *start, size_t length)
{
return 0;
}
#endif
void alt_cb_patch_nops(struct alt_instr *alt, __le32 *origptr,

View File

@@ -10,8 +10,6 @@
#include <asm/set_memory.h>
static inline bool arch_kfence_init_pool(void) { return true; }
static inline bool kfence_protect_page(unsigned long addr, bool protect)
{
set_memory_valid(addr, 1, !protect);
@@ -25,6 +23,7 @@ static inline bool arm64_kfence_can_set_direct_map(void)
{
return !kfence_early_init;
}
bool arch_kfence_init_pool(void);
#else /* CONFIG_KFENCE */
static inline bool arm64_kfence_can_set_direct_map(void) { return false; }
#endif /* CONFIG_KFENCE */

View File

@@ -77,7 +77,7 @@ __percpu_##name##_case_##sz(void *ptr, unsigned long val) \
" stxr" #sfx "\t%w[loop], %" #w "[tmp], %[ptr]\n" \
" cbnz %w[loop], 1b", \
/* LSE atomics */ \
#op_lse "\t%" #w "[val], %[ptr]\n" \
#op_lse "\t%" #w "[val], %" #w "[tmp], %[ptr]\n" \
__nops(3)) \
: [loop] "=&r" (loop), [tmp] "=&r" (tmp), \
[ptr] "+Q"(*(u##sz *)ptr) \
@@ -124,9 +124,16 @@ PERCPU_RW_OPS(8)
PERCPU_RW_OPS(16)
PERCPU_RW_OPS(32)
PERCPU_RW_OPS(64)
PERCPU_OP(add, add, stadd)
PERCPU_OP(andnot, bic, stclr)
PERCPU_OP(or, orr, stset)
/*
* Use value-returning atomics for CPU-local ops as they are more likely
* to execute "near" to the CPU (e.g. in L1$).
*
* https://lore.kernel.org/r/e7d539ed-ced0-4b96-8ecd-048a5b803b85@paulmck-laptop
*/
PERCPU_OP(add, add, ldadd)
PERCPU_OP(andnot, bic, ldclr)
PERCPU_OP(or, orr, ldset)
PERCPU_RET_OP(add, add, ldadd)
#undef PERCPU_RW_OPS

View File

@@ -53,7 +53,7 @@ enum {
EDYNSCS_INVALID_CFA_OPCODE = 4,
};
int __pi_scs_patch(const u8 eh_frame[], int size);
int __pi_scs_patch(const u8 eh_frame[], int size, bool skip_dry_run);
#endif /* __ASSEMBLY __ */

View File

@@ -117,6 +117,7 @@ void spectre_bhb_patch_wa3(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
void spectre_bhb_patch_clearbhb(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
void spectre_print_disabled_mitigations(void);
#endif /* __ASSEMBLY__ */
#endif /* __ASM_SPECTRE_H */

View File

@@ -197,8 +197,6 @@ out:
*/
void __init acpi_boot_table_init(void)
{
int ret;
/*
* Enable ACPI instead of device tree unless
* - ACPI has been disabled explicitly (acpi=off), or
@@ -252,12 +250,8 @@ done:
* behaviour, use acpi=nospcr to disable console in ACPI SPCR
* table as default serial console.
*/
ret = acpi_parse_spcr(earlycon_acpi_spcr_enable,
acpi_parse_spcr(earlycon_acpi_spcr_enable,
!param_acpi_nospcr);
if (!ret || param_acpi_nospcr || !IS_ENABLED(CONFIG_ACPI_SPCR_TABLE))
pr_info("Use ACPI SPCR as default console: No\n");
else
pr_info("Use ACPI SPCR as default console: Yes\n");
if (IS_ENABLED(CONFIG_ACPI_BGRT))
acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);

View File

@@ -139,9 +139,9 @@ static noinstr void clean_dcache_range_nopatch(u64 start, u64 end)
} while (cur += d_size, cur < end);
}
static void __apply_alternatives(const struct alt_region *region,
bool is_module,
unsigned long *cpucap_mask)
static int __apply_alternatives(const struct alt_region *region,
bool is_module,
unsigned long *cpucap_mask)
{
struct alt_instr *alt;
__le32 *origptr, *updptr;
@@ -166,10 +166,13 @@ static void __apply_alternatives(const struct alt_region *region,
updptr = is_module ? origptr : lm_alias(origptr);
nr_inst = alt->orig_len / AARCH64_INSN_SIZE;
if (ALT_HAS_CB(alt))
if (ALT_HAS_CB(alt)) {
alt_cb = ALT_REPL_PTR(alt);
else
if (is_module && !core_kernel_text((unsigned long)alt_cb))
return -ENOEXEC;
} else {
alt_cb = patch_alternative;
}
alt_cb(alt, origptr, updptr, nr_inst);
@@ -193,6 +196,8 @@ static void __apply_alternatives(const struct alt_region *region,
bitmap_and(applied_alternatives, applied_alternatives,
system_cpucaps, ARM64_NCAPS);
}
return 0;
}
static void __init apply_alternatives_vdso(void)
@@ -277,7 +282,7 @@ void __init apply_boot_alternatives(void)
}
#ifdef CONFIG_MODULES
void apply_alternatives_module(void *start, size_t length)
int apply_alternatives_module(void *start, size_t length)
{
struct alt_region region = {
.begin = start,
@@ -287,7 +292,7 @@ void apply_alternatives_module(void *start, size_t length)
bitmap_fill(all_capabilities, ARM64_NCAPS);
__apply_alternatives(&region, true, &all_capabilities[0]);
return __apply_alternatives(&region, true, &all_capabilities[0]);
}
#endif

View File

@@ -95,6 +95,7 @@
#include <asm/vectors.h>
#include <asm/virt.h>
#include <asm/spectre.h>
/* Kernel representation of AT_HWCAP and AT_HWCAP2 */
static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly;
@@ -3875,6 +3876,11 @@ static void __init setup_system_capabilities(void)
*/
if (system_uses_ttbr0_pan())
pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n");
/*
* Report Spectre mitigations status.
*/
spectre_print_disabled_mitigations();
}
void __init setup_system_features(void)

View File

@@ -489,16 +489,29 @@ int module_finalize(const Elf_Ehdr *hdr,
int ret;
s = find_section(hdr, sechdrs, ".altinstructions");
if (s)
apply_alternatives_module((void *)s->sh_addr, s->sh_size);
if (s) {
ret = apply_alternatives_module((void *)s->sh_addr, s->sh_size);
if (ret < 0) {
pr_err("module %s: error occurred when applying alternatives\n", me->name);
return ret;
}
}
if (scs_is_dynamic()) {
s = find_section(hdr, sechdrs, ".init.eh_frame");
if (s) {
ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size);
if (ret)
/*
* Because we can reject modules that are malformed
* so SCS patching fails, skip dry run and try to patch
* it in place. If patching fails, the module would not
* be loaded anyway.
*/
ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size, true);
if (ret) {
pr_err("module %s: error occurred during dynamic SCS patching (%d)\n",
me->name, ret);
return -ENOEXEC;
}
}
}

View File

@@ -104,7 +104,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level)
if (enable_scs) {
scs_patch(__eh_frame_start + va_offset,
__eh_frame_end - __eh_frame_start);
__eh_frame_end - __eh_frame_start, false);
asm("ic ialluis");
dynamic_scs_is_enabled = true;

View File

@@ -225,7 +225,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame,
return 0;
}
int scs_patch(const u8 eh_frame[], int size)
int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run)
{
int code_alignment_factor = 1;
bool fde_use_sdata8 = false;
@@ -277,11 +277,13 @@ int scs_patch(const u8 eh_frame[], int size)
}
} else {
ret = scs_handle_fde_frame(frame, code_alignment_factor,
fde_use_sdata8, true);
fde_use_sdata8, !skip_dry_run);
if (ret)
return ret;
scs_handle_fde_frame(frame, code_alignment_factor,
fde_use_sdata8, false);
if (!skip_dry_run)
scs_handle_fde_frame(frame, code_alignment_factor,
fde_use_sdata8, false);
}
p += sizeof(frame->size) + frame->size;

View File

@@ -27,7 +27,7 @@ extern pgd_t init_pg_dir[], init_pg_end[];
void init_feature_override(u64 boot_status, const void *fdt, int chosen);
u64 kaslr_early_init(void *fdt, int chosen);
void relocate_kernel(u64 offset);
int scs_patch(const u8 eh_frame[], int size);
int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run);
void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa,
pgprot_t prot, int level, pte_t *tbl, bool may_use_cont,

View File

@@ -49,7 +49,10 @@ void *alloc_insn_page(void)
addr = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
if (!addr)
return NULL;
set_memory_rox((unsigned long)addr, 1);
if (set_memory_rox((unsigned long)addr, 1)) {
execmem_free(addr);
return NULL;
}
return addr;
}

View File

@@ -91,12 +91,7 @@ early_param("nospectre_v2", parse_spectre_v2_param);
static bool spectre_v2_mitigations_off(void)
{
bool ret = __nospectre_v2 || cpu_mitigations_off();
if (ret)
pr_info_once("spectre-v2 mitigation disabled by command line option\n");
return ret;
return __nospectre_v2 || cpu_mitigations_off();
}
static const char *get_bhb_affected_string(enum mitigation_state bhb_state)
@@ -421,13 +416,8 @@ early_param("ssbd", parse_spectre_v4_param);
*/
static bool spectre_v4_mitigations_off(void)
{
bool ret = cpu_mitigations_off() ||
__spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;
if (ret)
pr_info_once("spectre-v4 mitigation disabled by command-line option\n");
return ret;
return cpu_mitigations_off() ||
__spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;
}
/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */
@@ -1042,10 +1032,6 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) {
/* No point mitigating Spectre-BHB alone. */
} else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) {
pr_info_once("spectre-bhb mitigation disabled by compile time option\n");
} else if (cpu_mitigations_off() || __nospectre_bhb) {
pr_info_once("spectre-bhb mitigation disabled by command line option\n");
} else if (supports_ecbhb(SCOPE_LOCAL_CPU)) {
state = SPECTRE_MITIGATED;
set_bit(BHB_HW, &system_bhb_mitigations);
@@ -1199,3 +1185,18 @@ void unpriv_ebpf_notify(int new_state)
pr_err("WARNING: %s", EBPF_WARN);
}
#endif
/*
 * Emit one pr_info() line for each of spectre-v2, spectre-v4 and spectre-bhb
 * whose "mitigations off" condition currently evaluates true.
 *
 * The v2 and v4 checks go through spectre_v2_mitigations_off() and
 * spectre_v4_mitigations_off(); the bhb check tests __nospectre_bhb and
 * cpu_mitigations_off() directly. Nothing is printed for a mitigation whose
 * condition is false.
 */
void spectre_print_disabled_mitigations(void)
{
/* Keep a single copy of the common message suffix to avoid duplication. */
const char *spectre_disabled_suffix = "mitigation disabled by command-line option\n";
if (spectre_v2_mitigations_off())
pr_info("spectre-v2 %s", spectre_disabled_suffix);
if (spectre_v4_mitigations_off())
pr_info("spectre-v4 %s", spectre_disabled_suffix);
if (__nospectre_bhb || cpu_mitigations_off())
pr_info("spectre-bhb %s", spectre_disabled_suffix);
}

View File

@@ -708,6 +708,30 @@ out:
return ret;
}
/*
 * Decide whether pte-granular mapping is forced (presumably for the linear
 * map - confirm against the callers).
 *
 * Evaluation order:
 *   1. debug_pagealloc_enabled()          -> true, regardless of BBML2.
 *   2. BBML2-noabort is available          -> false.
 *   3. otherwise true if any of rodata_full,
 *      arm64_kfence_can_set_direct_map() or is_realm_world() holds.
 */
static inline bool force_pte_mapping(void)
{
/*
 * Use the system-wide BBML2 check once system_capabilities_finalized()
 * reports true; before that, fall back to cpu_supports_bbml2_noabort().
 */
const bool bbml2 = system_capabilities_finalized() ?
system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
if (debug_pagealloc_enabled())
return true;
if (bbml2)
return false;
return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world();
}
/*
 * Report whether splitting a leaf mapping can be needed at all.
 *
 * Returns false when the system lacks BBML2_NOABORT, and otherwise the
 * negation of force_pte_mapping(): if pte mapping is forced there is nothing
 * left to split.
 */
static inline bool split_leaf_mapping_possible(void)
{
/*
* !BBML2_NOABORT systems should never run into scenarios where we would
* have to split. So exit early and let calling code detect it and raise
* a warning.
*/
if (!system_supports_bbml2_noabort())
return false;
return !force_pte_mapping();
}
static DEFINE_MUTEX(pgtable_split_lock);
int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
@@ -715,12 +739,11 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
int ret;
/*
* !BBML2_NOABORT systems should not be trying to change permissions on
* anything that is not pte-mapped in the first place. Just return early
* and let the permission change code raise a warning if not already
* pte-mapped.
* Exit early if the region is within a pte-mapped area or if we can't
* split. For the latter case, the permission change code will raise a
* warning if not already pte-mapped.
*/
if (!system_supports_bbml2_noabort())
if (!split_leaf_mapping_possible() || is_kfence_address((void *)start))
return 0;
/*
@@ -758,30 +781,30 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
return ret;
}
static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
unsigned long next,
struct mm_walk *walk)
static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
gfp_t gfp = *(gfp_t *)walk->private;
pud_t pud = pudp_get(pudp);
int ret = 0;
if (pud_leaf(pud))
ret = split_pud(pudp, pud, GFP_ATOMIC, false);
ret = split_pud(pudp, pud, gfp, false);
return ret;
}
static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
unsigned long next,
struct mm_walk *walk)
static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
gfp_t gfp = *(gfp_t *)walk->private;
pmd_t pmd = pmdp_get(pmdp);
int ret = 0;
if (pmd_leaf(pmd)) {
if (pmd_cont(pmd))
split_contpmd(pmdp);
ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false);
ret = split_pmd(pmdp, pmd, gfp, false);
/*
* We have split the pmd directly to ptes so there is no need to
@@ -793,9 +816,8 @@ static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
return ret;
}
static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
unsigned long next,
struct mm_walk *walk)
static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
pte_t pte = __ptep_get(ptep);
@@ -805,12 +827,24 @@ static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
return 0;
}
static const struct mm_walk_ops split_to_ptes_ops __initconst = {
static const struct mm_walk_ops split_to_ptes_ops = {
.pud_entry = split_to_ptes_pud_entry,
.pmd_entry = split_to_ptes_pmd_entry,
.pte_entry = split_to_ptes_pte_entry,
};
/*
 * Walk the kernel page tables over the range start..end with
 * split_to_ptes_ops, inside an arch_enter_lazy_mmu_mode() /
 * arch_leave_lazy_mmu_mode() pair.
 *
 * @gfp is handed to the walk by address (last argument); the pud/pmd entry
 * callbacks read it back through walk->private as the allocation flags for
 * split_pud()/split_pmd().
 *
 * Returns the value returned by walk_kernel_page_table_range_lockless().
 */
static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp)
{
int ret;
arch_enter_lazy_mmu_mode();
/* &gfp points at this function's parameter; it is only used during the walk. */
ret = walk_kernel_page_table_range_lockless(start, end,
&split_to_ptes_ops, NULL, &gfp);
arch_leave_lazy_mmu_mode();
return ret;
}
static bool linear_map_requires_bbml2 __initdata;
u32 idmap_kpti_bbml2_flag;
@@ -847,11 +881,9 @@ static int __init linear_map_split_to_ptes(void *__unused)
* PTE. The kernel alias remains static throughout runtime so
* can continue to be safely mapped with large mappings.
*/
ret = walk_kernel_page_table_range_lockless(lstart, kstart,
&split_to_ptes_ops, NULL, NULL);
ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC);
if (!ret)
ret = walk_kernel_page_table_range_lockless(kend, lend,
&split_to_ptes_ops, NULL, NULL);
ret = range_split_to_ptes(kend, lend, GFP_ATOMIC);
if (ret)
panic("Failed to split linear map\n");
flush_tlb_kernel_range(lstart, lend);
@@ -1002,6 +1034,33 @@ static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
__kfence_pool = phys_to_virt(kfence_pool);
}
/*
 * Split the mappings covering the KFENCE pool
 * (__kfence_pool .. __kfence_pool + KFENCE_POOL_SIZE) down to ptes when that
 * is still required.
 *
 * Returns true when no split is needed (splitting is not possible, or
 * kfence_early_init is set) or when range_split_to_ptes() returned 0;
 * returns false when the split reported an error.
 */
bool arch_kfence_init_pool(void)
{
unsigned long start = (unsigned long)__kfence_pool;
unsigned long end = start + KFENCE_POOL_SIZE;
int ret;
/* Exit early if we know the linear map is already pte-mapped. */
if (!split_leaf_mapping_possible())
return true;
/* Kfence pool is already pte-mapped for the early init case. */
if (kfence_early_init)
return true;
/* The split runs under pgtable_split_lock and allocates with GFP_PGTABLE_KERNEL. */
mutex_lock(&pgtable_split_lock);
ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL);
mutex_unlock(&pgtable_split_lock);
/*
* Since the system supports bbml2_noabort, tlb invalidation is not
* required here; the pgtable mappings have been split to pte but larger
* entries may safely linger in the TLB.
*/
return !ret;
}
#else /* CONFIG_KFENCE */
static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
@@ -1009,16 +1068,6 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
#endif /* CONFIG_KFENCE */
static inline bool force_pte_mapping(void)
{
bool bbml2 = system_capabilities_finalized() ?
system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() ||
is_realm_world())) ||
debug_pagealloc_enabled();
}
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);