Files
linux/mm/mmap_lock.c
Liam R. Howlett 270065f514 mm/mmap_lock: reset maple state on lock_vma_under_rcu() retry
The retry in lock_vma_under_rcu() drops the rcu read lock before
reacquiring the lock and trying again.  This may cause a use-after-free if
the maple node the maple state was using was freed.

The maple state is protected by the rcu read lock.  When the lock is
dropped, the state cannot be reused as it tracks pointers to objects that
may be freed during the time where the lock was not held.

Any time the rcu read lock is dropped, the maple state must be
invalidated.  Resetting the address and state to MA_START is the safest
course of action, which will result in the next operation starting from
the top of the tree.

Prior to commit 0b16f8bed1 ("mm: change vma_start_read() to drop RCU
lock on failure"), vma_start_read() would drop rcu read lock and return
NULL, so the retry would not have happened.  However, now that
vma_start_read() drops rcu read lock on failure followed by a retry, we
may end up using a freed maple tree node cached in the maple state.

[surenb@google.com: changelog alteration]
  Link: https://lkml.kernel.org/r/CAJuCfpEWMD-Z1j=nPYHcQW4F7E2Wka09KTXzGv7VE7oW1S8hcw@mail.gmail.com
Link: https://lkml.kernel.org/r/20251111215605.1721380-1-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Fixes: 0b16f8bed1 ("mm: change vma_start_read() to drop RCU lock on failure")
Reported-by: syzbot+131f9eb2b5807573275c@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=131f9eb2b5807573275c
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 14:25:17 -08:00

499 lines
14 KiB
C

// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>
#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
#ifdef CONFIG_TRACING
/*
* Trace calls must be in a separate file, as otherwise there's a circular
* dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
*/
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
bool success)
{
trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */
#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
/* Additional refcnt if the vma is attached. */
if (!detaching)
tgt_refcnt++;
/*
* If vma is detached then only vma_mark_attached() can raise the
* vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
*/
if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
return false;
rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
refcount_read(&vma->vm_refcnt) == tgt_refcnt,
TASK_UNINTERRUPTIBLE);
lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
return true;
}
static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}
void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
bool locked;
/*
* __vma_enter_locked() returns false immediately if the vma is not
* attached, otherwise it waits until refcnt is indicating that vma
* is attached with no readers.
*/
locked = __vma_enter_locked(vma, false);
/*
* We should use WRITE_ONCE() here because we can have concurrent reads
* from the early lockless pessimistic check in vma_start_read().
* We don't really care about the correctness of that early check, but
* we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
*/
WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
if (locked) {
bool detached;
__vma_exit_locked(vma, &detached);
WARN_ON_ONCE(detached); /* vma should remain attached */
}
}
EXPORT_SYMBOL_GPL(__vma_start_write);
void vma_mark_detached(struct vm_area_struct *vma)
{
vma_assert_write_locked(vma);
vma_assert_attached(vma);
/*
* We are the only writer, so no need to use vma_refcount_put().
* The condition below is unlikely because the vma has been already
* write-locked and readers can increment vm_refcnt only temporarily
* before they check vm_lock_seq, realize the vma is locked and drop
* back the vm_refcnt. That is a narrow window for observing a raised
* vm_refcnt.
*/
if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
/* Wait until vma is detached with no readers. */
if (__vma_enter_locked(vma, true)) {
bool detached;
__vma_exit_locked(vma, &detached);
WARN_ON_ONCE(!detached);
}
}
}
/*
* Try to read-lock a vma. The function is allowed to occasionally yield false
* locked result to avoid performance overhead, in which case we fall back to
* using mmap_lock. The function should never yield false unlocked result.
* False locked result is possible if mm_lock_seq overflows or if vma gets
* reused and attached to a different mm before we lock it.
* Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
* detached.
*
* IMPORTANT: RCU lock must be held upon entering the function, but upon error
* IT IS RELEASED. The caller must handle this correctly.
*/
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
struct vm_area_struct *vma)
{
struct mm_struct *other_mm;
int oldcnt;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
/*
* Check before locking. A race might cause false locked result.
* We can use READ_ONCE() for the mm_lock_seq here, and don't need
* ACQUIRE semantics, because this is just a lockless check whose result
* we don't rely on for anything - the mm_lock_seq read against which we
* need ordering is below.
*/
if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
vma = NULL;
goto err;
}
/*
* If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
* will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
* Acquire fence is required here to avoid reordering against later
* vm_lock_seq check and checks inside lock_vma_under_rcu().
*/
if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
VMA_REF_LIMIT))) {
/* return EAGAIN if vma got detached from under us */
vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
goto err;
}
rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
if (unlikely(vma->vm_mm != mm))
goto err_unstable;
/*
* Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
* False unlocked result is impossible because we modify and check
* vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
* modification invalidates all existing locks.
*
* We must use ACQUIRE semantics for the mm_lock_seq so that if we are
* racing with vma_end_write_all(), we only start reading from the VMA
* after it has been unlocked.
* This pairs with RELEASE semantics in vma_end_write_all().
*/
if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
vma_refcount_put(vma);
vma = NULL;
goto err;
}
return vma;
err:
rcu_read_unlock();
return vma;
err_unstable:
/*
* If vma got attached to another mm from under us, that mm is not
* stable and can be freed in the narrow window after vma->vm_refcnt
* is dropped and before rcuwait_wake_up(mm) is called. Grab it before
* releasing vma->vm_refcnt.
*/
other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */
/* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
rcu_read_unlock();
mmgrab(other_mm);
vma_refcount_put(vma);
mmdrop(other_mm);
return NULL;
}
/*
* Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
* stable and not isolated. If the VMA is not found or is being modified the
* function returns NULL.
*/
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
unsigned long address)
{
MA_STATE(mas, &mm->mm_mt, address, address);
struct vm_area_struct *vma;
retry:
rcu_read_lock();
vma = mas_walk(&mas);
if (!vma) {
rcu_read_unlock();
goto inval;
}
vma = vma_start_read(mm, vma);
if (IS_ERR_OR_NULL(vma)) {
/* Check if the VMA got isolated after we found it */
if (PTR_ERR(vma) == -EAGAIN) {
count_vm_vma_lock_event(VMA_LOCK_MISS);
/* The area was replaced with another one */
mas_set(&mas, address);
goto retry;
}
/* Failed to lock the VMA */
goto inval;
}
/*
* At this point, we have a stable reference to a VMA: The VMA is
* locked and we know it hasn't already been isolated.
* From here on, we can access the VMA without worrying about which
* fields are accessible for RCU readers.
*/
rcu_read_unlock();
/* Check if the vma we locked is the right one. */
if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
vma_end_read(vma);
goto inval;
}
return vma;
inval:
count_vm_vma_lock_event(VMA_LOCK_ABORT);
return NULL;
}
static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
struct vma_iterator *vmi,
unsigned long from_addr)
{
struct vm_area_struct *vma;
int ret;
ret = mmap_read_lock_killable(mm);
if (ret)
return ERR_PTR(ret);
/* Lookup the vma at the last position again under mmap_read_lock */
vma_iter_set(vmi, from_addr);
vma = vma_next(vmi);
if (vma) {
/* Very unlikely vma->vm_refcnt overflow case */
if (unlikely(!vma_start_read_locked(vma)))
vma = ERR_PTR(-EAGAIN);
}
mmap_read_unlock(mm);
return vma;
}
struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
struct vma_iterator *vmi,
unsigned long from_addr)
{
struct vm_area_struct *vma;
unsigned int mm_wr_seq;
bool mmap_unlocked;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
/* Start mmap_lock speculation in case we need to verify the vma later */
mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
vma = vma_next(vmi);
if (!vma)
return NULL;
vma = vma_start_read(mm, vma);
if (IS_ERR_OR_NULL(vma)) {
/*
* Retry immediately if the vma gets detached from under us.
* Infinite loop should not happen because the vma we find will
* have to be constantly knocked out from under us.
*/
if (PTR_ERR(vma) == -EAGAIN) {
/* reset to search from the last address */
rcu_read_lock();
vma_iter_set(vmi, from_addr);
goto retry;
}
goto fallback;
}
/* Verify the vma is not behind the last search position. */
if (unlikely(from_addr >= vma->vm_end))
goto fallback_unlock;
/*
* vma can be ahead of the last search position but we need to verify
* it was not shrunk after we found it and another vma has not been
* installed ahead of it. Otherwise we might observe a gap that should
* not be there.
*/
if (from_addr < vma->vm_start) {
/* Verify only if the address space might have changed since vma lookup. */
if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
vma_iter_set(vmi, from_addr);
if (vma != vma_next(vmi))
goto fallback_unlock;
}
}
return vma;
fallback_unlock:
rcu_read_unlock();
vma_end_read(vma);
fallback:
vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
rcu_read_lock();
/* Reinitialize the iterator after re-entering rcu read section */
vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);
return vma;
}
#endif /* CONFIG_PER_VMA_LOCK */
#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
if (likely(mmap_read_trylock(mm)))
return true;
if (regs && !user_mode(regs)) {
unsigned long ip = exception_ip(regs);
if (!search_exception_tables(ip))
return false;
}
return !mmap_read_lock_killable(mm);
}
static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
/*
* We don't have this operation yet.
*
* It should be easy enough to do: it's basically a
* atomic_long_try_cmpxchg_acquire()
* from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
* it also needs the proper lockdep magic etc.
*/
return false;
}
static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
mmap_read_unlock(mm);
if (regs && !user_mode(regs)) {
unsigned long ip = exception_ip(regs);
if (!search_exception_tables(ip))
return false;
}
return !mmap_write_lock_killable(mm);
}
/*
* Helper for page fault handling.
*
* This is kind of equivalent to "mmap_read_lock()" followed
* by "find_extend_vma()", except it's a lot more careful about
* the locking (and will drop the lock on failure).
*
* For example, if we have a kernel bug that causes a page
* fault, we don't want to just use mmap_read_lock() to get
* the mm lock, because that would deadlock if the bug were
* to happen while we're holding the mm lock for writing.
*
* So this checks the exception tables on kernel faults in
* order to only do this all for instructions that are actually
* expected to fault.
*
* We can also actually take the mm lock for writing if we
* need to extend the vma, which helps the VM layer a lot.
*/
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
unsigned long addr, struct pt_regs *regs)
{
struct vm_area_struct *vma;
if (!get_mmap_lock_carefully(mm, regs))
return NULL;
vma = find_vma(mm, addr);
if (likely(vma && (vma->vm_start <= addr)))
return vma;
/*
* Well, dang. We might still be successful, but only
* if we can extend a vma to do so.
*/
if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
mmap_read_unlock(mm);
return NULL;
}
/*
* We can try to upgrade the mmap lock atomically,
* in which case we can continue to use the vma
* we already looked up.
*
* Otherwise we'll have to drop the mmap lock and
* re-take it, and also look up the vma again,
* re-checking it.
*/
if (!mmap_upgrade_trylock(mm)) {
if (!upgrade_mmap_lock_carefully(mm, regs))
return NULL;
vma = find_vma(mm, addr);
if (!vma)
goto fail;
if (vma->vm_start <= addr)
goto success;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto fail;
}
if (expand_stack_locked(vma, addr))
goto fail;
success:
mmap_write_downgrade(mm);
return vma;
fail:
mmap_write_unlock(mm);
return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
#else /* CONFIG_MMU */
/*
* At least xtensa ends up having protection faults even with no
* MMU.. No stack expansion, at least.
*/
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
unsigned long addr, struct pt_regs *regs)
{
struct vm_area_struct *vma;
mmap_read_lock(mm);
vma = vma_lookup(mm, addr);
if (!vma)
mmap_read_unlock(mm);
return vma;
}
#endif /* CONFIG_MMU */