2019-05-19 13:08:55 +01:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2008-07-25 19:44:36 -07:00
|
|
|
#include <linux/mm.h>
|
2006-01-08 01:01:43 -08:00
|
|
|
#include <linux/slab.h>
|
|
|
|
|
#include <linux/string.h>
|
2014-04-07 15:37:26 -07:00
|
|
|
#include <linux/compiler.h>
|
2011-10-16 02:01:52 -04:00
|
|
|
#include <linux/export.h>
|
2006-03-24 03:18:42 -08:00
|
|
|
#include <linux/err.h>
|
2008-07-26 15:22:28 -07:00
|
|
|
#include <linux/sched.h>
|
2017-02-08 18:51:29 +01:00
|
|
|
#include <linux/sched/mm.h>
|
2019-07-16 16:30:54 -07:00
|
|
|
#include <linux/sched/signal.h>
|
2017-02-08 18:51:37 +01:00
|
|
|
#include <linux/sched/task_stack.h>
|
2012-05-30 20:17:35 -04:00
|
|
|
#include <linux/security.h>
|
2013-02-22 16:34:35 -08:00
|
|
|
#include <linux/swap.h>
|
2013-02-22 16:34:37 -08:00
|
|
|
#include <linux/swapops.h>
|
2025-01-11 15:07:40 +08:00
|
|
|
#include <linux/sysctl.h>
|
2013-11-12 15:08:31 -08:00
|
|
|
#include <linux/mman.h>
|
|
|
|
|
#include <linux/hugetlb.h>
|
2014-05-06 14:02:53 -04:00
|
|
|
#include <linux/vmalloc.h>
|
2017-02-24 14:58:22 -08:00
|
|
|
#include <linux/userfaultfd_k.h>
|
2019-09-23 15:38:37 -07:00
|
|
|
#include <linux/elf.h>
|
2019-09-23 15:38:47 -07:00
|
|
|
#include <linux/elf-randomize.h>
|
|
|
|
|
#include <linux/personality.h>
|
2019-09-23 15:38:37 -07:00
|
|
|
#include <linux/random.h>
|
2019-09-23 15:38:47 -07:00
|
|
|
#include <linux/processor.h>
|
|
|
|
|
#include <linux/sizes.h>
|
|
|
|
|
#include <linux/compat.h>
|
2025-03-12 08:38:47 +01:00
|
|
|
#include <linux/fsnotify.h>
|
2025-07-14 09:16:52 -04:00
|
|
|
#include <linux/page_idle.h>
|
2013-11-12 15:08:31 -08:00
|
|
|
|
2016-12-24 11:46:01 -08:00
|
|
|
#include <linux/uaccess.h>
|
2006-01-08 01:01:43 -08:00
|
|
|
|
2024-06-12 12:59:18 -07:00
|
|
|
#include <kunit/visibility.h>
|
|
|
|
|
|
mm: nommu: sort mm->mmap list properly
When I was reading nommu code, I found that it handles the vma list/tree
in an unusual way. IIUC, because there can be more than one
identical/overlapped vmas in the list/tree, it sorts the tree more
strictly and does a linear search on the tree. But this isn't applied to
the list (i.e. the list could be constructed in a different order than
the tree so that we can't use the list when finding the first vma in that
order).
Since inserting/sorting a vma in the tree and link is done at the same
time, we can easily construct both of them in the same order. And linear
searching on the tree could be more costly than doing it on the list, it
can be converted to use the list.
Also, after the commit 297c5eee3724 ("mm: make the vma list be doubly
linked") made the list be doubly linked, there were a couple of code need
to be fixed to construct the list properly.
Patch 1/6 is a preparation. It maintains the list sorted same as the tree
and construct doubly-linked list properly. Patch 2/6 is a simple
optimization for the vma deletion. Patch 3/6 and 4/6 convert tree
traversal to list traversal and the rest are simple fixes and cleanups.
This patch:
@vma added into @mm should be sorted by start addr, end addr and VMA
struct addr in that order because we may get identical VMAs in the @mm.
However this was true only for the rbtree, not for the list.
This patch fixes this by remembering 'rb_prev' during the tree traversal
like find_vma_prepare() does and linking the @vma via __vma_link_list().
After this patch, we can iterate the whole VMAs in correct order simply by
using @mm->mmap list.
[akpm@linux-foundation.org: avoid duplicating __vma_link_list()]
Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-05-24 17:11:22 -07:00
|
|
|
#include "internal.h"
|
2022-05-09 18:20:47 -07:00
|
|
|
#include "swap.h"
|
mm: nommu: sort mm->mmap list properly
When I was reading nommu code, I found that it handles the vma list/tree
in an unusual way. IIUC, because there can be more than one
identical/overlapped vmas in the list/tree, it sorts the tree more
strictly and does a linear search on the tree. But this isn't applied to
the list (i.e. the list could be constructed in a different order than
the tree so that we can't use the list when finding the first vma in that
order).
Since inserting/sorting a vma in the tree and link is done at the same
time, we can easily construct both of them in the same order. And linear
searching on the tree could be more costly than doing it on the list, it
can be converted to use the list.
Also, after the commit 297c5eee3724 ("mm: make the vma list be doubly
linked") made the list be doubly linked, there were a couple of code need
to be fixed to construct the list properly.
Patch 1/6 is a preparation. It maintains the list sorted same as the tree
and construct doubly-linked list properly. Patch 2/6 is a simple
optimization for the vma deletion. Patch 3/6 and 4/6 convert tree
traversal to list traversal and the rest are simple fixes and cleanups.
This patch:
@vma added into @mm should be sorted by start addr, end addr and VMA
struct addr in that order because we may get identical VMAs in the @mm.
However this was true only for the rbtree, not for the list.
This patch fixes this by remembering 'rb_prev' during the tree traversal
like find_vma_prepare() does and linking the @vma via __vma_link_list().
After this patch, we can iterate the whole VMAs in correct order simply by
using @mm->mmap list.
[akpm@linux-foundation.org: avoid duplicating __vma_link_list()]
Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-05-24 17:11:22 -07:00
|
|
|
|
2015-02-13 14:36:24 -08:00
|
|
|
/**
 * kfree_const - free memory unless it lives in .rodata
 * @x: pointer to the memory
 *
 * A pointer for which is_kernel_rodata() is true is left untouched;
 * any other pointer is handed to kfree().
 */
void kfree_const(const void *x)
{
	/* Nothing to release for objects in the read-only data section. */
	if (is_kernel_rodata((unsigned long)x))
		return;

	kfree(x);
}
EXPORT_SYMBOL(kfree_const);
|
|
|
|
|
|
2006-01-08 01:01:43 -08:00
|
|
|
/**
|
2024-10-07 22:49:10 +08:00
|
|
|
* __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated.
|
|
|
|
|
* @s: The data to copy
|
|
|
|
|
* @len: The size of the data, not including the NUL terminator
|
2006-01-08 01:01:43 -08:00
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
2019-03-05 15:48:42 -08:00
|
|
|
*
|
2024-10-07 22:49:10 +08:00
|
|
|
* Return: newly allocated copy of @s with NUL-termination or %NULL in
|
|
|
|
|
* case of error
|
2006-01-08 01:01:43 -08:00
|
|
|
*/
|
2024-10-07 22:49:10 +08:00
|
|
|
static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp)
|
2006-01-08 01:01:43 -08:00
|
|
|
{
|
|
|
|
|
char *buf;
|
|
|
|
|
|
2024-10-07 22:49:10 +08:00
|
|
|
/* '+1' for the NUL terminator */
|
|
|
|
|
buf = kmalloc_track_caller(len + 1, gfp);
|
|
|
|
|
if (!buf)
|
2006-01-08 01:01:43 -08:00
|
|
|
return NULL;
|
|
|
|
|
|
2024-10-07 22:49:10 +08:00
|
|
|
memcpy(buf, s, len);
|
|
|
|
|
/* Ensure the buf is always NUL-terminated, regardless of @s. */
|
|
|
|
|
buf[len] = '\0';
|
2006-01-08 01:01:43 -08:00
|
|
|
return buf;
|
|
|
|
|
}
|
2024-10-07 22:49:10 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* kstrdup - allocate space for and copy an existing string
|
|
|
|
|
* @s: the string to duplicate
|
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
|
|
|
*
|
|
|
|
|
* Return: newly allocated copy of @s or %NULL in case of error
|
|
|
|
|
*/
|
|
|
|
|
noinline
|
|
|
|
|
char *kstrdup(const char *s, gfp_t gfp)
|
|
|
|
|
{
|
|
|
|
|
return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL;
|
|
|
|
|
}
|
2006-01-08 01:01:43 -08:00
|
|
|
EXPORT_SYMBOL(kstrdup);
|
2006-03-24 03:18:42 -08:00
|
|
|
|
2015-02-13 14:36:24 -08:00
|
|
|
/**
|
|
|
|
|
* kstrdup_const - conditionally duplicate an existing const string
|
|
|
|
|
* @s: the string to duplicate
|
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
|
|
|
*
|
2020-10-15 20:07:39 -07:00
|
|
|
* Note: Strings allocated by kstrdup_const should be freed by kfree_const and
|
|
|
|
|
* must not be passed to krealloc().
|
2019-03-05 15:48:42 -08:00
|
|
|
*
|
|
|
|
|
* Return: source string if it is in .rodata section otherwise
|
|
|
|
|
* fallback to kstrdup.
|
2015-02-13 14:36:24 -08:00
|
|
|
*/
|
|
|
|
|
const char *kstrdup_const(const char *s, gfp_t gfp)
|
|
|
|
|
{
|
|
|
|
|
if (is_kernel_rodata((unsigned long)s))
|
|
|
|
|
return s;
|
|
|
|
|
|
|
|
|
|
return kstrdup(s, gfp);
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(kstrdup_const);
|
|
|
|
|
|
2007-07-17 18:37:02 -07:00
|
|
|
/**
|
|
|
|
|
* kstrndup - allocate space for and copy an existing string
|
|
|
|
|
* @s: the string to duplicate
|
|
|
|
|
* @max: read at most @max chars from @s
|
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
2017-07-04 17:25:02 +01:00
|
|
|
*
|
|
|
|
|
* Note: Use kmemdup_nul() instead if the size is known exactly.
|
2019-03-05 15:48:42 -08:00
|
|
|
*
|
|
|
|
|
* Return: newly allocated copy of @s or %NULL in case of error
|
2007-07-17 18:37:02 -07:00
|
|
|
*/
|
|
|
|
|
char *kstrndup(const char *s, size_t max, gfp_t gfp)
|
|
|
|
|
{
|
2024-10-07 22:49:10 +08:00
|
|
|
return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL;
|
2007-07-17 18:37:02 -07:00
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(kstrndup);
|
|
|
|
|
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller than len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplication bugs
done exactly because of diverged lengths:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-30 23:27:20 -07:00
|
|
|
/**
|
|
|
|
|
* kmemdup - duplicate region of memory
|
|
|
|
|
*
|
|
|
|
|
* @src: memory region to duplicate
|
|
|
|
|
* @len: memory region length
|
|
|
|
|
* @gfp: GFP mask to use
|
2019-03-05 15:48:42 -08:00
|
|
|
*
|
2022-12-21 22:42:45 +08:00
|
|
|
* Return: newly allocated copy of @src or %NULL in case of error,
|
|
|
|
|
* result is physically contiguous. Use kfree() to free.
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller that len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lenghts:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-30 23:27:20 -07:00
|
|
|
*/
|
2024-03-21 09:36:47 -07:00
|
|
|
void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller that len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lenghts:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-30 23:27:20 -07:00
|
|
|
{
|
|
|
|
|
void *p;
|
|
|
|
|
|
2024-03-21 09:36:47 -07:00
|
|
|
p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller that len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lenghts:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-30 23:27:20 -07:00
|
|
|
if (p)
|
|
|
|
|
memcpy(p, src, len);
|
|
|
|
|
return p;
|
|
|
|
|
}
|
2024-03-21 09:36:47 -07:00
|
|
|
EXPORT_SYMBOL(kmemdup_noprof);
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller than len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplication bugs
done exactly because of diverged lengths:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-30 23:27:20 -07:00
|
|
|
|
2023-10-17 10:53:15 +05:30
|
|
|
/**
|
|
|
|
|
* kmemdup_array - duplicate a given array.
|
|
|
|
|
*
|
|
|
|
|
* @src: array to duplicate.
|
|
|
|
|
* @count: number of elements to duplicate from array.
|
2024-06-06 15:46:09 +01:00
|
|
|
* @element_size: size of each element of array.
|
2023-10-17 10:53:15 +05:30
|
|
|
* @gfp: GFP mask to use.
|
|
|
|
|
*
|
|
|
|
|
* Return: duplicated array of @src or %NULL in case of error,
|
|
|
|
|
* result is physically contiguous. Use kfree() to free.
|
|
|
|
|
*/
|
2024-06-06 15:46:09 +01:00
|
|
|
void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
|
2023-10-17 10:53:15 +05:30
|
|
|
{
|
|
|
|
|
return kmemdup(src, size_mul(element_size, count), gfp);
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(kmemdup_array);
|
|
|
|
|
|
2022-12-21 22:42:45 +08:00
|
|
|
/**
|
|
|
|
|
* kvmemdup - duplicate region of memory
|
|
|
|
|
*
|
|
|
|
|
* @src: memory region to duplicate
|
|
|
|
|
* @len: memory region length
|
|
|
|
|
* @gfp: GFP mask to use
|
|
|
|
|
*
|
|
|
|
|
* Return: newly allocated copy of @src or %NULL in case of error,
|
|
|
|
|
* result may be not physically contiguous. Use kvfree() to free.
|
|
|
|
|
*/
|
|
|
|
|
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
|
|
|
|
|
{
|
|
|
|
|
void *p;
|
|
|
|
|
|
|
|
|
|
p = kvmalloc(len, gfp);
|
|
|
|
|
if (p)
|
|
|
|
|
memcpy(p, src, len);
|
|
|
|
|
return p;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(kvmemdup);
|
|
|
|
|
|
2017-07-04 17:25:02 +01:00
|
|
|
/**
|
|
|
|
|
* kmemdup_nul - Create a NUL-terminated string from unterminated data
|
|
|
|
|
* @s: The data to stringify
|
|
|
|
|
* @len: The size of the data
|
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
2019-03-05 15:48:42 -08:00
|
|
|
*
|
|
|
|
|
* Return: newly allocated copy of @s with NUL-termination or %NULL in
|
|
|
|
|
* case of error
|
2017-07-04 17:25:02 +01:00
|
|
|
*/
|
|
|
|
|
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
|
|
|
|
|
{
|
2024-10-07 22:49:10 +08:00
|
|
|
return s ? __kmemdup_nul(s, len, gfp) : NULL;
|
2017-07-04 17:25:02 +01:00
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(kmemdup_nul);
|
|
|
|
|
|
2024-07-01 12:13:03 -07:00
|
|
|
static kmem_buckets *user_buckets __ro_after_init;
|
|
|
|
|
|
|
|
|
|
static int __init init_user_buckets(void)
|
|
|
|
|
{
|
|
|
|
|
user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL);
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
subsys_initcall(init_user_buckets);
|
|
|
|
|
|
2009-03-31 15:23:16 -07:00
|
|
|
/**
|
|
|
|
|
* memdup_user - duplicate memory region from user space
|
|
|
|
|
*
|
|
|
|
|
* @src: source address in user space
|
|
|
|
|
* @len: number of bytes to copy
|
|
|
|
|
*
|
2019-03-05 15:48:42 -08:00
|
|
|
* Return: an ERR_PTR() on failure. Result is physically
|
2018-01-07 13:06:15 -05:00
|
|
|
* contiguous, to be freed by kfree().
|
2009-03-31 15:23:16 -07:00
|
|
|
*/
|
|
|
|
|
void *memdup_user(const void __user *src, size_t len)
|
|
|
|
|
{
|
|
|
|
|
void *p;
|
|
|
|
|
|
2024-07-01 12:13:03 -07:00
|
|
|
p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN);
|
2009-03-31 15:23:16 -07:00
|
|
|
if (!p)
|
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
|
|
|
kfree(p);
|
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return p;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(memdup_user);
|
|
|
|
|
|
2018-01-07 13:06:15 -05:00
|
|
|
/**
|
|
|
|
|
* vmemdup_user - duplicate memory region from user space
|
|
|
|
|
*
|
|
|
|
|
* @src: source address in user space
|
|
|
|
|
* @len: number of bytes to copy
|
|
|
|
|
*
|
2019-03-05 15:48:42 -08:00
|
|
|
* Return: an ERR_PTR() on failure. Result may be not
|
2018-01-07 13:06:15 -05:00
|
|
|
* physically contiguous. Use kvfree() to free.
|
|
|
|
|
*/
|
|
|
|
|
void *vmemdup_user(const void __user *src, size_t len)
|
|
|
|
|
{
|
|
|
|
|
void *p;
|
|
|
|
|
|
2024-07-01 12:13:03 -07:00
|
|
|
p = kmem_buckets_valloc(user_buckets, len, GFP_USER);
|
2018-01-07 13:06:15 -05:00
|
|
|
if (!p)
|
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
|
|
|
kvfree(p);
|
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return p;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(vmemdup_user);
|
|
|
|
|
|
2018-08-23 17:00:59 -07:00
|
|
|
/**
|
2006-03-24 03:18:42 -08:00
|
|
|
* strndup_user - duplicate an existing string from user space
|
|
|
|
|
* @s: The string to duplicate
|
|
|
|
|
* @n: Maximum number of bytes to copy, including the trailing NUL.
|
2019-03-05 15:48:42 -08:00
|
|
|
*
|
2019-04-05 18:39:34 -07:00
|
|
|
* Return: newly allocated copy of @s or an ERR_PTR() in case of error
|
2006-03-24 03:18:42 -08:00
|
|
|
*/
|
|
|
|
|
char *strndup_user(const char __user *s, long n)
|
|
|
|
|
{
|
|
|
|
|
char *p;
|
|
|
|
|
long length;
|
|
|
|
|
|
|
|
|
|
length = strnlen_user(s, n);
|
|
|
|
|
|
|
|
|
|
if (!length)
|
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
|
|
|
|
|
|
if (length > n)
|
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
2010-08-09 17:18:26 -07:00
|
|
|
p = memdup_user(s, length);
|
2006-03-24 03:18:42 -08:00
|
|
|
|
2010-08-09 17:18:26 -07:00
|
|
|
if (IS_ERR(p))
|
|
|
|
|
return p;
|
2006-03-24 03:18:42 -08:00
|
|
|
|
|
|
|
|
p[length - 1] = '\0';
|
|
|
|
|
|
|
|
|
|
return p;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(strndup_user);
|
2008-07-25 19:44:36 -07:00
|
|
|
|
2015-12-24 00:06:05 -05:00
|
|
|
/**
|
|
|
|
|
* memdup_user_nul - duplicate memory region from user space and NUL-terminate
|
|
|
|
|
*
|
|
|
|
|
* @src: source address in user space
|
|
|
|
|
* @len: number of bytes to copy
|
|
|
|
|
*
|
2019-03-05 15:48:42 -08:00
|
|
|
* Return: an ERR_PTR() on failure.
|
2015-12-24 00:06:05 -05:00
|
|
|
*/
|
|
|
|
|
void *memdup_user_nul(const void __user *src, size_t len)
|
|
|
|
|
{
|
|
|
|
|
char *p;
|
|
|
|
|
|
2024-12-21 16:47:29 +09:00
|
|
|
p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN);
|
2015-12-24 00:06:05 -05:00
|
|
|
if (!p)
|
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
|
|
|
kfree(p);
|
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
|
}
|
|
|
|
|
p[len] = '\0';
|
|
|
|
|
|
|
|
|
|
return p;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(memdup_user_nul);
|
|
|
|
|
|
procfs: mark thread stack correctly in proc/<pid>/maps
Stack for a new thread is mapped by userspace code and passed via
sys_clone. This memory is currently seen as anonymous in
/proc/<pid>/maps, which makes it difficult to ascertain which mappings
are being used for thread stacks. This patch uses the individual task
stack pointers to determine which vmas are actually thread stacks.
For a multithreaded program like the following:
#include <pthread.h>
void *thread_main(void *foo)
{
while(1);
}
int main()
{
pthread_t t;
pthread_create(&t, NULL, thread_main, NULL);
pthread_join(t, NULL);
}
proc/PID/maps looks like the following:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Here, one could guess that 7f8a44492000-7f8a44c92000 is a stack since
the earlier vma that has no permissions (7f8a44e3d000-7f8a4503d000) but
that is not always a reliable way to find out which vma is a thread
stack. Also, /proc/PID/maps and /proc/PID/task/TID/maps have the same
content.
With this patch in place, /proc/PID/task/TID/maps are treated as 'maps
as the task would see it' and hence, only the vma that that task uses as
stack is marked as [stack]. All other 'stack' vmas are marked as
anonymous memory. /proc/PID/maps acts as a thread group level view,
where all thread stack vmas are marked as [stack:TID] where TID is the
process ID of the task that uses that vma as stack, while the process
stack is marked as [stack].
So /proc/PID/maps will look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Thus marking all vmas that are used as stacks by the threads in the
thread group along with the process stack. The task level maps will
however look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
where only the vma that is being used as a stack by *that* task is
marked as [stack].
Analogous changes have been made to /proc/PID/smaps,
/proc/PID/numa_maps, /proc/PID/task/TID/smaps and
/proc/PID/task/TID/numa_maps. Relevant snippets from smaps and
numa_maps:
[siddhesh@localhost ~ ]$ pgrep a.out
1441
[siddhesh@localhost ~ ]$ cat /proc/1441/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/smaps | grep "\[stack"
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/numa_maps | grep "stack"
7f8a44492000 default stack:1442 anon=2 dirty=2 N0=2
7fff6273a000 default stack anon=3 dirty=3 N0=3
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/numa_maps | grep "stack"
7f8a44492000 default stack anon=2 dirty=2 N0=2
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/numa_maps | grep "stack"
7fff6273a000 default stack anon=3 dirty=3 N0=3
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix build]
Signed-off-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:34:04 -07:00
|
|
|
/* Check if the vma is being used as a stack by this task */
|
2025-09-01 22:50:15 +02:00
|
|
|
int vma_is_stack_for_current(const struct vm_area_struct *vma)
|
procfs: mark thread stack correctly in proc/<pid>/maps
Stack for a new thread is mapped by userspace code and passed via
sys_clone. This memory is currently seen as anonymous in
/proc/<pid>/maps, which makes it difficult to ascertain which mappings
are being used for thread stacks. This patch uses the individual task
stack pointers to determine which vmas are actually thread stacks.
For a multithreaded program like the following:
#include <pthread.h>
void *thread_main(void *foo)
{
while(1);
}
int main()
{
pthread_t t;
pthread_create(&t, NULL, thread_main, NULL);
pthread_join(t, NULL);
}
proc/PID/maps looks like the following:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Here, one could guess that 7f8a44492000-7f8a44c92000 is a stack since
the earlier vma that has no permissions (7f8a44e3d000-7f8a4503d000) but
that is not always a reliable way to find out which vma is a thread
stack. Also, /proc/PID/maps and /proc/PID/task/TID/maps has the same
content.
With this patch in place, /proc/PID/task/TID/maps are treated as 'maps
as the task would see it' and hence, only the vma that that task uses as
stack is marked as [stack]. All other 'stack' vmas are marked as
anonymous memory. /proc/PID/maps acts as a thread group level view,
where all thread stack vmas are marked as [stack:TID] where TID is the
process ID of the task that uses that vma as stack, while the process
stack is marked as [stack].
So /proc/PID/maps will look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Thus marking all vmas that are used as stacks by the threads in the
thread group along with the process stack. The task level maps will
however like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
where only the vma that is being used as a stack by *that* task is
marked as [stack].
Analogous changes have been made to /proc/PID/smaps,
/proc/PID/numa_maps, /proc/PID/task/TID/smaps and
/proc/PID/task/TID/numa_maps. Relevant snippets from smaps and
numa_maps:
[siddhesh@localhost ~ ]$ pgrep a.out
1441
[siddhesh@localhost ~ ]$ cat /proc/1441/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/smaps | grep "\[stack"
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/numa_maps | grep "stack"
7f8a44492000 default stack:1442 anon=2 dirty=2 N0=2
7fff6273a000 default stack anon=3 dirty=3 N0=3
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/numa_maps | grep "stack"
7f8a44492000 default stack anon=2 dirty=2 N0=2
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/numa_maps | grep "stack"
7fff6273a000 default stack anon=3 dirty=3 N0=3
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix build]
Signed-off-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:34:04 -07:00
|
|
|
{
|
2016-09-30 10:58:58 -07:00
|
|
|
struct task_struct * __maybe_unused t = current;
|
|
|
|
|
|
procfs: mark thread stack correctly in proc/<pid>/maps
Stack for a new thread is mapped by userspace code and passed via
sys_clone. This memory is currently seen as anonymous in
/proc/<pid>/maps, which makes it difficult to ascertain which mappings
are being used for thread stacks. This patch uses the individual task
stack pointers to determine which vmas are actually thread stacks.
For a multithreaded program like the following:
#include <pthread.h>
void *thread_main(void *foo)
{
while(1);
}
int main()
{
pthread_t t;
pthread_create(&t, NULL, thread_main, NULL);
pthread_join(t, NULL);
}
proc/PID/maps looks like the following:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Here, one could guess that 7f8a44492000-7f8a44c92000 is a stack since
the earlier vma that has no permissions (7f8a44e3d000-7f8a4503d000) but
that is not always a reliable way to find out which vma is a thread
stack. Also, /proc/PID/maps and /proc/PID/task/TID/maps has the same
content.
With this patch in place, /proc/PID/task/TID/maps are treated as 'maps
as the task would see it' and hence, only the vma that that task uses as
stack is marked as [stack]. All other 'stack' vmas are marked as
anonymous memory. /proc/PID/maps acts as a thread group level view,
where all thread stack vmas are marked as [stack:TID] where TID is the
process ID of the task that uses that vma as stack, while the process
stack is marked as [stack].
So /proc/PID/maps will look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Thus marking all vmas that are used as stacks by the threads in the
thread group along with the process stack. The task level maps will
however like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
where only the vma that is being used as a stack by *that* task is
marked as [stack].
Analogous changes have been made to /proc/PID/smaps,
/proc/PID/numa_maps, /proc/PID/task/TID/smaps and
/proc/PID/task/TID/numa_maps. Relevant snippets from smaps and
numa_maps:
[siddhesh@localhost ~ ]$ pgrep a.out
1441
[siddhesh@localhost ~ ]$ cat /proc/1441/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/smaps | grep "\[stack"
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/numa_maps | grep "stack"
7f8a44492000 default stack:1442 anon=2 dirty=2 N0=2
7fff6273a000 default stack anon=3 dirty=3 N0=3
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/numa_maps | grep "stack"
7f8a44492000 default stack anon=2 dirty=2 N0=2
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/numa_maps | grep "stack"
7fff6273a000 default stack anon=3 dirty=3 N0=3
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix build]
Signed-off-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:34:04 -07:00
|
|
|
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-14 15:09:33 +02:00
|
|
|
/*
|
|
|
|
|
* Change backing file, only valid to use during initial VMA setup.
|
|
|
|
|
*/
|
|
|
|
|
void vma_set_file(struct vm_area_struct *vma, struct file *file)
|
|
|
|
|
{
|
|
|
|
|
/* Changing an anonymous vma with this is illegal */
|
|
|
|
|
get_file(file);
|
|
|
|
|
swap(vma->vm_file, file);
|
|
|
|
|
fput(file);
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(vma_set_file);
|
|
|
|
|
|
2019-09-23 15:38:37 -07:00
|
|
|
#ifndef STACK_RND_MASK
|
|
|
|
|
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
unsigned long randomize_stack_top(unsigned long stack_top)
|
|
|
|
|
{
|
|
|
|
|
unsigned long random_variable = 0;
|
|
|
|
|
|
|
|
|
|
if (current->flags & PF_RANDOMIZE) {
|
|
|
|
|
random_variable = get_random_long();
|
|
|
|
|
random_variable &= STACK_RND_MASK;
|
|
|
|
|
random_variable <<= PAGE_SHIFT;
|
|
|
|
|
}
|
|
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
|
|
|
|
return PAGE_ALIGN(stack_top) + random_variable;
|
|
|
|
|
#else
|
|
|
|
|
return PAGE_ALIGN(stack_top) - random_variable;
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-14 13:59:30 +02:00
|
|
|
/**
|
|
|
|
|
* randomize_page - Generate a random, page aligned address
|
|
|
|
|
* @start: The smallest acceptable address the caller will take.
|
|
|
|
|
* @range: The size of the area, starting at @start, within which the
|
|
|
|
|
* random address must fall.
|
|
|
|
|
*
|
|
|
|
|
* If @start + @range would overflow, @range is capped.
|
|
|
|
|
*
|
|
|
|
|
* NOTE: Historical use of randomize_range, which this replaces, presumed that
|
|
|
|
|
* @start was already page aligned. We now align it regardless.
|
|
|
|
|
*
|
|
|
|
|
* Return: A page aligned address within [start, start + range). On error,
|
|
|
|
|
* @start is returned.
|
|
|
|
|
*/
|
|
|
|
|
unsigned long randomize_page(unsigned long start, unsigned long range)
|
|
|
|
|
{
|
|
|
|
|
if (!PAGE_ALIGNED(start)) {
|
|
|
|
|
range -= PAGE_ALIGN(start) - start;
|
|
|
|
|
start = PAGE_ALIGN(start);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (start > ULONG_MAX - range)
|
|
|
|
|
range = ULONG_MAX - start;
|
|
|
|
|
|
|
|
|
|
range >>= PAGE_SHIFT;
|
|
|
|
|
|
|
|
|
|
if (range == 0)
|
|
|
|
|
return start;
|
|
|
|
|
|
|
|
|
|
return start + (get_random_long() % range << PAGE_SHIFT);
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-23 15:38:47 -07:00
|
|
|
#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
|
2022-04-09 19:17:26 +02:00
|
|
|
/*
 * Weak default: pick a random page-aligned address at or above mm->brk.
 * The window is SZ_32M for 32-bit tasks (non-64-bit kernels or compat
 * tasks) and SZ_1G otherwise.
 */
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range = SZ_1G;

	/* Is the current task 32bit ? */
	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
		range = SZ_32M;

	return randomize_page(mm->brk, range);
}
|
|
|
|
|
|
2019-09-23 15:38:47 -07:00
|
|
|
unsigned long arch_mmap_rnd(void)
|
|
|
|
|
{
|
|
|
|
|
unsigned long rnd;
|
|
|
|
|
|
|
|
|
|
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
|
|
|
|
|
if (is_compat_task())
|
|
|
|
|
rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
|
|
|
|
|
else
|
|
|
|
|
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
|
|
|
|
|
rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
|
|
|
|
|
|
|
|
|
|
return rnd << PAGE_SHIFT;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-01 22:50:15 +02:00
|
|
|
static int mmap_is_legacy(const struct rlimit *rlim_stack)
|
2019-09-23 15:38:47 -07:00
|
|
|
{
|
|
|
|
|
if (current->personality & ADDR_COMPAT_LAYOUT)
|
|
|
|
|
return 1;
|
|
|
|
|
|
2023-08-19 00:53:28 +02:00
|
|
|
/* On parisc the stack always grows up - so a unlimited stack should
|
|
|
|
|
* not be an indicator to use the legacy memory layout. */
|
|
|
|
|
if (rlim_stack->rlim_cur == RLIM_INFINITY &&
|
|
|
|
|
!IS_ENABLED(CONFIG_STACK_GROWSUP))
|
2019-09-23 15:38:47 -07:00
|
|
|
return 1;
|
|
|
|
|
|
|
|
|
|
return sysctl_legacy_va_layout;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Leave enough space between the mmap area and the stack to honour ulimit in
|
|
|
|
|
* the face of randomisation.
|
|
|
|
|
*/
|
|
|
|
|
#define MIN_GAP (SZ_128M)
|
|
|
|
|
#define MAX_GAP (STACK_TOP / 6 * 5)
|
|
|
|
|
|
2025-09-01 22:50:17 +02:00
|
|
|
static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack)
|
2019-09-23 15:38:47 -07:00
|
|
|
{
|
2023-11-13 11:12:57 +01:00
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
|
|
|
|
/*
|
|
|
|
|
* For an upwards growing stack the calculation is much simpler.
|
|
|
|
|
* Memory for the maximum stack size is reserved at the top of the
|
|
|
|
|
* task. mmap_base starts directly below the stack and grows
|
|
|
|
|
* downwards.
|
|
|
|
|
*/
|
|
|
|
|
return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
|
|
|
|
|
#else
|
2019-09-23 15:38:47 -07:00
|
|
|
unsigned long gap = rlim_stack->rlim_cur;
|
|
|
|
|
unsigned long pad = stack_guard_gap;
|
|
|
|
|
|
|
|
|
|
/* Account for stack randomization if necessary */
|
|
|
|
|
if (current->flags & PF_RANDOMIZE)
|
|
|
|
|
pad += (STACK_RND_MASK << PAGE_SHIFT);
|
|
|
|
|
|
|
|
|
|
/* Values close to RLIM_INFINITY can overflow. */
|
|
|
|
|
if (gap + pad > gap)
|
|
|
|
|
gap += pad;
|
|
|
|
|
|
2024-08-03 15:46:41 +08:00
|
|
|
if (gap < MIN_GAP && MIN_GAP < MAX_GAP)
|
2019-09-23 15:38:47 -07:00
|
|
|
gap = MIN_GAP;
|
|
|
|
|
else if (gap > MAX_GAP)
|
|
|
|
|
gap = MAX_GAP;
|
|
|
|
|
|
|
|
|
|
return PAGE_ALIGN(STACK_TOP - gap - rnd);
|
2023-11-13 11:12:57 +01:00
|
|
|
#endif
|
2019-09-23 15:38:47 -07:00
|
|
|
}
|
|
|
|
|
|
2025-09-01 22:50:17 +02:00
|
|
|
void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
|
2019-09-23 15:38:47 -07:00
|
|
|
{
|
|
|
|
|
unsigned long random_factor = 0UL;
|
|
|
|
|
|
|
|
|
|
if (current->flags & PF_RANDOMIZE)
|
|
|
|
|
random_factor = arch_mmap_rnd();
|
|
|
|
|
|
|
|
|
|
if (mmap_is_legacy(rlim_stack)) {
|
|
|
|
|
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
|
2025-08-12 16:44:11 +01:00
|
|
|
mm_flags_clear(MMF_TOPDOWN, mm);
|
2019-09-23 15:38:47 -07:00
|
|
|
} else {
|
|
|
|
|
mm->mmap_base = mmap_base(random_factor, rlim_stack);
|
2025-08-12 16:44:11 +01:00
|
|
|
mm_flags_set(MMF_TOPDOWN, mm);
|
2019-09-23 15:38:47 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
|
2025-09-01 22:50:17 +02:00
|
|
|
void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
|
2008-07-25 19:44:36 -07:00
|
|
|
{
|
|
|
|
|
mm->mmap_base = TASK_UNMAPPED_BASE;
|
2025-08-12 16:44:11 +01:00
|
|
|
mm_flags_clear(MMF_TOPDOWN, mm);
|
2008-07-25 19:44:36 -07:00
|
|
|
}
|
|
|
|
|
#endif
|
2024-06-19 13:25:17 -07:00
|
|
|
#ifdef CONFIG_MMU
|
2024-06-12 12:59:18 -07:00
|
|
|
EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
|
2024-06-19 13:25:17 -07:00
|
|
|
#endif
|
2008-08-12 17:52:52 -05:00
|
|
|
|
2019-07-16 16:30:54 -07:00
|
|
|
/**
|
|
|
|
|
* __account_locked_vm - account locked pages to an mm's locked_vm
|
|
|
|
|
* @mm: mm to account against
|
|
|
|
|
* @pages: number of pages to account
|
|
|
|
|
* @inc: %true if @pages should be considered positive, %false if not
|
|
|
|
|
* @task: task used to check RLIMIT_MEMLOCK
|
|
|
|
|
* @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
|
|
|
|
|
*
|
|
|
|
|
* Assumes @task and @mm are valid (i.e. at least one reference on each), and
|
2020-06-08 21:33:54 -07:00
|
|
|
* that mmap_lock is held as writer.
|
2019-07-16 16:30:54 -07:00
|
|
|
*
|
|
|
|
|
* Return:
|
|
|
|
|
* * 0 on success
|
|
|
|
|
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
|
|
|
|
|
*/
|
|
|
|
|
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
|
2025-09-01 22:50:15 +02:00
|
|
|
const struct task_struct *task, bool bypass_rlim)
|
2019-07-16 16:30:54 -07:00
|
|
|
{
|
|
|
|
|
unsigned long locked_vm, limit;
|
|
|
|
|
int ret = 0;
|
|
|
|
|
|
2020-06-08 21:33:44 -07:00
|
|
|
mmap_assert_write_locked(mm);
|
2019-07-16 16:30:54 -07:00
|
|
|
|
|
|
|
|
locked_vm = mm->locked_vm;
|
|
|
|
|
if (inc) {
|
|
|
|
|
if (!bypass_rlim) {
|
|
|
|
|
limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
|
|
|
|
|
if (locked_vm + pages > limit)
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
}
|
|
|
|
|
if (!ret)
|
|
|
|
|
mm->locked_vm = locked_vm + pages;
|
|
|
|
|
} else {
|
|
|
|
|
WARN_ON_ONCE(pages > locked_vm);
|
|
|
|
|
mm->locked_vm = locked_vm - pages;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
|
|
|
|
|
(void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
|
|
|
|
|
locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
|
|
|
|
|
ret ? " - exceeded" : "");
|
|
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL_GPL(__account_locked_vm);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* account_locked_vm - account locked pages to an mm's locked_vm
|
|
|
|
|
* @mm: mm to account against, may be NULL
|
|
|
|
|
* @pages: number of pages to account
|
|
|
|
|
* @inc: %true if @pages should be considered positive, %false if not
|
|
|
|
|
*
|
|
|
|
|
* Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
|
|
|
|
|
*
|
|
|
|
|
* Return:
|
|
|
|
|
* * 0 on success, or if mm is NULL
|
|
|
|
|
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
|
|
|
|
|
*/
|
|
|
|
|
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
if (pages == 0 || !mm)
|
|
|
|
|
return 0;
|
|
|
|
|
|
2020-06-08 21:33:25 -07:00
|
|
|
mmap_write_lock(mm);
|
2019-07-16 16:30:54 -07:00
|
|
|
ret = __account_locked_vm(mm, pages, inc, current,
|
|
|
|
|
capable(CAP_IPC_LOCK));
|
2020-06-08 21:33:25 -07:00
|
|
|
mmap_write_unlock(mm);
|
2019-07-16 16:30:54 -07:00
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL_GPL(account_locked_vm);
|
|
|
|
|
|
2012-05-30 20:17:35 -04:00
|
|
|
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
|
|
|
|
|
unsigned long len, unsigned long prot,
|
2016-05-23 16:25:30 -07:00
|
|
|
unsigned long flag, unsigned long pgoff)
|
2012-05-30 20:17:35 -04:00
|
|
|
{
|
2025-10-03 16:52:36 +01:00
|
|
|
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
|
2012-05-30 20:17:35 -04:00
|
|
|
unsigned long ret;
|
|
|
|
|
struct mm_struct *mm = current->mm;
|
2013-02-22 16:32:47 -08:00
|
|
|
unsigned long populate;
|
2017-02-24 14:58:22 -08:00
|
|
|
LIST_HEAD(uf);
|
2012-05-30 20:17:35 -04:00
|
|
|
|
|
|
|
|
ret = security_mmap_file(file, prot, flag);
|
2025-03-12 08:38:47 +01:00
|
|
|
if (!ret)
|
2025-10-03 16:52:36 +01:00
|
|
|
ret = fsnotify_mmap_perm(file, prot, off, len);
|
2012-05-30 20:17:35 -04:00
|
|
|
if (!ret) {
|
2020-06-08 21:33:25 -07:00
|
|
|
if (mmap_write_lock_killable(mm))
|
2016-05-23 16:25:30 -07:00
|
|
|
return -EINTR;
|
2023-06-12 17:10:30 -07:00
|
|
|
ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
|
2020-08-06 23:23:37 -07:00
|
|
|
&uf);
|
2020-06-08 21:33:25 -07:00
|
|
|
mmap_write_unlock(mm);
|
2017-02-24 14:58:22 -08:00
|
|
|
userfaultfd_unmap_complete(mm, &uf);
|
2013-02-22 16:32:47 -08:00
|
|
|
if (populate)
|
|
|
|
|
mm_populate(ret, populate);
|
2012-05-30 20:17:35 -04:00
|
|
|
}
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2024-12-12 11:31:52 +00:00
|
|
|
/*
|
|
|
|
|
* Perform a userland memory mapping into the current process address space. See
|
|
|
|
|
* the comment for do_mmap() for more details on this operation in general.
|
|
|
|
|
*
|
|
|
|
|
* This differs from do_mmap() in that:
|
|
|
|
|
*
|
|
|
|
|
* a. An offset parameter is provided rather than pgoff, which is both checked
|
|
|
|
|
* for overflow and page alignment.
|
|
|
|
|
* b. mmap locking is performed on the caller's behalf.
|
|
|
|
|
* c. Userfaultfd unmap events and memory population are handled.
|
|
|
|
|
*
|
|
|
|
|
* This means that this function performs essentially the same work as if
|
|
|
|
|
* userland were invoking mmap (2).
|
|
|
|
|
*
|
|
|
|
|
* Returns either an error, or the address at which the requested mapping has
|
|
|
|
|
* been performed.
|
|
|
|
|
*/
|
2012-05-30 20:17:35 -04:00
|
|
|
unsigned long vm_mmap(struct file *file, unsigned long addr,
|
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
|
unsigned long flag, unsigned long offset)
|
|
|
|
|
{
|
|
|
|
|
if (unlikely(offset + PAGE_ALIGN(len) < offset))
|
|
|
|
|
return -EINVAL;
|
2015-11-05 18:46:46 -08:00
|
|
|
if (unlikely(offset_in_page(offset)))
|
2012-05-30 20:17:35 -04:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
2016-05-23 16:25:30 -07:00
|
|
|
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
|
2012-05-30 20:17:35 -04:00
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(vm_mmap);
|
|
|
|
|
|
2022-03-08 04:47:22 -05:00
|
|
|
/**
|
|
|
|
|
* __vmalloc_array - allocate memory for a virtually contiguous array.
|
|
|
|
|
* @n: number of elements.
|
|
|
|
|
* @size: element size.
|
|
|
|
|
* @flags: the type of memory to allocate (see kmalloc).
|
|
|
|
|
*/
|
2024-03-21 09:36:52 -07:00
|
|
|
void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
	size_t total;

	/* Allocate only if n * size fits in a size_t. */
	if (likely(!check_mul_overflow(n, size, &total)))
		return __vmalloc_noprof(total, flags);

	return NULL;
}
|
2024-03-21 09:36:52 -07:00
|
|
|
EXPORT_SYMBOL(__vmalloc_array_noprof);
|
2022-03-08 04:47:22 -05:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* vmalloc_array - allocate memory for a virtually contiguous array.
|
|
|
|
|
* @n: number of elements.
|
|
|
|
|
* @size: element size.
|
|
|
|
|
*/
|
2024-03-21 09:36:52 -07:00
|
|
|
void *vmalloc_array_noprof(size_t n, size_t size)
|
2022-03-08 04:47:22 -05:00
|
|
|
{
|
2024-05-31 13:53:50 -07:00
|
|
|
return __vmalloc_array_noprof(n, size, GFP_KERNEL);
|
2022-03-08 04:47:22 -05:00
|
|
|
}
|
2024-03-21 09:36:52 -07:00
|
|
|
EXPORT_SYMBOL(vmalloc_array_noprof);
|
2022-03-08 04:47:22 -05:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* __vcalloc - allocate and zero memory for a virtually contiguous array.
|
|
|
|
|
* @n: number of elements.
|
|
|
|
|
* @size: element size.
|
|
|
|
|
* @flags: the type of memory to allocate (see kmalloc).
|
|
|
|
|
*/
|
2024-03-21 09:36:52 -07:00
|
|
|
void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
{
	/* Add __GFP_ZERO to the caller's flags. */
	flags |= __GFP_ZERO;

	return __vmalloc_array_noprof(n, size, flags);
}
|
2024-03-21 09:36:52 -07:00
|
|
|
EXPORT_SYMBOL(__vcalloc_noprof);
|
2022-03-08 04:47:22 -05:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* vcalloc - allocate and zero memory for a virtually contiguous array.
|
|
|
|
|
* @n: number of elements.
|
|
|
|
|
* @size: element size.
|
|
|
|
|
*/
|
2024-03-21 09:36:52 -07:00
|
|
|
void *vcalloc_noprof(size_t n, size_t size)
|
2022-03-08 04:47:22 -05:00
|
|
|
{
|
2024-05-31 13:53:50 -07:00
|
|
|
return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO);
|
2022-03-08 04:47:22 -05:00
|
|
|
}
|
2024-03-21 09:36:52 -07:00
|
|
|
EXPORT_SYMBOL(vcalloc_noprof);
|
2022-03-08 04:47:22 -05:00
|
|
|
|
2024-10-05 21:01:14 +01:00
|
|
|
struct anon_vma *folio_anon_vma(const struct folio *folio)
|
2015-04-15 16:14:53 -07:00
|
|
|
{
|
2021-05-07 11:17:34 -04:00
|
|
|
unsigned long mapping = (unsigned long)folio->mapping;
|
2015-04-15 16:14:53 -07:00
|
|
|
|
2025-07-04 12:25:20 +02:00
|
|
|
if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
|
2015-04-15 16:14:53 -07:00
|
|
|
return NULL;
|
2025-07-04 12:25:20 +02:00
|
|
|
return (void *)(mapping - FOLIO_MAPPING_ANON);
|
2015-04-15 16:14:53 -07:00
|
|
|
}
|
|
|
|
|
|
mm/util: Add folio_mapping() and folio_file_mapping()
These are the folio equivalent of page_mapping() and page_file_mapping().
Add an out-of-line page_mapping() wrapper around folio_mapping()
in order to prevent the page_folio() call from bloating every caller
of page_mapping(). Adjust page_file_mapping() and page_mapping_file()
to use folios internally. Rename __page_file_mapping() to
swapcache_mapping() and change it to take a folio.
This ends up saving 122 bytes of text overall. folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes. The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()). Most of these appear
to be a slight change in gcc's register allocation decisions, which allow:
48 8b 56 08 mov 0x8(%rsi),%rdx
48 8d 42 ff lea -0x1(%rdx),%rax
83 e2 01 and $0x1,%edx
48 0f 44 c6 cmove %rsi,%rax
to become:
48 8b 46 08 mov 0x8(%rsi),%rax
48 8d 78 ff lea -0x1(%rax),%rdi
a8 01 test $0x1,%al
48 0f 44 fe cmove %rsi,%rdi
for a reduction of a single byte. Once the NFS client is converted to
use folios, this entire sequence will disappear.
Also add folio_mapping() documentation.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
2020-12-10 10:55:05 -05:00
|
|
|
/**
|
|
|
|
|
* folio_mapping - Find the mapping where this folio is stored.
|
|
|
|
|
* @folio: The folio.
|
|
|
|
|
*
|
|
|
|
|
* For folios which are in the page cache, return the mapping that this
|
|
|
|
|
* page belongs to. Folios in the swap cache return the swap mapping
|
|
|
|
|
* this page is stored in (which is different from the mapping for the
|
|
|
|
|
* swap file or swap device where the data is stored).
|
|
|
|
|
*
|
|
|
|
|
* You can call this for folios which aren't in the swap cache or page
|
|
|
|
|
* cache and it will return NULL.
|
|
|
|
|
*/
|
2025-09-01 22:50:15 +02:00
|
|
|
struct address_space *folio_mapping(const struct folio *folio)
|
2013-02-22 16:34:35 -08:00
|
|
|
{
|
2016-01-15 16:52:07 -08:00
|
|
|
struct address_space *mapping;
|
|
|
|
|
|
2014-01-14 17:56:40 -08:00
|
|
|
/* This happens if someone calls flush_dcache_page on slab page */
|
mm/util: Add folio_mapping() and folio_file_mapping()
These are the folio equivalent of page_mapping() and page_file_mapping().
Add an out-of-line page_mapping() wrapper around folio_mapping()
in order to prevent the page_folio() call from bloating every caller
of page_mapping(). Adjust page_file_mapping() and page_mapping_file()
to use folios internally. Rename __page_file_mapping() to
swapcache_mapping() and change it to take a folio.
This ends up saving 122 bytes of text overall. folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes. The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()). Most of these appear
to be a slight change in gcc's register allocation decisions, which allow:
48 8b 56 08 mov 0x8(%rsi),%rdx
48 8d 42 ff lea -0x1(%rdx),%rax
83 e2 01 and $0x1,%edx
48 0f 44 c6 cmove %rsi,%rax
to become:
48 8b 46 08 mov 0x8(%rsi),%rax
48 8d 78 ff lea -0x1(%rax),%rdi
a8 01 test $0x1,%al
48 0f 44 fe cmove %rsi,%rdi
for a reduction of a single byte. Once the NFS client is converted to
use folios, this entire sequence will disappear.
Also add folio_mapping() documentation.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
2020-12-10 10:55:05 -05:00
|
|
|
if (unlikely(folio_test_slab(folio)))
|
2014-01-14 17:56:40 -08:00
|
|
|
return NULL;
|
|
|
|
|
|
mm/util: Add folio_mapping() and folio_file_mapping()
These are the folio equivalent of page_mapping() and page_file_mapping().
Add an out-of-line page_mapping() wrapper around folio_mapping()
in order to prevent the page_folio() call from bloating every caller
of page_mapping(). Adjust page_file_mapping() and page_mapping_file()
to use folios internally. Rename __page_file_mapping() to
swapcache_mapping() and change it to take a folio.
This ends up saving 122 bytes of text overall. folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes. The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()). Most of these appear
to be a slight change in gcc's register allocation decisions, which allow:
48 8b 56 08 mov 0x8(%rsi),%rdx
48 8d 42 ff lea -0x1(%rdx),%rax
83 e2 01 and $0x1,%edx
48 0f 44 c6 cmove %rsi,%rax
to become:
48 8b 46 08 mov 0x8(%rsi),%rax
48 8d 78 ff lea -0x1(%rax),%rdi
a8 01 test $0x1,%al
48 0f 44 fe cmove %rsi,%rdi
for a reduction of a single byte. Once the NFS client is converted to
use folios, this entire sequence will disappear.
Also add folio_mapping() documentation.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
2020-12-10 10:55:05 -05:00
|
|
|
if (unlikely(folio_test_swapcache(folio)))
|
2023-08-21 18:08:48 +02:00
|
|
|
return swap_address_space(folio->swap);
|
2015-04-15 16:14:53 -07:00
|
|
|
|
mm/util: Add folio_mapping() and folio_file_mapping()
These are the folio equivalent of page_mapping() and page_file_mapping().
Add an out-of-line page_mapping() wrapper around folio_mapping()
in order to prevent the page_folio() call from bloating every caller
of page_mapping(). Adjust page_file_mapping() and page_mapping_file()
to use folios internally. Rename __page_file_mapping() to
swapcache_mapping() and change it to take a folio.
This ends up saving 122 bytes of text overall. folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes. The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()). Most of these appear
to be a slight change in gcc's register allocation decisions, which allow:
48 8b 56 08 mov 0x8(%rsi),%rdx
48 8d 42 ff lea -0x1(%rdx),%rax
83 e2 01 and $0x1,%edx
48 0f 44 c6 cmove %rsi,%rax
to become:
48 8b 46 08 mov 0x8(%rsi),%rax
48 8d 78 ff lea -0x1(%rax),%rdi
a8 01 test $0x1,%al
48 0f 44 fe cmove %rsi,%rdi
for a reduction of a single byte. Once the NFS client is converted to
use folios, this entire sequence will disappear.
Also add folio_mapping() documentation.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
2020-12-10 10:55:05 -05:00
|
|
|
mapping = folio->mapping;
|
2025-07-04 12:25:20 +02:00
|
|
|
if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS)
|
2015-04-15 16:14:53 -07:00
|
|
|
return NULL;
|
mm: migrate: support non-lru movable page migration
We have allowed migration for only LRU pages until now and it was enough
to make high-order pages. But recently, embedded system(e.g., webOS,
android) uses lots of non-movable pages(e.g., zram, GPU memory) so we
have seen several reports about troubles of small high-order allocation.
For fixing the problem, there were several efforts (e,g,. enhance
compaction algorithm, SLUB fallback to 0-order page, reserved memory,
vmalloc and so on) but if there are lots of non-movable pages in system,
their solutions are void in the long run.
So, this patch is to support facility to change non-movable pages with
movable. For the feature, this patch introduces functions related to
migration to address_space_operations as well as some page flags.
If a driver want to make own pages movable, it should define three
functions which are function pointers of struct
address_space_operations.
1. bool (*isolate_page) (struct page *page, isolate_mode_t mode);
What VM expects on isolate_page function of driver is to return *true*
if driver isolates page successfully. On returing true, VM marks the
page as PG_isolated so concurrent isolation in several CPUs skip the
page for isolation. If a driver cannot isolate the page, it should
return *false*.
Once page is successfully isolated, VM uses page.lru fields so driver
shouldn't expect to preserve values in that fields.
2. int (*migratepage) (struct address_space *mapping,
struct page *newpage, struct page *oldpage, enum migrate_mode);
After isolation, VM calls migratepage of driver with isolated page. The
function of migratepage is to move content of the old page to new page
and set up fields of struct page newpage. Keep in mind that you should
indicate to the VM the oldpage is no longer movable via
__ClearPageMovable() under page_lock if you migrated the oldpage
successfully and returns 0. If driver cannot migrate the page at the
moment, driver can return -EAGAIN. On -EAGAIN, VM will retry page
migration in a short time because VM interprets -EAGAIN as "temporal
migration failure". On returning any error except -EAGAIN, VM will give
up the page migration without retrying in this time.
Driver shouldn't touch page.lru field VM using in the functions.
3. void (*putback_page)(struct page *);
If migration fails on isolated page, VM should return the isolated page
to the driver so VM calls driver's putback_page with migration failed
page. In this function, driver should put the isolated page back to the
own data structure.
4. non-lru movable page flags
There are two page flags for supporting non-lru movable page.
* PG_movable
Driver should use the below function to make page movable under
page_lock.
void __SetPageMovable(struct page *page, struct address_space *mapping)
It needs argument of address_space for registering migration family
functions which will be called by VM. Exactly speaking, PG_movable is
not a real flag of struct page. Rather than, VM reuses page->mapping's
lower bits to represent it.
#define PAGE_MAPPING_MOVABLE 0x2
page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
so driver shouldn't access page->mapping directly. Instead, driver
should use page_mapping which mask off the low two bits of page->mapping
so it can get right struct address_space.
For testing of non-lru movable page, VM supports __PageMovable function.
However, it doesn't guarantee to identify non-lru movable page because
page->mapping field is unified with other variables in struct page. As
well, if driver releases the page after isolation by VM, page->mapping
doesn't have stable value although it has PAGE_MAPPING_MOVABLE (Look at
__ClearPageMovable). But __PageMovable is cheap to catch whether page
is LRU or non-lru movable once the page has been isolated. Because LRU
pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
good for just peeking to test non-lru movable pages before more
expensive checking with lock_page in pfn scanning to select victim.
For guaranteeing non-lru movable page, VM provides PageMovable function.
Unlike __PageMovable, PageMovable functions validates page->mapping and
mapping->a_ops->isolate_page under lock_page. The lock_page prevents
sudden destroying of page->mapping.
Driver using __SetPageMovable should clear the flag via
__ClearMovablePage under page_lock before the releasing the page.
* PG_isolated
To prevent concurrent isolation among several CPUs, VM marks isolated
page as PG_isolated under lock_page. So if a CPU encounters PG_isolated
non-lru movable page, it can skip it. Driver doesn't need to manipulate
the flag because VM will set/clear it automatically. Keep in mind that
if driver sees PG_isolated page, it means the page have been isolated by
VM so it shouldn't touch page.lru field. PG_isolated is alias with
PG_reclaim flag so driver shouldn't use the flag for own purpose.
[opensource.ganesh@gmail.com: mm/compaction: remove local variable is_lru]
Link: http://lkml.kernel.org/r/20160618014841.GA7422@leo-test
Link: http://lkml.kernel.org/r/1464736881-24886-3-git-send-email-minchan@kernel.org
Signed-off-by: Gioh Kim <gi-oh.kim@profitbricks.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: John Einar Reitan <john.reitan@foss.arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-07-26 15:23:05 -07:00
|
|
|
|
2022-06-07 15:38:48 -04:00
|
|
|
return mapping;
|
2013-02-22 16:34:35 -08:00
|
|
|
}
|
mm/util: Add folio_mapping() and folio_file_mapping()
These are the folio equivalent of page_mapping() and page_file_mapping().
Add an out-of-line page_mapping() wrapper around folio_mapping()
in order to prevent the page_folio() call from bloating every caller
of page_mapping(). Adjust page_file_mapping() and page_mapping_file()
to use folios internally. Rename __page_file_mapping() to
swapcache_mapping() and change it to take a folio.
This ends up saving 122 bytes of text overall. folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes. The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()). Most of these appear
to be a slight change in gcc's register allocation decisions, which allow:
48 8b 56 08 mov 0x8(%rsi),%rdx
48 8d 42 ff lea -0x1(%rdx),%rax
83 e2 01 and $0x1,%edx
48 0f 44 c6 cmove %rsi,%rax
to become:
48 8b 46 08 mov 0x8(%rsi),%rax
48 8d 78 ff lea -0x1(%rax),%rdi
a8 01 test $0x1,%al
48 0f 44 fe cmove %rsi,%rdi
for a reduction of a single byte. Once the NFS client is converted to
use folios, this entire sequence will disappear.
Also add folio_mapping() documentation.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
2020-12-10 10:55:05 -05:00
|
|
|
EXPORT_SYMBOL(folio_mapping);
|
2013-02-22 16:34:35 -08:00
|
|
|
|
2021-05-07 15:05:06 -04:00
|
|
|
/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * Every page of @src is copied into the page at the same index of @dst.
 * The caller must already have checked that @dst is at least as large as
 * @src.
 *
 * Context: An order-0 folio can be copied from atomic context.  For a
 * larger folio the function may sleep, because it calls cond_resched()
 * between pages.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
	long nr_pages = folio_nr_pages(src);
	long idx;

	/*
	 * cond_resched() sits in the loop's step slot so that it runs
	 * between pages only, never after the last one has been copied.
	 */
	for (idx = 0; ; cond_resched()) {
		copy_highpage(folio_page(dst, idx), folio_page(src, idx));
		if (++idx == nr_pages)
			return;
	}
}
EXPORT_SYMBOL(folio_copy);
|
2021-07-12 16:32:07 +01:00
|
|
|
|
2024-06-26 16:53:24 +08:00
|
|
|
int folio_mc_copy(struct folio *dst, struct folio *src)
|
|
|
|
|
{
|
|
|
|
|
long nr = folio_nr_pages(src);
|
|
|
|
|
long i = 0;
|
|
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
|
if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i)))
|
|
|
|
|
return -EHWPOISON;
|
|
|
|
|
if (++i == nr)
|
|
|
|
|
break;
|
|
|
|
|
cond_resched();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(folio_mc_copy);
|
|
|
|
|
|
2016-03-17 14:18:50 -07:00
|
|
|
/* Active overcommit policy (one of the OVERCOMMIT_* values). */
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;

/*
 * Percentage of non-hugetlb RAM that vm_commit_limit() counts when
 * sysctl_overcommit_kbytes is zero.
 */
static int sysctl_overcommit_ratio __read_mostly = 50;

/*
 * Absolute commit limit in kbytes; when non-zero vm_commit_limit() uses
 * it instead of sysctl_overcommit_ratio.
 */
static unsigned long sysctl_overcommit_kbytes __read_mostly;

int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;

/*
 * Upper bound, in kbytes, on the per-mm amount that
 * __vm_enough_memory() holds back from the commit limit.
 */
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */

/*
 * Amount, in kbytes, that __vm_enough_memory() holds back from callers
 * without cap_sys_admin.
 */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
|
|
|
|
|
|
2025-01-11 15:07:40 +08:00
|
|
|
#ifdef CONFIG_SYSCTL
|
|
|
|
|
|
|
|
|
|
/*
 * sysctl handler for overcommit_ratio.
 *
 * Delegates parsing/formatting to proc_dointvec().  A successful write
 * additionally zeroes sysctl_overcommit_kbytes, so that only one of the
 * two limits (ratio or kbytes) is in effect at a time.
 *
 * Returns whatever proc_dointvec() returned.
 */
static int overcommit_ratio_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int err = proc_dointvec(table, write, buffer, lenp, ppos);

	/* Reads and failed writes leave the kbytes limit untouched. */
	if (err || !write)
		return err;

	sysctl_overcommit_kbytes = 0;
	return err;
}
|
|
|
|
|
|
2020-08-06 23:23:15 -07:00
|
|
|
static void sync_overcommit_as(struct work_struct *dummy)
|
|
|
|
|
{
|
|
|
|
|
percpu_counter_sync(&vm_committed_as);
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-11 15:07:40 +08:00
|
|
|
static int overcommit_policy_handler(const struct ctl_table *table, int write,
|
|
|
|
|
void *buffer, size_t *lenp, loff_t *ppos)
|
2020-08-06 23:23:15 -07:00
|
|
|
{
|
|
|
|
|
struct ctl_table t;
|
mm: fix uninitialized use in overcommit_policy_handler
We get an unexpected value of /proc/sys/vm/overcommit_memory after
running the following program:
int main()
{
int fd = open("/proc/sys/vm/overcommit_memory", O_RDWR);
write(fd, "1", 1);
write(fd, "2", 1);
close(fd);
}
write(fd, "2", 1) will pass *ppos = 1 to proc_dointvec_minmax.
proc_dointvec_minmax will return 0 without setting new_policy.
t.data = &new_policy;
ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos)
-->do_proc_dointvec
-->__do_proc_dointvec
if (write) {
if (proc_first_pos_non_zero_ignore(ppos, table))
goto out;
sysctl_overcommit_memory = new_policy;
so sysctl_overcommit_memory will be set to an uninitialized value.
Check whether new_policy has been changed by proc_dointvec_minmax.
Link: https://lkml.kernel.org/r/20210923020524.13289-1-chenjun102@huawei.com
Fixes: 56f3547bfa4d ("mm: adjust vm_committed_as_batch according to vm overcommit policy")
Signed-off-by: Chen Jun <chenjun102@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Feng Tang <feng.tang@intel.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Rui Xiang <rui.xiang@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-24 15:44:06 -07:00
|
|
|
int new_policy = -1;
|
2020-08-06 23:23:15 -07:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* The deviation of sync_overcommit_as could be big with loose policy
|
|
|
|
|
* like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
|
|
|
|
|
* strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
|
2021-05-04 18:38:35 -07:00
|
|
|
* with the strict "NEVER", and to avoid possible race condition (even
|
2020-08-06 23:23:15 -07:00
|
|
|
* though user usually won't too frequently do the switching to policy
|
|
|
|
|
* OVERCOMMIT_NEVER), the switch is done in the following order:
|
|
|
|
|
* 1. changing the batch
|
|
|
|
|
* 2. sync percpu count on each CPU
|
|
|
|
|
* 3. switch the policy
|
|
|
|
|
*/
|
|
|
|
|
if (write) {
|
|
|
|
|
t = *table;
|
|
|
|
|
t.data = &new_policy;
|
|
|
|
|
ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
|
mm: fix uninitialized use in overcommit_policy_handler
We get an unexpected value of /proc/sys/vm/overcommit_memory after
running the following program:
int main()
{
int fd = open("/proc/sys/vm/overcommit_memory", O_RDWR);
write(fd, "1", 1);
write(fd, "2", 1);
close(fd);
}
write(fd, "2", 1) will pass *ppos = 1 to proc_dointvec_minmax.
proc_dointvec_minmax will return 0 without setting new_policy.
t.data = &new_policy;
ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos)
-->do_proc_dointvec
-->__do_proc_dointvec
if (write) {
if (proc_first_pos_non_zero_ignore(ppos, table))
goto out;
sysctl_overcommit_memory = new_policy;
so sysctl_overcommit_memory will be set to an uninitialized value.
Check whether new_policy has been changed by proc_dointvec_minmax.
Link: https://lkml.kernel.org/r/20210923020524.13289-1-chenjun102@huawei.com
Fixes: 56f3547bfa4d ("mm: adjust vm_committed_as_batch according to vm overcommit policy")
Signed-off-by: Chen Jun <chenjun102@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Feng Tang <feng.tang@intel.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Rui Xiang <rui.xiang@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-24 15:44:06 -07:00
|
|
|
if (ret || new_policy == -1)
|
2020-08-06 23:23:15 -07:00
|
|
|
return ret;
|
|
|
|
|
|
|
|
|
|
mm_compute_batch(new_policy);
|
|
|
|
|
if (new_policy == OVERCOMMIT_NEVER)
|
|
|
|
|
schedule_on_each_cpu(sync_overcommit_as);
|
|
|
|
|
sysctl_overcommit_memory = new_policy;
|
|
|
|
|
} else {
|
|
|
|
|
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-11 15:07:40 +08:00
|
|
|
/*
 * sysctl handler for overcommit_kbytes.
 *
 * Delegates parsing/formatting to proc_doulongvec_minmax().  A
 * successful write additionally zeroes sysctl_overcommit_ratio, the
 * mirror image of what overcommit_ratio_handler() does.
 *
 * Returns whatever proc_doulongvec_minmax() returned.
 */
static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

	/* Reads and failed writes leave the ratio untouched. */
	if (err || !write)
		return err;

	sysctl_overcommit_ratio = 0;
	return err;
}
|
|
|
|
|
|
2025-01-11 15:07:40 +08:00
|
|
|
static const struct ctl_table util_sysctl_table[] = {
|
|
|
|
|
{
|
|
|
|
|
.procname = "overcommit_memory",
|
|
|
|
|
.data = &sysctl_overcommit_memory,
|
|
|
|
|
.maxlen = sizeof(sysctl_overcommit_memory),
|
|
|
|
|
.mode = 0644,
|
|
|
|
|
.proc_handler = overcommit_policy_handler,
|
|
|
|
|
.extra1 = SYSCTL_ZERO,
|
|
|
|
|
.extra2 = SYSCTL_TWO,
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.procname = "overcommit_ratio",
|
|
|
|
|
.data = &sysctl_overcommit_ratio,
|
|
|
|
|
.maxlen = sizeof(sysctl_overcommit_ratio),
|
|
|
|
|
.mode = 0644,
|
|
|
|
|
.proc_handler = overcommit_ratio_handler,
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.procname = "overcommit_kbytes",
|
|
|
|
|
.data = &sysctl_overcommit_kbytes,
|
|
|
|
|
.maxlen = sizeof(sysctl_overcommit_kbytes),
|
|
|
|
|
.mode = 0644,
|
|
|
|
|
.proc_handler = overcommit_kbytes_handler,
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.procname = "user_reserve_kbytes",
|
|
|
|
|
.data = &sysctl_user_reserve_kbytes,
|
|
|
|
|
.maxlen = sizeof(sysctl_user_reserve_kbytes),
|
|
|
|
|
.mode = 0644,
|
|
|
|
|
.proc_handler = proc_doulongvec_minmax,
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.procname = "admin_reserve_kbytes",
|
|
|
|
|
.data = &sysctl_admin_reserve_kbytes,
|
|
|
|
|
.maxlen = sizeof(sysctl_admin_reserve_kbytes),
|
|
|
|
|
.mode = 0644,
|
|
|
|
|
.proc_handler = proc_doulongvec_minmax,
|
|
|
|
|
},
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/*
 * Register util_sysctl_table under the "vm" sysctl directory.  Hooked up
 * as a subsys_initcall; always reports success (0).
 */
static int __init init_vm_util_sysctls(void)
{
	register_sysctl_init("vm", util_sysctl_table);

	return 0;
}
subsys_initcall(init_vm_util_sysctls);
|
|
|
|
|
#endif /* CONFIG_SYSCTL */
|
|
|
|
|
|
2013-11-12 15:08:31 -08:00
|
|
|
/*
|
|
|
|
|
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
|
|
|
|
|
*/
|
|
|
|
|
unsigned long vm_commit_limit(void)
|
|
|
|
|
{
|
2014-01-21 15:49:14 -08:00
|
|
|
unsigned long allowed;
|
|
|
|
|
|
|
|
|
|
if (sysctl_overcommit_kbytes)
|
|
|
|
|
allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
|
else
|
2018-12-28 00:34:29 -08:00
|
|
|
allowed = ((totalram_pages() - hugetlb_total_pages())
|
2014-01-21 15:49:14 -08:00
|
|
|
* sysctl_overcommit_ratio / 100);
|
|
|
|
|
allowed += total_swap_pages;
|
|
|
|
|
|
|
|
|
|
return allowed;
|
2013-11-12 15:08:31 -08:00
|
|
|
}
|
|
|
|
|
|
2016-03-17 14:18:50 -07:00
|
|
|
/*
 * Keep vm_committed_as in a cacheline of its own, not shared with other
 * variables: it can be updated frequently by several CPUs.
 */
|
|
|
|
|
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* The global memory commitment made in the system can be a metric
|
|
|
|
|
* that can be used to drive ballooning decisions when Linux is hosted
|
|
|
|
|
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
|
|
|
|
|
* balancing memory across competing virtual machines that are hosted.
|
|
|
|
|
* Several metrics drive this policy engine including the guest reported
|
|
|
|
|
* memory commitment.
|
2020-08-06 23:23:07 -07:00
|
|
|
*
|
|
|
|
|
* The time cost of this is very low for small platforms, and for big
|
|
|
|
|
* platform like a 2S/36C/72T Skylake server, in worst case where
|
|
|
|
|
* vm_committed_as's spinlock is under severe contention, the time cost
|
|
|
|
|
* could be about 30~40 microseconds.
|
2016-03-17 14:18:50 -07:00
|
|
|
*/
|
|
|
|
|
unsigned long vm_memory_committed(void)
|
|
|
|
|
{
|
2020-08-06 23:23:07 -07:00
|
|
|
return percpu_counter_sum_positive(&vm_committed_as);
|
2016-03-17 14:18:50 -07:00
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL_GPL(vm_memory_committed);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Check that a process has enough memory to allocate a new virtual
|
|
|
|
|
* mapping. 0 means there is enough memory for the allocation to
|
|
|
|
|
* succeed and -ENOMEM implies there is not.
|
|
|
|
|
*
|
|
|
|
|
* We currently support three overcommit policies, which are set via the
|
2022-06-27 09:00:26 +03:00
|
|
|
* vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst
|
2016-03-17 14:18:50 -07:00
|
|
|
*
|
|
|
|
|
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
|
|
|
|
|
* Additional code 2002 Jul 20 by Robert Love.
|
|
|
|
|
*
|
|
|
|
|
* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
|
|
|
|
|
*
|
|
|
|
|
* Note this is a helper function intended to be used by LSMs which
|
|
|
|
|
* wish to use this logic.
|
|
|
|
|
*/
|
2025-09-01 22:50:15 +02:00
|
|
|
int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin)
|
2016-03-17 14:18:50 -07:00
|
|
|
{
|
2019-05-13 17:21:50 -07:00
|
|
|
long allowed;
|
2024-02-22 19:46:17 +00:00
|
|
|
unsigned long bytes_failed;
|
2016-03-17 14:18:50 -07:00
|
|
|
|
|
|
|
|
vm_acct_memory(pages);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Sometimes we want to use more memory than we have
|
|
|
|
|
*/
|
|
|
|
|
if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
|
2019-05-13 17:21:50 -07:00
|
|
|
if (pages > totalram_pages() + total_swap_pages)
|
2016-03-17 14:18:50 -07:00
|
|
|
goto error;
|
2019-05-13 17:21:50 -07:00
|
|
|
return 0;
|
2016-03-17 14:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
allowed = vm_commit_limit();
|
|
|
|
|
/*
|
|
|
|
|
* Reserve some for root
|
|
|
|
|
*/
|
|
|
|
|
if (!cap_sys_admin)
|
|
|
|
|
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Don't let a single process grow so big a user can't recover
|
|
|
|
|
*/
|
|
|
|
|
if (mm) {
|
2019-05-13 17:21:50 -07:00
|
|
|
long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
|
|
2016-03-17 14:18:50 -07:00
|
|
|
allowed -= min_t(long, mm->total_vm / 32, reserve);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
|
|
|
|
|
return 0;
|
|
|
|
|
error:
|
2024-02-22 19:46:17 +00:00
|
|
|
bytes_failed = pages << PAGE_SHIFT;
|
|
|
|
|
pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
|
|
|
|
|
__func__, current->pid, current->comm, bytes_failed);
|
2016-03-17 14:18:50 -07:00
|
|
|
vm_unacct_memory(pages);
|
|
|
|
|
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
}
|
|
|
|
|
|
2014-02-11 10:11:59 -08:00
|
|
|
/**
|
|
|
|
|
* get_cmdline() - copy the cmdline value to a buffer.
|
|
|
|
|
* @task: the task whose cmdline value to copy.
|
|
|
|
|
* @buffer: the buffer to copy to.
|
|
|
|
|
* @buflen: the length of the buffer. Larger cmdline values are truncated
|
|
|
|
|
* to this length.
|
2019-03-05 15:48:42 -08:00
|
|
|
*
|
|
|
|
|
* Return: the size of the cmdline field copied. Note that the copy does
|
2014-02-11 10:11:59 -08:00
|
|
|
* not guarantee an ending NULL byte.
|
|
|
|
|
*/
|
|
|
|
|
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
|
|
|
|
|
{
|
|
|
|
|
int res = 0;
|
|
|
|
|
unsigned int len;
|
|
|
|
|
struct mm_struct *mm = get_task_mm(task);
|
2016-01-20 15:01:05 -08:00
|
|
|
unsigned long arg_start, arg_end, env_start, env_end;
|
2014-02-11 10:11:59 -08:00
|
|
|
if (!mm)
|
|
|
|
|
goto out;
|
|
|
|
|
if (!mm->arg_end)
|
|
|
|
|
goto out_mm; /* Shh! No looking before we're done */
|
|
|
|
|
|
2019-05-31 22:30:19 -07:00
|
|
|
spin_lock(&mm->arg_lock);
|
2016-01-20 15:01:05 -08:00
|
|
|
arg_start = mm->arg_start;
|
|
|
|
|
arg_end = mm->arg_end;
|
|
|
|
|
env_start = mm->env_start;
|
|
|
|
|
env_end = mm->env_end;
|
2019-05-31 22:30:19 -07:00
|
|
|
spin_unlock(&mm->arg_lock);
|
2016-01-20 15:01:05 -08:00
|
|
|
|
|
|
|
|
len = arg_end - arg_start;
|
2014-02-11 10:11:59 -08:00
|
|
|
|
|
|
|
|
if (len > buflen)
|
|
|
|
|
len = buflen;
|
|
|
|
|
|
2016-10-13 01:20:20 +01:00
|
|
|
res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
|
2014-02-11 10:11:59 -08:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If the nul at the end of args has been overwritten, then
|
|
|
|
|
* assume application is using setproctitle(3).
|
|
|
|
|
*/
|
|
|
|
|
if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
|
|
|
|
|
len = strnlen(buffer, res);
|
|
|
|
|
if (len < res) {
|
|
|
|
|
res = len;
|
|
|
|
|
} else {
|
2016-01-20 15:01:05 -08:00
|
|
|
len = env_end - env_start;
|
2014-02-11 10:11:59 -08:00
|
|
|
if (len > buflen - res)
|
|
|
|
|
len = buflen - res;
|
2016-01-20 15:01:05 -08:00
|
|
|
res += access_process_vm(task, env_start,
|
2016-10-13 01:20:20 +01:00
|
|
|
buffer+res, len,
|
|
|
|
|
FOLL_FORCE);
|
2014-02-11 10:11:59 -08:00
|
|
|
res = strnlen(buffer, res);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
out_mm:
|
|
|
|
|
mmput(mm);
|
|
|
|
|
out:
|
|
|
|
|
return res;
|
|
|
|
|
}
|
2019-09-23 15:38:19 -07:00
|
|
|
|
2019-11-27 09:53:44 +00:00
|
|
|
/*
 * memcmp_pages - compare the contents of two pages.
 *
 * Maps both pages with kmap_local_page(), compares PAGE_SIZE bytes with
 * memcmp() and returns its result (0 when the contents are identical).
 * Declared __weak, i.e. this is the default definition.
 */
int __weak memcmp_pages(struct page *page1, struct page *page2)
{
	char *va1 = kmap_local_page(page1);
	char *va2 = kmap_local_page(page2);
	int diff = memcmp(va1, va2, PAGE_SIZE);

	/* Drop the mappings in the reverse order they were taken. */
	kunmap_local(va2);
	kunmap_local(va1);

	return diff;
}
|
mm: Add mem_dump_obj() to print source of memory block
There are kernel facilities such as per-CPU reference counts that give
error messages in generic handlers or callbacks, whose messages are
unenlightening. In the case of per-CPU reference-count underflow, this
is not a problem when creating a new use of this facility because in that
case the bug is almost certainly in the code implementing that new use.
However, trouble arises when deploying across many systems, which might
exercise corner cases that were not seen during development and testing.
Here, it would be really nice to get some kind of hint as to which of
several uses the underflow was caused by.
This commit therefore exposes a mem_dump_obj() function that takes
a pointer to memory (which must still be allocated if it has been
dynamically allocated) and prints available information on where that
memory came from. This pointer can reference the middle of the block as
well as the beginning of the block, as needed by things like RCU callback
functions and timer handlers that might not know where the beginning of
the memory block is. These functions and handlers can use mem_dump_obj()
to print out better hints as to where the problem might lie.
The information printed can depend on kernel configuration. For example,
the allocation return address can be printed only for slab and slub,
and even then only when the necessary debug has been enabled. For slab,
build with CONFIG_DEBUG_SLAB=y, and either use sizes with ample space
to the next power of two or use the SLAB_STORE_USER when creating the
kmem_cache structure. For slub, build with CONFIG_SLUB_DEBUG=y and
boot with slub_debug=U, or pass SLAB_STORE_USER to kmem_cache_create()
if more focused use is desired. Also for slub, use CONFIG_STACKTRACE
to enable printing of the allocation-time stack trace.
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <linux-mm@kvack.org>
Reported-by: Andrii Nakryiko <andrii@kernel.org>
[ paulmck: Convert to printing and change names per Joonsoo Kim. ]
[ paulmck: Move slab definition per Stephen Rothwell and kbuild test robot. ]
[ paulmck: Handle CONFIG_MMU=n case where vmalloc() is kmalloc(). ]
[ paulmck: Apply Vlastimil Babka feedback on slab.c kmem_provenance(). ]
[ paulmck: Extract more info from !SLUB_DEBUG per Joonsoo Kim. ]
[ paulmck: Explicitly check for small pointers per Naresh Kamboju. ]
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2020-12-07 17:41:02 -08:00
|
|
|
|
2021-01-07 13:46:11 -08:00
|
|
|
#ifdef CONFIG_PRINTK
|
mm: Add mem_dump_obj() to print source of memory block
There are kernel facilities such as per-CPU reference counts that give
error messages in generic handlers or callbacks, whose messages are
unenlightening. In the case of per-CPU reference-count underflow, this
is not a problem when creating a new use of this facility because in that
case the bug is almost certainly in the code implementing that new use.
However, trouble arises when deploying across many systems, which might
exercise corner cases that were not seen during development and testing.
Here, it would be really nice to get some kind of hint as to which of
several uses the underflow was caused by.
This commit therefore exposes a mem_dump_obj() function that takes
a pointer to memory (which must still be allocated if it has been
dynamically allocated) and prints available information on where that
memory came from. This pointer can reference the middle of the block as
well as the beginning of the block, as needed by things like RCU callback
functions and timer handlers that might not know where the beginning of
the memory block is. These functions and handlers can use mem_dump_obj()
to print out better hints as to where the problem might lie.
The information printed can depend on kernel configuration. For example,
the allocation return address can be printed only for slab and slub,
and even then only when the necessary debug has been enabled. For slab,
build with CONFIG_DEBUG_SLAB=y, and either use sizes with ample space
to the next power of two or use the SLAB_STORE_USER when creating the
kmem_cache structure. For slub, build with CONFIG_SLUB_DEBUG=y and
boot with slub_debug=U, or pass SLAB_STORE_USER to kmem_cache_create()
if more focused use is desired. Also for slub, use CONFIG_STACKTRACE
to enable printing of the allocation-time stack trace.
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <linux-mm@kvack.org>
Reported-by: Andrii Nakryiko <andrii@kernel.org>
[ paulmck: Convert to printing and change names per Joonsoo Kim. ]
[ paulmck: Move slab definition per Stephen Rothwell and kbuild test robot. ]
[ paulmck: Handle CONFIG_MMU=n case where vmalloc() is kmalloc(). ]
[ paulmck: Apply Vlastimil Babka feedback on slab.c kmem_provenance(). ]
[ paulmck: Extract more info from !SLUB_DEBUG per Joonsoo Kim. ]
[ paulmck: Explicitly check for small pointers per Naresh Kamboju. ]
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2020-12-07 17:41:02 -08:00
|
|
|
/**
|
|
|
|
|
* mem_dump_obj - Print available provenance information
|
|
|
|
|
* @object: object for which to find provenance information.
|
|
|
|
|
*
|
|
|
|
|
* This function uses pr_cont(), so that the caller is expected to have
|
|
|
|
|
* printed out whatever preamble is appropriate. The provenance information
|
|
|
|
|
* depends on the type of object and on how much debugging is enabled.
|
|
|
|
|
* For example, for a slab-cache object, the slab name is printed, and,
|
|
|
|
|
* if available, the return address and stack trace from the allocation
|
2021-03-16 16:07:11 +05:30
|
|
|
* and last free path of that object.
|
mm: Add mem_dump_obj() to print source of memory block
There are kernel facilities such as per-CPU reference counts that give
error messages in generic handlers or callbacks, whose messages are
unenlightening. In the case of per-CPU reference-count underflow, this
is not a problem when creating a new use of this facility because in that
case the bug is almost certainly in the code implementing that new use.
However, trouble arises when deploying across many systems, which might
exercise corner cases that were not seen during development and testing.
Here, it would be really nice to get some kind of hint as to which of
several uses the underflow was caused by.
This commit therefore exposes a mem_dump_obj() function that takes
a pointer to memory (which must still be allocated if it has been
dynamically allocated) and prints available information on where that
memory came from. This pointer can reference the middle of the block as
well as the beginning of the block, as needed by things like RCU callback
functions and timer handlers that might not know where the beginning of
the memory block is. These functions and handlers can use mem_dump_obj()
to print out better hints as to where the problem might lie.
The information printed can depend on kernel configuration. For example,
the allocation return address can be printed only for slab and slub,
and even then only when the necessary debug has been enabled. For slab,
build with CONFIG_DEBUG_SLAB=y, and either use sizes with ample space
to the next power of two or use the SLAB_STORE_USER when creating the
kmem_cache structure. For slub, build with CONFIG_SLUB_DEBUG=y and
boot with slub_debug=U, or pass SLAB_STORE_USER to kmem_cache_create()
if more focused use is desired. Also for slub, use CONFIG_STACKTRACE
to enable printing of the allocation-time stack trace.
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <linux-mm@kvack.org>
Reported-by: Andrii Nakryiko <andrii@kernel.org>
[ paulmck: Convert to printing and change names per Joonsoo Kim. ]
[ paulmck: Move slab definition per Stephen Rothwell and kbuild test robot. ]
[ paulmck: Handle CONFIG_MMU=n case where vmalloc() is kmalloc(). ]
[ paulmck: Apply Vlastimil Babka feedback on slab.c kmem_provenance(). ]
[ paulmck: Extract more info from !SLUB_DEBUG per Joonsoo Kim. ]
[ paulmck: Explicitly check for small pointers per Naresh Kamboju. ]
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2020-12-07 17:41:02 -08:00
|
|
|
*/
|
|
|
|
|
void mem_dump_obj(void *object)
|
|
|
|
|
{
|
2021-05-04 18:38:32 -07:00
|
|
|
const char *type;
|
|
|
|
|
|
2023-08-05 11:17:25 +08:00
|
|
|
if (kmem_dump_obj(object))
|
2020-12-08 16:13:57 -08:00
|
|
|
return;
|
2021-05-04 18:38:32 -07:00
|
|
|
|
2020-12-08 16:13:57 -08:00
|
|
|
if (vmalloc_dump_obj(object))
|
|
|
|
|
return;
|
2021-05-04 18:38:32 -07:00
|
|
|
|
rcu: dump vmalloc memory info safely
Currently, for double invoke call_rcu(), will dump rcu_head objects memory
info, if the objects is not allocated from the slab allocator, the
vmalloc_dump_obj() will be invoke and the vmap_area_lock spinlock need to
be held, since the call_rcu() can be invoked in interrupt context,
therefore, there is a possibility of spinlock deadlock scenarios.
And in Preempt-RT kernel, the rcutorture test also trigger the following
lockdep warning:
BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1, name: swapper/0
preempt_count: 1, expected: 0
RCU nest depth: 1, expected: 1
3 locks held by swapper/0/1:
#0: ffffffffb534ee80 (fullstop_mutex){+.+.}-{4:4}, at: torture_init_begin+0x24/0xa0
#1: ffffffffb5307940 (rcu_read_lock){....}-{1:3}, at: rcu_torture_init+0x1ec7/0x2370
#2: ffffffffb536af40 (vmap_area_lock){+.+.}-{3:3}, at: find_vmap_area+0x1f/0x70
irq event stamp: 565512
hardirqs last enabled at (565511): [<ffffffffb379b138>] __call_rcu_common+0x218/0x940
hardirqs last disabled at (565512): [<ffffffffb5804262>] rcu_torture_init+0x20b2/0x2370
softirqs last enabled at (399112): [<ffffffffb36b2586>] __local_bh_enable_ip+0x126/0x170
softirqs last disabled at (399106): [<ffffffffb43fef59>] inet_register_protosw+0x9/0x1d0
Preemption disabled at:
[<ffffffffb58040c3>] rcu_torture_init+0x1f13/0x2370
CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 6.5.0-rc4-rt2-yocto-preempt-rt+ #15
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x68/0xb0
dump_stack+0x14/0x20
__might_resched+0x1aa/0x280
? __pfx_rcu_torture_err_cb+0x10/0x10
rt_spin_lock+0x53/0x130
? find_vmap_area+0x1f/0x70
find_vmap_area+0x1f/0x70
vmalloc_dump_obj+0x20/0x60
mem_dump_obj+0x22/0x90
__call_rcu_common+0x5bf/0x940
? debug_smp_processor_id+0x1b/0x30
call_rcu_hurry+0x14/0x20
rcu_torture_init+0x1f82/0x2370
? __pfx_rcu_torture_leak_cb+0x10/0x10
? __pfx_rcu_torture_leak_cb+0x10/0x10
? __pfx_rcu_torture_init+0x10/0x10
do_one_initcall+0x6c/0x300
? debug_smp_processor_id+0x1b/0x30
kernel_init_freeable+0x2b9/0x540
? __pfx_kernel_init+0x10/0x10
kernel_init+0x1f/0x150
ret_from_fork+0x40/0x50
? __pfx_kernel_init+0x10/0x10
ret_from_fork_asm+0x1b/0x30
</TASK>
The previous patch fixes this by using the deadlock-safe best-effort
version of find_vm_area. However, in case of failure print the fact that
the pointer was a vmalloc pointer so that we print at least something.
Link: https://lkml.kernel.org/r/20230904180806.1002832-2-joel@joelfernandes.org
Fixes: 98f180837a89 ("mm: Make mem_dump_obj() handle vmalloc() memory")
Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reported-by: Zhen Lei <thunder.leizhen@huaweicloud.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-09-04 18:08:05 +00:00
|
|
|
if (is_vmalloc_addr(object))
|
|
|
|
|
type = "vmalloc memory";
|
|
|
|
|
else if (virt_addr_valid(object))
|
2021-05-04 18:38:32 -07:00
|
|
|
type = "non-slab/vmalloc memory";
|
|
|
|
|
else if (object == NULL)
|
|
|
|
|
type = "NULL pointer";
|
|
|
|
|
else if (object == ZERO_SIZE_PTR)
|
|
|
|
|
type = "zero-size pointer";
|
|
|
|
|
else
|
|
|
|
|
type = "non-paged memory";
|
|
|
|
|
|
|
|
|
|
pr_cont(" %s\n", type);
|
mm: Add mem_dump_obj() to print source of memory block
There are kernel facilities such as per-CPU reference counts that give
error messages in generic handlers or callbacks, whose messages are
unenlightening. In the case of per-CPU reference-count underflow, this
is not a problem when creating a new use of this facility because in that
case the bug is almost certainly in the code implementing that new use.
However, trouble arises when deploying across many systems, which might
exercise corner cases that were not seen during development and testing.
Here, it would be really nice to get some kind of hint as to which of
several uses the underflow was caused by.
This commit therefore exposes a mem_dump_obj() function that takes
a pointer to memory (which must still be allocated if it has been
dynamically allocated) and prints available information on where that
memory came from. This pointer can reference the middle of the block as
well as the beginning of the block, as needed by things like RCU callback
functions and timer handlers that might not know where the beginning of
the memory block is. These functions and handlers can use mem_dump_obj()
to print out better hints as to where the problem might lie.
The information printed can depend on kernel configuration. For example,
the allocation return address can be printed only for slab and slub,
and even then only when the necessary debug has been enabled. For slab,
build with CONFIG_DEBUG_SLAB=y, and either use sizes with ample space
to the next power of two or use the SLAB_STORE_USER when creating the
kmem_cache structure. For slub, build with CONFIG_SLUB_DEBUG=y and
boot with slub_debug=U, or pass SLAB_STORE_USER to kmem_cache_create()
if more focused use is desired. Also for slub, use CONFIG_STACKTRACE
to enable printing of the allocation-time stack trace.
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <linux-mm@kvack.org>
Reported-by: Andrii Nakryiko <andrii@kernel.org>
[ paulmck: Convert to printing and change names per Joonsoo Kim. ]
[ paulmck: Move slab definition per Stephen Rothwell and kbuild test robot. ]
[ paulmck: Handle CONFIG_MMU=n case where vmalloc() is kmalloc(). ]
[ paulmck: Apply Vlastimil Babka feedback on slab.c kmem_provenance(). ]
[ paulmck: Extract more info from !SLUB_DEBUG per Joonsoo Kim. ]
[ paulmck: Explicitly check for small pointers per Naresh Kamboju. ]
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2020-12-07 17:41:02 -08:00
|
|
|
}
|
2020-12-07 21:23:36 -08:00
|
|
|
EXPORT_SYMBOL_GPL(mem_dump_obj);
|
2021-01-07 13:46:11 -08:00
|
|
|
#endif
|
2021-06-30 18:50:14 -07:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* A driver might set a page logically offline -- PageOffline() -- and
|
|
|
|
|
* turn the page inaccessible in the hypervisor; after that, access to page
|
|
|
|
|
* content can be fatal.
|
|
|
|
|
*
|
|
|
|
|
* Some special PFN walkers -- i.e., /proc/kcore -- read content of random
|
|
|
|
|
* pages after checking PageOffline(); however, these PFN walkers can race
|
|
|
|
|
* with drivers that set PageOffline().
|
|
|
|
|
*
|
|
|
|
|
* page_offline_freeze()/page_offline_thaw() allows for a subsystem to
|
|
|
|
|
* synchronize with such drivers, achieving that a page cannot be set
|
|
|
|
|
* PageOffline() while frozen.
|
|
|
|
|
*
|
|
|
|
|
* page_offline_begin()/page_offline_end() is used by drivers that care about
|
|
|
|
|
* such races when setting a page PageOffline().
|
|
|
|
|
*/
|
|
|
|
|
/*
 * Taken for read by page_offline_freeze()/page_offline_thaw() and for write
 * by page_offline_begin()/page_offline_end().
 */
static DECLARE_RWSEM(page_offline_rwsem);
|
|
|
|
|
|
|
|
|
|
void page_offline_freeze(void)
|
|
|
|
|
{
|
|
|
|
|
down_read(&page_offline_rwsem);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void page_offline_thaw(void)
|
|
|
|
|
{
|
|
|
|
|
up_read(&page_offline_rwsem);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void page_offline_begin(void)
|
|
|
|
|
{
|
|
|
|
|
down_write(&page_offline_rwsem);
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(page_offline_begin);
|
|
|
|
|
|
|
|
|
|
void page_offline_end(void)
|
|
|
|
|
{
|
|
|
|
|
up_write(&page_offline_rwsem);
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(page_offline_end);
|
2020-12-16 11:06:33 -05:00
|
|
|
|
2023-08-02 16:13:33 +01:00
|
|
|
#ifndef flush_dcache_folio
/*
 * Fallback implementation, compiled only when no flush_dcache_folio macro
 * is already defined: call flush_dcache_page() on each page of @folio in
 * ascending index order.
 */
void flush_dcache_folio(struct folio *folio)
{
	const long nr_pages = folio_nr_pages(folio);
	long idx = 0;

	while (idx < nr_pages) {
		flush_dcache_page(folio_page(folio, idx));
		idx++;
	}
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif
|
2025-06-09 17:57:49 +01:00
|
|
|
|
mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare()
In commit bb666b7c2707 ("mm: add mmap_prepare() compatibility layer for
nested file systems") we introduced the ability for stacked drivers and
file systems to correctly invoke the f_op->mmap_prepare() handler from an
f_op->mmap() handler via a compatibility layer implemented in
compat_vma_mmap_prepare().
This populates vm_area_desc fields according to those found in the (not
yet fully initialised) VMA passed to f_op->mmap().
However this function implicitly assumes that the struct file which we are
operating upon is equal to vma->vm_file. This is not a safe assumption in
all cases.
The only really sane situation in which this matters would be something
like e.g. i915_gem_dmabuf_mmap() which invokes vfs_mmap() against
obj->base.filp:
ret = vfs_mmap(obj->base.filp, vma);
if (ret)
return ret;
And then sets the VMA's file to this, should the mmap operation succeed:
vma_set_file(vma, obj->base.filp);
That is - it is the file that is intended to back the VMA mapping.
This is not an issue currently, as so far we have only implemented
f_op->mmap_prepare() handlers for some file systems and internal mm uses,
and the only stacked f_op->mmap() operations that can be performed upon
these are those in backing_file_mmap() and coda_file_mmap(), both of which
use vma->vm_file.
However, moving forward, as we convert drivers to using
f_op->mmap_prepare(), this will become a problem.
Resolve this issue by explicitly setting desc->file to the provided file
parameter and update callers accordingly.
Callers are expected to read desc->file and update desc->vm_file - the
former will be the file provided by the caller (if stacked, this may
differ from vma->vm_file).
If the caller needs to differentiate between the two they therefore now
can.
While we are here, also provide a variant of compat_vma_mmap_prepare()
that operates against a pointer to any file_operations struct and does not
assume that the file_operations struct we are interested in is file->f_op.
This function is __compat_vma_mmap_prepare() and we invoke it from
compat_vma_mmap_prepare() so that we share code between the two functions.
This is important, because some drivers provide hooks in a separate
struct, for instance struct drm_device provides an fops field for this
purpose.
Also update the VMA selftests accordingly.
Link: https://lkml.kernel.org/r/dd0c72df8a33e8ffaa243eeb9b01010b670610e9.1756920635.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-03 18:48:42 +01:00
|
|
|
/**
|
|
|
|
|
* __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare()
|
|
|
|
|
* for details. This is the same operation, only with a specific file operations
|
|
|
|
|
* struct which may or may not be the same as vma->vm_file->f_op.
|
|
|
|
|
* @f_op: The file operations whose .mmap_prepare() hook is specified.
|
|
|
|
|
* @file: The file which backs or will back the mapping.
|
|
|
|
|
* @vma: The VMA to apply the .mmap_prepare() hook to.
|
|
|
|
|
* Returns: 0 on success or error.
|
|
|
|
|
*/
|
|
|
|
|
int __compat_vma_mmap_prepare(const struct file_operations *f_op,
|
|
|
|
|
struct file *file, struct vm_area_struct *vma)
|
|
|
|
|
{
|
|
|
|
|
struct vm_area_desc desc = {
|
|
|
|
|
.mm = vma->vm_mm,
|
|
|
|
|
.file = file,
|
|
|
|
|
.start = vma->vm_start,
|
|
|
|
|
.end = vma->vm_end,
|
|
|
|
|
|
|
|
|
|
.pgoff = vma->vm_pgoff,
|
|
|
|
|
.vm_file = vma->vm_file,
|
|
|
|
|
.vm_flags = vma->vm_flags,
|
|
|
|
|
.page_prot = vma->vm_page_prot,
|
|
|
|
|
};
|
|
|
|
|
int err;
|
|
|
|
|
|
|
|
|
|
err = f_op->mmap_prepare(&desc);
|
|
|
|
|
if (err)
|
|
|
|
|
return err;
|
|
|
|
|
set_vma_from_desc(vma, &desc);
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(__compat_vma_mmap_prepare);
|
|
|
|
|
|
2025-06-09 17:57:49 +01:00
|
|
|
/**
|
|
|
|
|
* compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
|
mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare()
In commit bb666b7c2707 ("mm: add mmap_prepare() compatibility layer for
nested file systems") we introduced the ability for stacked drivers and
file systems to correctly invoke the f_op->mmap_prepare() handler from an
f_op->mmap() handler via a compatibility layer implemented in
compat_vma_mmap_prepare().
This populates vm_area_desc fields according to those found in the (not
yet fully initialised) VMA passed to f_op->mmap().
However this function implicitly assumes that the struct file which we are
operating upon is equal to vma->vm_file. This is not a safe assumption in
all cases.
The only really sane situation in which this matters would be something
like e.g. i915_gem_dmabuf_mmap() which invokes vfs_mmap() against
obj->base.filp:
ret = vfs_mmap(obj->base.filp, vma);
if (ret)
return ret;
And then sets the VMA's file to this, should the mmap operation succeed:
vma_set_file(vma, obj->base.filp);
That is - it is the file that is intended to back the VMA mapping.
This is not an issue currently, as so far we have only implemented
f_op->mmap_prepare() handlers for some file systems and internal mm uses,
and the only stacked f_op->mmap() operations that can be performed upon
these are those in backing_file_mmap() and coda_file_mmap(), both of which
use vma->vm_file.
However, moving forward, as we convert drivers to using
f_op->mmap_prepare(), this will become a problem.
Resolve this issue by explicitly setting desc->file to the provided file
parameter and update callers accordingly.
Callers are expected to read desc->file and update desc->vm_file - the
former will be the file provided by the caller (if stacked, this may
differ from vma->vm_file).
If the caller needs to differentiate between the two they therefore now
can.
While we are here, also provide a variant of compat_vma_mmap_prepare()
that operates against a pointer to any file_operations struct and does not
assume that the file_operations struct we are interested in is file->f_op.
This function is __compat_vma_mmap_prepare() and we invoke it from
compat_vma_mmap_prepare() so that we share code between the two functions.
This is important, because some drivers provide hooks in a separate
struct, for instance struct drm_device provides an fops field for this
purpose.
Also update the VMA selftests accordingly.
Link: https://lkml.kernel.org/r/dd0c72df8a33e8ffaa243eeb9b01010b670610e9.1756920635.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-03 18:48:42 +01:00
|
|
|
* existing VMA.
|
|
|
|
|
* @file: The file which possesses an f_op->mmap_prepare() hook.
|
2025-06-09 17:57:49 +01:00
|
|
|
* @vma: The VMA to apply the .mmap_prepare() hook to.
|
|
|
|
|
*
|
|
|
|
|
* Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
|
mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare()
In commit bb666b7c2707 ("mm: add mmap_prepare() compatibility layer for
nested file systems") we introduced the ability for stacked drivers and
file systems to correctly invoke the f_op->mmap_prepare() handler from an
f_op->mmap() handler via a compatibility layer implemented in
compat_vma_mmap_prepare().
This populates vm_area_desc fields according to those found in the (not
yet fully initialised) VMA passed to f_op->mmap().
However this function implicitly assumes that the struct file which we are
operating upon is equal to vma->vm_file. This is not a safe assumption in
all cases.
The only really sane situation in which this matters would be something
like e.g. i915_gem_dmabuf_mmap() which invokes vfs_mmap() against
obj->base.filp:
ret = vfs_mmap(obj->base.filp, vma);
if (ret)
return ret;
And then sets the VMA's file to this, should the mmap operation succeed:
vma_set_file(vma, obj->base.filp);
That is - it is the file that is intended to back the VMA mapping.
This is not an issue currently, as so far we have only implemented
f_op->mmap_prepare() handlers for some file systems and internal mm uses,
and the only stacked f_op->mmap() operations that can be performed upon
these are those in backing_file_mmap() and coda_file_mmap(), both of which
use vma->vm_file.
However, moving forward, as we convert drivers to using
f_op->mmap_prepare(), this will become a problem.
Resolve this issue by explicitly setting desc->file to the provided file
parameter and update callers accordingly.
Callers are expected to read desc->file and update desc->vm_file - the
former will be the file provided by the caller (if stacked, this may
differ from vma->vm_file).
If the caller needs to differentiate between the two they therefore now
can.
While we are here, also provide a variant of compat_vma_mmap_prepare()
that operates against a pointer to any file_operations struct and does not
assume that the file_operations struct we are interested in is file->f_op.
This function is __compat_vma_mmap_prepare() and we invoke it from
compat_vma_mmap_prepare() so that we share code between the two functions.
This is important, because some drivers provide hooks in a separate
struct, for instance struct drm_device provides an fops field for this
purpose.
Also update the VMA selftests accordingly.
Link: https://lkml.kernel.org/r/dd0c72df8a33e8ffaa243eeb9b01010b670610e9.1756920635.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-03 18:48:42 +01:00
|
|
|
* stacked filesystems invoke a nested mmap hook of an underlying file.
|
2025-06-09 17:57:49 +01:00
|
|
|
*
|
|
|
|
|
* Until all filesystems are converted to use .mmap_prepare(), we must be
|
mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare()
In commit bb666b7c2707 ("mm: add mmap_prepare() compatibility layer for
nested file systems") we introduced the ability for stacked drivers and
file systems to correctly invoke the f_op->mmap_prepare() handler from an
f_op->mmap() handler via a compatibility layer implemented in
compat_vma_mmap_prepare().
This populates vm_area_desc fields according to those found in the (not
yet fully initialised) VMA passed to f_op->mmap().
However this function implicitly assumes that the struct file which we are
operating upon is equal to vma->vm_file. This is not a safe assumption in
all cases.
The only really sane situation in which this matters would be something
like e.g. i915_gem_dmabuf_mmap() which invokes vfs_mmap() against
obj->base.filp:
ret = vfs_mmap(obj->base.filp, vma);
if (ret)
return ret;
And then sets the VMA's file to this, should the mmap operation succeed:
vma_set_file(vma, obj->base.filp);
That is - it is the file that is intended to back the VMA mapping.
This is not an issue currently, as so far we have only implemented
f_op->mmap_prepare() handlers for some file systems and internal mm uses,
and the only stacked f_op->mmap() operations that can be performed upon
these are those in backing_file_mmap() and coda_file_mmap(), both of which
use vma->vm_file.
However, moving forward, as we convert drivers to using
f_op->mmap_prepare(), this will become a problem.
Resolve this issue by explicitly setting desc->file to the provided file
parameter and update callers accordingly.
Callers are expected to read desc->file and update desc->vm_file - the
former will be the file provided by the caller (if stacked, this may
differ from vma->vm_file).
If the caller needs to differentiate between the two they therefore now
can.
While we are here, also provide a variant of compat_vma_mmap_prepare()
that operates against a pointer to any file_operations struct and does not
assume that the file_operations struct we are interested in is file->f_op.
This function is __compat_vma_mmap_prepare() and we invoke it from
compat_vma_mmap_prepare() so that we share code between the two functions.
This is important, because some drivers provide hooks in a separate
struct, for instance struct drm_device provides an fops field for this
purpose.
Also update the VMA selftests accordingly.
Link: https://lkml.kernel.org/r/dd0c72df8a33e8ffaa243eeb9b01010b670610e9.1756920635.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-03 18:48:42 +01:00
|
|
|
* conservative and continue to invoke these stacked filesystems using the
|
2025-06-09 17:57:49 +01:00
|
|
|
* deprecated .mmap() hook.
|
|
|
|
|
*
|
|
|
|
|
* However we have a problem if the underlying file system possesses an
|
|
|
|
|
* .mmap_prepare() hook, as we are in a different context when we invoke the
|
|
|
|
|
* .mmap() hook, already having a VMA to deal with.
|
|
|
|
|
*
|
|
|
|
|
* compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
|
|
|
|
|
* establishes a struct vm_area_desc descriptor, passes to the underlying
|
|
|
|
|
* .mmap_prepare() hook and applies any changes performed by it.
|
|
|
|
|
*
|
|
|
|
|
* Once the conversion of filesystems is complete this function will no longer
|
|
|
|
|
* be required and will be removed.
|
|
|
|
|
*
|
|
|
|
|
* Returns: 0 on success or error.
|
|
|
|
|
*/
|
|
|
|
|
int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
|
|
|
|
|
{
|
mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare()
In commit bb666b7c2707 ("mm: add mmap_prepare() compatibility layer for
nested file systems") we introduced the ability for stacked drivers and
file systems to correctly invoke the f_op->mmap_prepare() handler from an
f_op->mmap() handler via a compatibility layer implemented in
compat_vma_mmap_prepare().
This populates vm_area_desc fields according to those found in the (not
yet fully initialised) VMA passed to f_op->mmap().
However this function implicitly assumes that the struct file which we are
operating upon is equal to vma->vm_file. This is not a safe assumption in
all cases.
The only really sane situation in which this matters would be something
like e.g. i915_gem_dmabuf_mmap() which invokes vfs_mmap() against
obj->base.filp:
ret = vfs_mmap(obj->base.filp, vma);
if (ret)
return ret;
And then sets the VMA's file to this, should the mmap operation succeed:
vma_set_file(vma, obj->base.filp);
That is - it is the file that is intended to back the VMA mapping.
This is not an issue currently, as so far we have only implemented
f_op->mmap_prepare() handlers for some file systems and internal mm uses,
and the only stacked f_op->mmap() operations that can be performed upon
these are those in backing_file_mmap() and coda_file_mmap(), both of which
use vma->vm_file.
However, moving forward, as we convert drivers to using
f_op->mmap_prepare(), this will become a problem.
Resolve this issue by explicitly setting desc->file to the provided file
parameter and update callers accordingly.
Callers are expected to read desc->file and update desc->vm_file - the
former will be the file provided by the caller (if stacked, this may
differ from vma->vm_file).
If the caller needs to differentiate between the two they therefore now
can.
While we are here, also provide a variant of compat_vma_mmap_prepare()
that operates against a pointer to any file_operations struct and does not
assume that the file_operations struct we are interested in is file->f_op.
This function is __compat_vma_mmap_prepare() and we invoke it from
compat_vma_mmap_prepare() so that we share code between the two functions.
This is important, because some drivers provide hooks in a separate
struct, for instance struct drm_device provides an fops field for this
purpose.
Also update the VMA selftests accordingly.
Link: https://lkml.kernel.org/r/dd0c72df8a33e8ffaa243eeb9b01010b670610e9.1756920635.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-03 18:48:42 +01:00
|
|
|
return __compat_vma_mmap_prepare(file->f_op, file, vma);
|
2025-06-09 17:57:49 +01:00
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL(compat_vma_mmap_prepare);
|
mm: split folio_pte_batch() into folio_pte_batch() and folio_pte_batch_flags()
Many users (including upcoming ones) don't really need the flags etc, and
can live with the possible overhead of a function call.
So let's provide a basic, non-inlined folio_pte_batch(), to avoid code
bloat while still providing a variant that optimizes out all flag checks
at runtime. folio_pte_batch_flags() will get inlined into
folio_pte_batch(), optimizing out any conditionals that depend on input
flags.
folio_pte_batch() will behave like folio_pte_batch_flags() when no flags
are specified. It's okay to add new users of folio_pte_batch_flags(), but
using folio_pte_batch() if applicable is preferred.
So, before this change, folio_pte_batch() was inlined into the C file
optimized by propagating constants within the resulting object file.
With this change, we now also have a folio_pte_batch() that is optimized
by propagating all constants. But instead of having one instance per
object file, we have a single shared one.
In zap_present_ptes(), where we care about performance, the compiler
already seems to generate a call to a common inlined folio_pte_batch()
variant, shared with fork() code. So calling the new non-inlined variant
should not make a difference.
While at it, drop the "addr" parameter that is unused.
Link: https://lkml.kernel.org/r/20250702104926.212243-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Link: https://lore.kernel.org/linux-mm/20250503182858.5a02729fcffd6d4723afcfc2@linux-foundation.org/
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jann Horn <jannh@google.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-07-02 12:49:25 +02:00
|
|
|
|
2025-07-14 09:16:52 -04:00
|
|
|
static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
|
|
|
|
|
const struct page *page)
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
* Only the first page of a high-order buddy page has PageBuddy() set.
|
|
|
|
|
* So we have to check manually whether this page is part of a high-
|
|
|
|
|
* order buddy page.
|
|
|
|
|
*/
|
|
|
|
|
if (PageBuddy(page))
|
|
|
|
|
ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
|
|
|
|
|
else if (page_count(page) == 0 && is_free_buddy_page(page))
|
|
|
|
|
ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
|
|
|
|
|
|
|
|
|
|
if (folio_test_idle(folio))
|
|
|
|
|
ps->flags |= PAGE_SNAPSHOT_PG_IDLE;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* snapshot_page() - Create a snapshot of a struct page
|
|
|
|
|
* @ps: Pointer to a struct page_snapshot to store the page snapshot
|
|
|
|
|
* @page: The page to snapshot
|
|
|
|
|
*
|
|
|
|
|
* Create a snapshot of the page and store both its struct page and struct
|
|
|
|
|
* folio representations in @ps.
|
|
|
|
|
*
|
|
|
|
|
* A snapshot is marked as "faithful" if the compound state of @page was
|
|
|
|
|
* stable and allowed safe reconstruction of the folio representation. In
|
|
|
|
|
* rare cases where this is not possible (e.g. due to folio splitting),
|
|
|
|
|
* snapshot_page() falls back to treating @page as a single page and the
|
|
|
|
|
* snapshot is marked as "unfaithful". The snapshot_page_is_faithful()
|
|
|
|
|
* helper can be used to check for this condition.
|
|
|
|
|
*/
|
|
|
|
|
void snapshot_page(struct page_snapshot *ps, const struct page *page)
|
|
|
|
|
{
|
|
|
|
|
unsigned long head, nr_pages = 1;
|
|
|
|
|
struct folio *foliop;
|
|
|
|
|
int loops = 5;
|
|
|
|
|
|
|
|
|
|
ps->pfn = page_to_pfn(page);
|
|
|
|
|
ps->flags = PAGE_SNAPSHOT_FAITHFUL;
|
|
|
|
|
|
|
|
|
|
again:
|
|
|
|
|
memset(&ps->folio_snapshot, 0, sizeof(struct folio));
|
|
|
|
|
memcpy(&ps->page_snapshot, page, sizeof(*page));
|
|
|
|
|
head = ps->page_snapshot.compound_head;
|
|
|
|
|
if ((head & 1) == 0) {
|
|
|
|
|
ps->idx = 0;
|
|
|
|
|
foliop = (struct folio *)&ps->page_snapshot;
|
|
|
|
|
if (!folio_test_large(foliop)) {
|
|
|
|
|
set_ps_flags(ps, page_folio(page), page);
|
|
|
|
|
memcpy(&ps->folio_snapshot, foliop,
|
|
|
|
|
sizeof(struct page));
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
foliop = (struct folio *)page;
|
|
|
|
|
} else {
|
|
|
|
|
foliop = (struct folio *)(head - 1);
|
|
|
|
|
ps->idx = folio_page_idx(foliop, page);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (ps->idx < MAX_FOLIO_NR_PAGES) {
|
|
|
|
|
memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page));
|
|
|
|
|
nr_pages = folio_nr_pages(&ps->folio_snapshot);
|
|
|
|
|
if (nr_pages > 1)
|
|
|
|
|
memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2,
|
|
|
|
|
sizeof(struct page));
|
|
|
|
|
set_ps_flags(ps, foliop, page);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (ps->idx > nr_pages) {
|
|
|
|
|
if (loops-- > 0)
|
|
|
|
|
goto again;
|
|
|
|
|
clear_compound_head(&ps->page_snapshot);
|
|
|
|
|
foliop = (struct folio *)&ps->page_snapshot;
|
|
|
|
|
memcpy(&ps->folio_snapshot, foliop, sizeof(struct page));
|
|
|
|
|
ps->flags = 0;
|
|
|
|
|
ps->idx = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
mm: split folio_pte_batch() into folio_pte_batch() and folio_pte_batch_flags()
Many users (including upcoming ones) don't really need the flags etc, and
can live with the possible overhead of a function call.
So let's provide a basic, non-inlined folio_pte_batch(), to avoid code
bloat while still providing a variant that optimizes out all flag checks
at runtime. folio_pte_batch_flags() will get inlined into
folio_pte_batch(), optimizing out any conditionals that depend on input
flags.
folio_pte_batch() will behave like folio_pte_batch_flags() when no flags
are specified. It's okay to add new users of folio_pte_batch_flags(), but
using folio_pte_batch() if applicable is preferred.
So, before this change, folio_pte_batch() was inlined into the C file
optimized by propagating constants within the resulting object file.
With this change, we now also have a folio_pte_batch() that is optimized
by propagating all constants. But instead of having one instance per
object file, we have a single shared one.
In zap_present_ptes(), where we care about performance, the compiler
already seems to generate a call to a common inlined folio_pte_batch()
variant, shared with fork() code. So calling the new non-inlined variant
should not make a difference.
While at it, drop the "addr" parameter that is unused.
Link: https://lkml.kernel.org/r/20250702104926.212243-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Link: https://lore.kernel.org/linux-mm/20250503182858.5a02729fcffd6d4723afcfc2@linux-foundation.org/
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jann Horn <jannh@google.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-07-02 12:49:25 +02:00
|
|
|
#ifdef CONFIG_MMU
/**
 * folio_pte_batch - detect a PTE batch for a large folio
 * @folio: The large folio to detect a PTE batch for.
 * @ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @max_nr: The maximum number of table entries to consider.
 *
 * Simplified, non-inlined variant of folio_pte_batch_flags() that passes
 * no flags.
 *
 * A PTE batch is a run of consecutive (present) PTEs that map consecutive
 * pages of the same large folio in a single VMA and a single page table.
 *
 * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
 * the accessed bit, writable bit, dirty bit and soft-dirty bit.
 *
 * ptep must map any page of the folio. max_nr must be at least one and
 * must be limited by the caller so scanning cannot exceed a single VMA and
 * a single page table.
 *
 * Return: the number of table entries in the batch.
 */
unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
		unsigned int max_nr)
{
	const unsigned int nr = folio_pte_batch_flags(folio, NULL, ptep, &pte,
						      max_nr, 0);

	return nr;
}
#endif /* CONFIG_MMU */
|
2025-09-01 17:03:43 +02:00
|
|
|
|
|
|
|
|
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
/**
 * page_range_contiguous - test whether the page range is contiguous
 * @page: the start of the page range.
 * @nr_pages: the number of pages in the range.
 *
 * Check that the struct pages of the range can be walked by plain pointer
 * increments, i.e. that stepping through them corresponds to stepping
 * through a contiguous PFN range.
 *
 * This function should primarily only be used for debug checks, or when
 * working with page ranges that are not naturally contiguous (pages within
 * a folio, for example, are naturally contiguous).
 *
 * Returns true if contiguous, otherwise false.
 */
bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
{
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = ALIGN(start_pfn, PAGES_PER_SECTION);

	/*
	 * The memmap is allocated per memory section, so pages inside the
	 * first section need no check. Every further section the range
	 * spans is checked once: its first page must be the one we would
	 * arrive at by simply advancing from @page.
	 */
	while (pfn < end_pfn) {
		const struct page *expected = pfn_to_page(pfn);

		if (unlikely(page + (pfn - start_pfn) != expected))
			return false;
		pfn += PAGES_PER_SECTION;
	}
	return true;
}
EXPORT_SYMBOL(page_range_contiguous);
#endif
|