2024-03-21 09:36:35 -07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
|
#include <linux/alloc_tag.h>
|
2024-10-23 10:07:56 -07:00
|
|
|
#include <linux/execmem.h>
|
2024-03-21 09:36:35 -07:00
|
|
|
#include <linux/fs.h>
|
|
|
|
|
#include <linux/gfp.h>
|
2024-10-23 10:07:59 -07:00
|
|
|
#include <linux/kallsyms.h>
|
2024-03-21 09:36:35 -07:00
|
|
|
#include <linux/module.h>
|
2024-03-21 09:36:36 -07:00
|
|
|
#include <linux/page_ext.h>
|
2024-03-21 09:36:35 -07:00
|
|
|
#include <linux/proc_fs.h>
|
|
|
|
|
#include <linux/seq_buf.h>
|
|
|
|
|
#include <linux/seq_file.h>
|
2025-08-14 17:38:27 +08:00
|
|
|
#include <linux/string_choices.h>
|
2024-10-23 10:07:57 -07:00
|
|
|
#include <linux/vmalloc.h>
|
2025-06-20 02:31:54 +08:00
|
|
|
#include <linux/kmemleak.h>
|
2024-03-21 09:36:35 -07:00
|
|
|
|
2024-10-23 10:07:55 -07:00
|
|
|
/* Name of the procfs entry; passed to remove_proc_entry() on shutdown. */
#define ALLOCINFO_FILE_NAME		"allocinfo"

/* Byte size of a region able to hold 100000 struct alloc_tag entries. */
#define MODULE_ALLOC_TAG_VMAP_SIZE	(100000UL * sizeof(struct alloc_tag))

/*
 * Build the section boundary symbol names by string-literal concatenation
 * of the codetag start/stop prefix with the section name.
 */
#define SECTION_START(NAME)	(CODETAG_SECTION_START_PREFIX NAME)
#define SECTION_STOP(NAME)	(CODETAG_SECTION_STOP_PREFIX NAME)
|
2024-10-23 10:07:55 -07:00
|
|
|
|
|
|
|
|
/*
 * Tracks whether memory allocation profiling can be used at all. It starts
 * out true only when profiling is enabled by default at build time, and is
 * cleared by shutdown_mem_profiling().
 */
#ifndef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
static bool mem_profiling_support;
#else
static bool mem_profiling_support = true;
#endif
|
|
|
|
|
|
2024-03-21 09:36:35 -07:00
|
|
|
static struct codetag_type *alloc_tag_cttype;
|
|
|
|
|
|
2025-06-18 09:58:09 +08:00
|
|
|
#ifdef CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU
|
2024-03-21 09:36:35 -07:00
|
|
|
DEFINE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
|
|
|
|
|
EXPORT_SYMBOL(_shared_alloc_tag);
|
2025-06-18 09:58:09 +08:00
|
|
|
#endif
|
2024-03-21 09:36:35 -07:00
|
|
|
|
|
|
|
|
DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
|
|
|
|
|
mem_alloc_profiling_key);
|
2024-12-26 13:16:38 -08:00
|
|
|
EXPORT_SYMBOL(mem_alloc_profiling_key);
|
|
|
|
|
|
2024-10-23 10:07:59 -07:00
|
|
|
DEFINE_STATIC_KEY_FALSE(mem_profiling_compressed);
|
|
|
|
|
|
|
|
|
|
struct alloc_tag_kernel_section kernel_tags = { NULL, 0 };
|
|
|
|
|
unsigned long alloc_tag_ref_mask;
|
|
|
|
|
int alloc_tag_ref_offs;
|
2024-03-21 09:36:35 -07:00
|
|
|
|
2024-05-14 09:31:28 -07:00
|
|
|
struct allocinfo_private {
|
|
|
|
|
struct codetag_iterator iter;
|
|
|
|
|
bool print_header;
|
|
|
|
|
};
|
|
|
|
|
|
2024-03-21 09:36:35 -07:00
|
|
|
/*
 * seq_file ->start callback.
 *
 * Takes the codetag module-list lock (released again in allocinfo_stop()).
 * A start at offset 0 re-initialises the iterator kept in m->private, steps
 * it onto the first tag and requests a header; a start at any other offset
 * simply resumes from wherever the stored iterator currently points.
 *
 * Returns the private state when the iterator points at a tag, NULL when
 * there is nothing (more) to show.
 */
static void *allocinfo_start(struct seq_file *m, loff_t *pos)
{
	struct allocinfo_private *priv = m->private;
	loff_t offset = *pos;

	codetag_lock_module_list(alloc_tag_cttype, true);

	if (offset == 0) {
		priv->print_header = true;
		priv->iter = codetag_get_ct_iter(alloc_tag_cttype);
		codetag_next_ct(&priv->iter);
	}

	if (!priv->iter.ct)
		return NULL;

	return priv;
}
|
|
|
|
|
|
|
|
|
|
/*
 * seq_file ->next callback: advance the stored iterator by one tag and
 * bump the position. The position is incremented even when the walk has
 * run out of tags.
 *
 * Returns the private state while a tag is available, NULL at the end.
 */
static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
{
	struct allocinfo_private *priv = arg;
	struct codetag *next_ct;

	next_ct = codetag_next_ct(&priv->iter);
	++*pos;

	return next_ct ? priv : NULL;
}
|
|
|
|
|
|
|
|
|
|
static void allocinfo_stop(struct seq_file *m, void *arg)
|
|
|
|
|
{
|
alloc_tag: keep codetag iterator active between read()
When reading /proc/allocinfo, for each read syscall, seq_file would invoke
start/stop callbacks. In start callback, a memory is alloced to store
iterator and the iterator would start from beginning to walk linearly to
current read position.
seq_file read() takes at most 4096 bytes, even if read with a larger user
space buffer, meaning read out all of /proc/allocinfo, tens of read
syscalls are needed. For example, a 306036 bytes allocinfo files need 76
reads:
$ sudo cat /proc/allocinfo | wc
3964 16678 306036
$ sudo strace -T -e read cat /proc/allocinfo
...
read(3, " 4096 1 arch/x86/k"..., 131072) = 4063 <0.000062>
...
read(3, " 0 0 sound/core"..., 131072) = 4021 <0.000150>
...
For those n=3964 lines, each read takes about m=3964/76=52 lines,
since iterator restart from beginning for each read(),
it would move forward
m steps on 1st read
2*m steps on 2nd read
3*m steps on 3rd read
...
n steps on last read
As read() along, those linear seek steps make read() calls slower and
slower. Adding those up, codetag iterator moves about O(n*n/m) steps,
making data structure traversal take significant part of the whole
reading. Profiling when stress reading /proc/allocinfo confirms it:
vfs_read(99.959% 1677299/1677995)
proc_reg_read_iter(99.856% 1674881/1677299)
seq_read_iter(99.959% 1674191/1674881)
allocinfo_start(75.664% 1266755/1674191)
codetag_next_ct(79.217% 1003487/1266755) <---
srso_return_thunk(1.264% 16011/1266755)
__kmalloc_cache_noprof(0.102% 1296/1266755)
...
allocinfo_show(21.287% 356378/1674191)
allocinfo_next(1.530% 25621/1674191)
codetag_next_ct() takes major part.
A private data alloced at open() time can be used to carry iterator alive
across read() calls, and avoid the memory allocation and iterator reset
for each read(). This way, only O(1) memory allocation and O(n) steps
iterating, and `time` shows performance improvement from ~7ms to ~4ms.
Profiling with the change:
vfs_read(99.865% 1581073/1583214)
proc_reg_read_iter(99.485% 1572934/1581073)
seq_read_iter(99.846% 1570519/1572934)
allocinfo_show(87.428% 1373074/1570519)
seq_buf_printf(83.695% 1149196/1373074)
seq_buf_putc(1.917% 26321/1373074)
_find_next_bit(1.531% 21023/1373074)
...
codetag_to_text(0.490% 6727/1373074)
...
allocinfo_next(6.275% 98543/1570519)
...
allocinfo_start(0.369% 5790/1570519)
...
Now seq_buf_printf() takes major part.
Link: https://lkml.kernel.org/r/20250609064408.112783-1-00107082@163.com
Signed-off-by: David Wang <00107082@163.com>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-06-09 14:44:08 +08:00
|
|
|
codetag_lock_module_list(alloc_tag_cttype, false);
|
2024-03-21 09:36:35 -07:00
|
|
|
}
|
|
|
|
|
|
2024-05-14 09:31:28 -07:00
|
|
|
/*
 * Emit the two header lines of the allocinfo output into @buf: a format
 * version line followed by a column legend.
 */
static void print_allocinfo_header(struct seq_buf *buf)
{
	/* The version line lets the output format evolve later on. */
	seq_buf_printf(buf, "allocinfo - version: 2.0\n");
	/* Legend for the columns produced by alloc_tag_to_text(). */
	seq_buf_printf(buf, "#     <size>  <calls> <tag info>\n");
}
|
|
|
|
|
|
2024-03-21 09:36:35 -07:00
|
|
|
static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
|
|
|
|
|
{
|
|
|
|
|
struct alloc_tag *tag = ct_to_alloc_tag(ct);
|
|
|
|
|
struct alloc_tag_counters counter = alloc_tag_read(tag);
|
|
|
|
|
s64 bytes = counter.bytes;
|
|
|
|
|
|
|
|
|
|
seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls);
|
|
|
|
|
codetag_to_text(out, ct);
|
2025-09-15 16:02:24 -07:00
|
|
|
if (unlikely(alloc_tag_is_inaccurate(tag)))
|
|
|
|
|
seq_buf_printf(out, " accurate:no");
|
2024-03-21 09:36:35 -07:00
|
|
|
seq_buf_putc(out, ' ');
|
|
|
|
|
seq_buf_putc(out, '\n');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int allocinfo_show(struct seq_file *m, void *arg)
|
|
|
|
|
{
|
2024-05-14 09:31:28 -07:00
|
|
|
struct allocinfo_private *priv = (struct allocinfo_private *)arg;
|
2024-03-21 09:36:35 -07:00
|
|
|
char *bufp;
|
|
|
|
|
size_t n = seq_get_buf(m, &bufp);
|
|
|
|
|
struct seq_buf buf;
|
|
|
|
|
|
|
|
|
|
seq_buf_init(&buf, bufp, n);
|
2024-05-14 09:31:28 -07:00
|
|
|
if (priv->print_header) {
|
|
|
|
|
print_allocinfo_header(&buf);
|
|
|
|
|
priv->print_header = false;
|
|
|
|
|
}
|
|
|
|
|
alloc_tag_to_text(&buf, priv->iter.ct);
|
2024-03-21 09:36:35 -07:00
|
|
|
seq_commit(m, seq_buf_used(&buf));
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static const struct seq_operations allocinfo_seq_op = {
|
|
|
|
|
.start = allocinfo_start,
|
|
|
|
|
.next = allocinfo_next,
|
|
|
|
|
.stop = allocinfo_stop,
|
|
|
|
|
.show = allocinfo_show,
|
|
|
|
|
};
|
|
|
|
|
|
2024-03-21 09:36:54 -07:00
|
|
|
/*
 * Collect the allocation tags with the largest byte counts.
 *
 * @tags:      output array with room for @count entries; filled in
 *             descending order of bytes
 * @count:     capacity of @tags
 * @can_sleep: when true the module-list lock is taken unconditionally;
 *             when false only a trylock is attempted
 *
 * Returns the number of entries written. Returns 0 when alloc_tag_cttype
 * is NULL or an error pointer, or when the trylock fails.
 */
size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sleep)
{
	struct codetag_iterator iter;
	struct codetag *ct;
	unsigned int nr = 0;

	/* Nothing to lock or walk if the codetag type was never set up. */
	if (IS_ERR_OR_NULL(alloc_tag_cttype))
		return 0;

	if (can_sleep) {
		codetag_lock_module_list(alloc_tag_cttype, true);
	} else if (!codetag_trylock_module_list(alloc_tag_cttype)) {
		return 0;
	}

	iter = codetag_get_ct_iter(alloc_tag_cttype);
	for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
		struct alloc_tag_counters counter = alloc_tag_read(ct_to_alloc_tag(ct));
		struct codetag_bytes entry;
		unsigned int slot = 0;

		entry.ct = ct;
		entry.bytes = counter.bytes;

		/* Find the first stored entry that is strictly smaller. */
		while (slot < nr && !(entry.bytes > tags[slot].bytes))
			slot++;

		/* Does not make it into the top @count. */
		if (!(slot < count))
			continue;

		/* Array full: the last entry falls off the end. */
		if (nr == count)
			nr--;

		memmove(&tags[slot + 1], &tags[slot],
			sizeof(tags[0]) * (nr - slot));
		tags[slot] = entry;
		nr++;
	}

	codetag_lock_module_list(alloc_tag_cttype, false);

	return nr;
}
|
|
|
|
|
|
2024-10-24 09:23:18 -07:00
|
|
|
void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
|
|
|
|
|
{
|
|
|
|
|
int i;
|
|
|
|
|
struct alloc_tag *tag;
|
|
|
|
|
unsigned int nr_pages = 1 << new_order;
|
|
|
|
|
|
|
|
|
|
if (!mem_alloc_profiling_enabled())
|
|
|
|
|
return;
|
|
|
|
|
|
2025-02-01 15:18:00 -08:00
|
|
|
tag = __pgalloc_tag_get(&folio->page);
|
2024-10-24 09:23:18 -07:00
|
|
|
if (!tag)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
for (i = nr_pages; i < (1 << old_order); i += nr_pages) {
|
|
|
|
|
union pgtag_ref_handle handle;
|
|
|
|
|
union codetag_ref ref;
|
|
|
|
|
|
|
|
|
|
if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) {
|
|
|
|
|
/* Set new reference to point to the original tag */
|
|
|
|
|
alloc_tag_ref_set(&ref, tag);
|
|
|
|
|
update_page_tag_ref(handle, &ref);
|
|
|
|
|
put_page_tag_ref(handle);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-11-29 10:52:13 +08:00
|
|
|
void pgalloc_tag_swap(struct folio *new, struct folio *old)
|
2024-10-24 09:23:18 -07:00
|
|
|
{
|
2024-11-29 10:52:13 +08:00
|
|
|
union pgtag_ref_handle handle_old, handle_new;
|
|
|
|
|
union codetag_ref ref_old, ref_new;
|
|
|
|
|
struct alloc_tag *tag_old, *tag_new;
|
2024-10-24 09:23:18 -07:00
|
|
|
|
2024-12-26 13:16:39 -08:00
|
|
|
if (!mem_alloc_profiling_enabled())
|
|
|
|
|
return;
|
|
|
|
|
|
2025-02-01 15:18:00 -08:00
|
|
|
tag_old = __pgalloc_tag_get(&old->page);
|
2024-11-29 10:52:13 +08:00
|
|
|
if (!tag_old)
|
|
|
|
|
return;
|
2025-02-01 15:18:00 -08:00
|
|
|
tag_new = __pgalloc_tag_get(&new->page);
|
2024-11-29 10:52:13 +08:00
|
|
|
if (!tag_new)
|
2024-10-24 09:23:18 -07:00
|
|
|
return;
|
|
|
|
|
|
2024-11-29 10:52:13 +08:00
|
|
|
if (!get_page_tag_ref(&old->page, &ref_old, &handle_old))
|
2024-10-24 09:23:18 -07:00
|
|
|
return;
|
2024-11-29 10:52:13 +08:00
|
|
|
if (!get_page_tag_ref(&new->page, &ref_new, &handle_new)) {
|
|
|
|
|
put_page_tag_ref(handle_old);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2024-12-13 09:33:32 +08:00
|
|
|
/*
|
|
|
|
|
* Clear tag references to avoid debug warning when using
|
|
|
|
|
* __alloc_tag_ref_set() with non-empty reference.
|
|
|
|
|
*/
|
|
|
|
|
set_codetag_empty(&ref_old);
|
|
|
|
|
set_codetag_empty(&ref_new);
|
|
|
|
|
|
2024-11-29 10:52:13 +08:00
|
|
|
/* swap tags */
|
|
|
|
|
__alloc_tag_ref_set(&ref_old, tag_new);
|
|
|
|
|
update_page_tag_ref(handle_old, &ref_old);
|
|
|
|
|
__alloc_tag_ref_set(&ref_new, tag_old);
|
|
|
|
|
update_page_tag_ref(handle_new, &ref_new);
|
2024-10-24 09:23:18 -07:00
|
|
|
|
2024-11-29 10:52:13 +08:00
|
|
|
put_page_tag_ref(handle_old);
|
|
|
|
|
put_page_tag_ref(handle_new);
|
2024-10-24 09:23:18 -07:00
|
|
|
}
|
|
|
|
|
|
2024-10-23 10:07:59 -07:00
|
|
|
/*
 * Turn memory allocation profiling off for good.
 *
 * The profiling static key is disabled if currently enabled. If profiling
 * was still marked as supported, the support flag is cleared and, when
 * @remove_file is true, the allocinfo proc entry is removed as well.
 */
static void shutdown_mem_profiling(bool remove_file)
{
	if (mem_alloc_profiling_enabled())
		static_branch_disable(&mem_alloc_profiling_key);

	if (mem_profiling_support) {
		if (remove_file)
			remove_proc_entry(ALLOCINFO_FILE_NAME, NULL);
		mem_profiling_support = false;
	}
}
|
|
|
|
|
|
2024-10-23 10:07:59 -07:00
|
|
|
/*
 * Init-time setup for compressed tag references.
 *
 * Only acts when profiling is supported and mem_profiling_compressed is
 * enabled. Looks up the start/stop symbols of the allocation tag section
 * to fill kernel_tags, then either shuts profiling down (too many tags for
 * the unused page flag bits) or records the offset and mask used for tag
 * references.
 */
void __init alloc_tag_sec_init(void)
{
	struct alloc_tag *section_end;

	if (!mem_profiling_support ||
	    !static_key_enabled(&mem_profiling_compressed))
		return;

	kernel_tags.first_tag = (struct alloc_tag *)kallsyms_lookup_name(
					SECTION_START(ALLOC_TAG_SECTION_NAME));
	section_end = (struct alloc_tag *)kallsyms_lookup_name(
					SECTION_STOP(ALLOC_TAG_SECTION_NAME));
	/* Pointer difference yields the number of tags in the section. */
	kernel_tags.count = section_end - kernel_tags.first_tag;

	/* Check if kernel tags fit into page flags */
	if (kernel_tags.count > (1UL << NR_UNUSED_PAGEFLAG_BITS)) {
		shutdown_mem_profiling(false); /* allocinfo file does not exist yet */
		pr_err("%lu allocation tags cannot be references using %d available page flag bits. Memory allocation profiling is disabled!\n",
			kernel_tags.count, NR_UNUSED_PAGEFLAG_BITS);
		return;
	}

	alloc_tag_ref_offs = (LRU_REFS_PGOFF - NR_UNUSED_PAGEFLAG_BITS);
	alloc_tag_ref_mask = ((1UL << NR_UNUSED_PAGEFLAG_BITS) - 1);
	pr_debug("Memory allocation profiling compression is using %d page flag bits!\n",
		 NR_UNUSED_PAGEFLAG_BITS);
}
|
|
|
|
|
|
2024-10-23 10:07:56 -07:00
|
|
|
#ifdef CONFIG_MODULES
|
|
|
|
|
|
|
|
|
|
static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE);
|
2024-10-23 10:07:57 -07:00
|
|
|
static struct vm_struct *vm_module_tags;
|
2024-10-23 10:07:56 -07:00
|
|
|
/* A dummy object used to indicate an unloaded module */
|
|
|
|
|
static struct module unloaded_mod;
|
|
|
|
|
/* A dummy object used to indicate a module prepended area */
|
|
|
|
|
static struct module prepend_mod;
|
|
|
|
|
|
2024-10-23 10:07:59 -07:00
|
|
|
struct alloc_tag_module_section module_tags;
|
|
|
|
|
|
|
|
|
|
static inline unsigned long alloc_tag_align(unsigned long val)
|
|
|
|
|
{
|
|
|
|
|
if (!static_key_enabled(&mem_profiling_compressed)) {
|
|
|
|
|
/* No alignment requirements when we are not indexing the tags */
|
|
|
|
|
return val;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (val % sizeof(struct alloc_tag) == 0)
|
|
|
|
|
return val;
|
|
|
|
|
return ((val / sizeof(struct alloc_tag)) + 1) * sizeof(struct alloc_tag);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static bool ensure_alignment(unsigned long align, unsigned int *prepend)
|
|
|
|
|
{
|
|
|
|
|
if (!static_key_enabled(&mem_profiling_compressed)) {
|
|
|
|
|
/* No alignment requirements when we are not indexing the tags */
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If alloc_tag size is not a multiple of required alignment, tag
|
|
|
|
|
* indexing does not work.
|
|
|
|
|
*/
|
|
|
|
|
if (!IS_ALIGNED(sizeof(struct alloc_tag), align))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* Ensure prepend consumes multiple of alloc_tag-sized blocks */
|
|
|
|
|
if (*prepend)
|
|
|
|
|
*prepend = alloc_tag_align(*prepend);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline bool tags_addressable(void)
|
|
|
|
|
{
|
|
|
|
|
unsigned long tag_idx_count;
|
|
|
|
|
|
|
|
|
|
if (!static_key_enabled(&mem_profiling_compressed))
|
|
|
|
|
return true; /* with page_ext tags are always addressable */
|
|
|
|
|
|
|
|
|
|
tag_idx_count = CODETAG_ID_FIRST + kernel_tags.count +
|
|
|
|
|
module_tags.size / sizeof(struct alloc_tag);
|
|
|
|
|
|
|
|
|
|
return tag_idx_count < (1UL << NR_UNUSED_PAGEFLAG_BITS);
|
|
|
|
|
}
|
2024-10-23 10:07:56 -07:00
|
|
|
|
|
|
|
|
static bool needs_section_mem(struct module *mod, unsigned long size)
|
2024-03-21 09:36:35 -07:00
|
|
|
{
|
2024-10-23 10:07:59 -07:00
|
|
|
if (!mem_profiling_support)
|
|
|
|
|
return false;
|
|
|
|
|
|
2024-10-23 10:07:56 -07:00
|
|
|
return size >= sizeof(struct alloc_tag);
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-16 17:07:39 -07:00
|
|
|
static bool clean_unused_counters(struct alloc_tag *start_tag,
|
|
|
|
|
struct alloc_tag *end_tag)
|
2024-10-23 10:07:56 -07:00
|
|
|
{
|
2025-05-16 17:07:39 -07:00
|
|
|
struct alloc_tag *tag;
|
|
|
|
|
bool ret = true;
|
|
|
|
|
|
|
|
|
|
for (tag = start_tag; tag <= end_tag; tag++) {
|
2024-10-23 10:07:56 -07:00
|
|
|
struct alloc_tag_counters counter;
|
2024-03-21 09:36:35 -07:00
|
|
|
|
2025-05-16 17:07:39 -07:00
|
|
|
if (!tag->counters)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
counter = alloc_tag_read(tag);
|
|
|
|
|
if (!counter.bytes) {
|
|
|
|
|
free_percpu(tag->counters);
|
|
|
|
|
tag->counters = NULL;
|
|
|
|
|
} else {
|
|
|
|
|
ret = false;
|
|
|
|
|
}
|
2024-10-23 10:07:56 -07:00
|
|
|
}
|
|
|
|
|
|
2025-05-16 17:07:39 -07:00
|
|
|
return ret;
|
2024-10-23 10:07:56 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Called with mod_area_mt locked */
|
|
|
|
|
static void clean_unused_module_areas_locked(void)
|
|
|
|
|
{
|
|
|
|
|
MA_STATE(mas, &mod_area_mt, 0, module_tags.size);
|
|
|
|
|
struct module *val;
|
|
|
|
|
|
|
|
|
|
mas_for_each(&mas, val, module_tags.size) {
|
2025-05-16 17:07:39 -07:00
|
|
|
struct alloc_tag *start_tag;
|
|
|
|
|
struct alloc_tag *end_tag;
|
|
|
|
|
|
2024-10-23 10:07:56 -07:00
|
|
|
if (val != &unloaded_mod)
|
2024-03-21 09:36:35 -07:00
|
|
|
continue;
|
|
|
|
|
|
2024-10-23 10:07:56 -07:00
|
|
|
/* Release area if all tags are unused */
|
2025-05-16 17:07:39 -07:00
|
|
|
start_tag = (struct alloc_tag *)(module_tags.start_addr + mas.index);
|
|
|
|
|
end_tag = (struct alloc_tag *)(module_tags.start_addr + mas.last);
|
|
|
|
|
if (clean_unused_counters(start_tag, end_tag))
|
2024-10-23 10:07:56 -07:00
|
|
|
mas_erase(&mas);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Called with mod_area_mt locked */
|
|
|
|
|
static bool find_aligned_area(struct ma_state *mas, unsigned long section_size,
|
|
|
|
|
unsigned long size, unsigned int prepend, unsigned long align)
|
|
|
|
|
{
|
|
|
|
|
bool cleanup_done = false;
|
|
|
|
|
|
|
|
|
|
repeat:
|
|
|
|
|
/* Try finding exact size and hope the start is aligned */
|
|
|
|
|
if (!mas_empty_area(mas, 0, section_size - 1, prepend + size)) {
|
|
|
|
|
if (IS_ALIGNED(mas->index + prepend, align))
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
/* Try finding larger area to align later */
|
|
|
|
|
mas_reset(mas);
|
|
|
|
|
if (!mas_empty_area(mas, 0, section_size - 1,
|
|
|
|
|
size + prepend + align - 1))
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* No free area, try cleanup stale data and repeat the search once */
|
|
|
|
|
if (!cleanup_done) {
|
|
|
|
|
clean_unused_module_areas_locked();
|
|
|
|
|
cleanup_done = true;
|
|
|
|
|
mas_reset(mas);
|
|
|
|
|
goto repeat;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2024-10-23 10:07:57 -07:00
|
|
|
static int vm_module_tags_populate(void)
|
|
|
|
|
{
|
2024-11-29 16:14:22 -08:00
|
|
|
unsigned long phys_end = ALIGN_DOWN(module_tags.start_addr, PAGE_SIZE) +
|
|
|
|
|
(vm_module_tags->nr_pages << PAGE_SHIFT);
|
|
|
|
|
unsigned long new_end = module_tags.start_addr + module_tags.size;
|
2024-10-23 10:07:57 -07:00
|
|
|
|
2024-11-29 16:14:22 -08:00
|
|
|
if (phys_end < new_end) {
|
2024-10-23 10:07:57 -07:00
|
|
|
struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages;
|
2024-11-29 16:14:22 -08:00
|
|
|
unsigned long old_shadow_end = ALIGN(phys_end, MODULE_ALIGN);
|
|
|
|
|
unsigned long new_shadow_end = ALIGN(new_end, MODULE_ALIGN);
|
2024-10-23 10:07:57 -07:00
|
|
|
unsigned long more_pages;
|
2025-04-09 22:51:11 +00:00
|
|
|
unsigned long nr = 0;
|
2024-10-23 10:07:57 -07:00
|
|
|
|
2024-11-29 16:14:22 -08:00
|
|
|
more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT;
|
2025-04-09 22:51:11 +00:00
|
|
|
while (nr < more_pages) {
|
|
|
|
|
unsigned long allocated;
|
|
|
|
|
|
|
|
|
|
allocated = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN,
|
|
|
|
|
NUMA_NO_NODE, more_pages - nr, next_page + nr);
|
|
|
|
|
|
|
|
|
|
if (!allocated)
|
|
|
|
|
break;
|
|
|
|
|
nr += allocated;
|
|
|
|
|
}
|
|
|
|
|
|
2024-10-23 10:07:57 -07:00
|
|
|
if (nr < more_pages ||
|
2024-11-29 16:14:22 -08:00
|
|
|
vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL,
|
2024-10-23 10:07:57 -07:00
|
|
|
next_page, PAGE_SHIFT) < 0) {
|
2025-09-15 14:27:54 -07:00
|
|
|
release_pages_arg arg = { .pages = next_page };
|
|
|
|
|
|
2024-10-23 10:07:57 -07:00
|
|
|
/* Clean up and error out */
|
2025-09-15 14:27:54 -07:00
|
|
|
release_pages(arg, nr);
|
2024-10-23 10:07:57 -07:00
|
|
|
return -ENOMEM;
|
|
|
|
|
}
|
2024-11-29 16:14:22 -08:00
|
|
|
|
2024-10-23 10:07:57 -07:00
|
|
|
vm_module_tags->nr_pages += nr;
|
2024-11-29 16:14:22 -08:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Kasan allocates 1 byte of shadow for every 8 bytes of data.
|
|
|
|
|
* When kasan_alloc_module_shadow allocates shadow memory,
|
|
|
|
|
* its unit of allocation is a page.
|
|
|
|
|
* Therefore, here we need to align to MODULE_ALIGN.
|
|
|
|
|
*/
|
|
|
|
|
if (old_shadow_end < new_shadow_end)
|
|
|
|
|
kasan_alloc_module_shadow((void *)old_shadow_end,
|
|
|
|
|
new_shadow_end - old_shadow_end,
|
|
|
|
|
GFP_KERNEL);
|
2024-10-23 10:07:57 -07:00
|
|
|
}
|
|
|
|
|
|
2024-11-29 16:14:22 -08:00
|
|
|
/*
|
|
|
|
|
* Mark the pages as accessible, now that they are mapped.
|
|
|
|
|
* With hardware tag-based KASAN, marking is skipped for
|
|
|
|
|
* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
|
|
|
|
|
*/
|
|
|
|
|
kasan_unpoison_vmalloc((void *)module_tags.start_addr,
|
|
|
|
|
new_end - module_tags.start_addr,
|
|
|
|
|
KASAN_VMALLOC_PROT_NORMAL);
|
|
|
|
|
|
2024-10-23 10:07:57 -07:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2024-10-23 10:07:56 -07:00
|
|
|
static void *reserve_module_tags(struct module *mod, unsigned long size,
|
|
|
|
|
unsigned int prepend, unsigned long align)
|
|
|
|
|
{
|
|
|
|
|
unsigned long section_size = module_tags.end_addr - module_tags.start_addr;
|
|
|
|
|
MA_STATE(mas, &mod_area_mt, 0, section_size - 1);
|
|
|
|
|
unsigned long offset;
|
|
|
|
|
void *ret = NULL;
|
|
|
|
|
|
|
|
|
|
/* If no tags return error */
|
|
|
|
|
if (size < sizeof(struct alloc_tag))
|
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* align is always power of 2, so we can use IS_ALIGNED and ALIGN.
|
|
|
|
|
* align 0 or 1 means no alignment, to simplify set to 1.
|
|
|
|
|
*/
|
|
|
|
|
if (!align)
|
|
|
|
|
align = 1;
|
|
|
|
|
|
2024-10-23 10:07:59 -07:00
|
|
|
if (!ensure_alignment(align, &prepend)) {
|
|
|
|
|
shutdown_mem_profiling(true);
|
|
|
|
|
pr_err("%s: alignment %lu is incompatible with allocation tag indexing. Memory allocation profiling is disabled!\n",
|
|
|
|
|
mod->name, align);
|
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
}
|
|
|
|
|
|
2024-10-23 10:07:56 -07:00
|
|
|
mas_lock(&mas);
|
|
|
|
|
if (!find_aligned_area(&mas, section_size, size, prepend, align)) {
|
|
|
|
|
ret = ERR_PTR(-ENOMEM);
|
|
|
|
|
goto unlock;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Mark found area as reserved */
|
|
|
|
|
offset = mas.index;
|
|
|
|
|
offset += prepend;
|
|
|
|
|
offset = ALIGN(offset, align);
|
|
|
|
|
if (offset != mas.index) {
|
|
|
|
|
unsigned long pad_start = mas.index;
|
|
|
|
|
|
|
|
|
|
mas.last = offset - 1;
|
|
|
|
|
mas_store(&mas, &prepend_mod);
|
|
|
|
|
if (mas_is_err(&mas)) {
|
|
|
|
|
ret = ERR_PTR(xa_err(mas.node));
|
|
|
|
|
goto unlock;
|
|
|
|
|
}
|
|
|
|
|
mas.index = offset;
|
|
|
|
|
mas.last = offset + size - 1;
|
|
|
|
|
mas_store(&mas, mod);
|
|
|
|
|
if (mas_is_err(&mas)) {
|
|
|
|
|
mas.index = pad_start;
|
|
|
|
|
mas_erase(&mas);
|
|
|
|
|
ret = ERR_PTR(xa_err(mas.node));
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
mas.last = offset + size - 1;
|
|
|
|
|
mas_store(&mas, mod);
|
|
|
|
|
if (mas_is_err(&mas))
|
|
|
|
|
ret = ERR_PTR(xa_err(mas.node));
|
|
|
|
|
}
|
|
|
|
|
unlock:
|
|
|
|
|
mas_unlock(&mas);
|
|
|
|
|
|
|
|
|
|
if (IS_ERR(ret))
|
|
|
|
|
return ret;
|
2024-03-21 09:36:35 -07:00
|
|
|
|
2024-10-23 10:07:57 -07:00
|
|
|
if (module_tags.size < offset + size) {
|
|
|
|
|
int grow_res;
|
|
|
|
|
|
2024-10-23 10:07:56 -07:00
|
|
|
module_tags.size = offset + size;
|
2024-10-23 10:07:59 -07:00
|
|
|
if (mem_alloc_profiling_enabled() && !tags_addressable()) {
|
|
|
|
|
shutdown_mem_profiling(true);
|
|
|
|
|
pr_warn("With module %s there are too many tags to fit in %d page flag bits. Memory allocation profiling is disabled!\n",
|
|
|
|
|
mod->name, NR_UNUSED_PAGEFLAG_BITS);
|
|
|
|
|
}
|
|
|
|
|
|
2024-10-23 10:07:57 -07:00
|
|
|
grow_res = vm_module_tags_populate();
|
|
|
|
|
if (grow_res) {
|
2024-10-23 10:07:59 -07:00
|
|
|
shutdown_mem_profiling(true);
|
2024-10-23 10:07:57 -07:00
|
|
|
pr_err("Failed to allocate memory for allocation tags in the module %s. Memory allocation profiling is disabled!\n",
|
|
|
|
|
mod->name);
|
|
|
|
|
return ERR_PTR(grow_res);
|
|
|
|
|
}
|
|
|
|
|
}
|
2024-10-23 10:07:56 -07:00
|
|
|
|
|
|
|
|
return (struct alloc_tag *)(module_tags.start_addr + offset);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Give back the tag range reserved for @mod.
 *
 * @mod:  module whose range is released
 * @used: false releases the range unconditionally; true first checks the
 *        tags through clean_unused_counters()
 *
 * When @used is true and clean_unused_counters() returns false, every tag
 * that still has counters is reported and the range is handed over to
 * unloaded_mod instead of being cleared. In all other cases the range is
 * cleared. A prepend_mod range directly in front of it is cleared as well.
 */
static void release_module_tags(struct module *mod, bool used)
{
	MA_STATE(mas, &mod_area_mt, module_tags.size, module_tags.size);
	struct alloc_tag *first;
	struct alloc_tag *last;
	struct module *owner;

	mas_lock(&mas);

	/* Search backwards from the end of the tag area for @mod's range. */
	mas_for_each_rev(&mas, owner, 0) {
		if (owner == mod)
			break;
	}

	/* The walk ran out of entries: module not found */
	if (!owner)
		goto out;

	if (used) {
		first = (struct alloc_tag *)(module_tags.start_addr + mas.index);
		last = (struct alloc_tag *)(module_tags.start_addr + mas.last);

		if (clean_unused_counters(first, last)) {
			used = false;
		} else {
			struct alloc_tag *cur;

			for (cur = first; cur <= last; cur++) {
				struct alloc_tag_counters counter;

				if (!cur->counters)
					continue;

				counter = alloc_tag_read(cur);
				pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n",
					cur->ct.filename, cur->ct.lineno, cur->ct.modname,
					cur->ct.function, counter.bytes);
			}
		}
	}

	/* Either hand the range to unloaded_mod or clear it. */
	mas_store(&mas, used ? &unloaded_mod : NULL);

	/* Drop the padding range in front of the tags, if there is one. */
	owner = mas_prev_range(&mas, 0);
	if (owner == &prepend_mod)
		mas_store(&mas, NULL);
out:
	mas_unlock(&mas);
}
|
|
|
|
|
|
2025-05-21 09:06:02 -07:00
|
|
|
static int load_module(struct module *mod, struct codetag *start, struct codetag *stop)
|
2025-05-16 17:07:39 -07:00
|
|
|
{
|
|
|
|
|
/* Allocate module alloc_tag percpu counters */
|
|
|
|
|
struct alloc_tag *start_tag;
|
|
|
|
|
struct alloc_tag *stop_tag;
|
|
|
|
|
struct alloc_tag *tag;
|
|
|
|
|
|
2025-05-21 09:06:02 -07:00
|
|
|
/* percpu counters for core allocations are already statically allocated */
|
2025-05-16 17:07:39 -07:00
|
|
|
if (!mod)
|
2025-05-21 09:06:02 -07:00
|
|
|
return 0;
|
2025-05-16 17:07:39 -07:00
|
|
|
|
|
|
|
|
start_tag = ct_to_alloc_tag(start);
|
|
|
|
|
stop_tag = ct_to_alloc_tag(stop);
|
|
|
|
|
for (tag = start_tag; tag < stop_tag; tag++) {
|
|
|
|
|
WARN_ON(tag->counters);
|
|
|
|
|
tag->counters = alloc_percpu(struct alloc_tag_counters);
|
|
|
|
|
if (!tag->counters) {
|
|
|
|
|
while (--tag >= start_tag) {
|
|
|
|
|
free_percpu(tag->counters);
|
|
|
|
|
tag->counters = NULL;
|
|
|
|
|
}
|
2025-05-21 09:06:02 -07:00
|
|
|
pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s\n",
|
2025-05-16 17:07:39 -07:00
|
|
|
mod->name);
|
2025-05-21 09:06:02 -07:00
|
|
|
return -ENOMEM;
|
2025-05-16 17:07:39 -07:00
|
|
|
}
|
2025-05-21 09:06:02 -07:00
|
|
|
|
2025-06-20 02:31:54 +08:00
|
|
|
/*
|
|
|
|
|
* Avoid a kmemleak false positive. The pointer to the counters is stored
|
|
|
|
|
* in the alloc_tag section of the module and cannot be directly accessed.
|
|
|
|
|
*/
|
|
|
|
|
kmemleak_ignore_percpu(tag->counters);
|
|
|
|
|
}
|
2025-05-21 09:06:02 -07:00
|
|
|
return 0;
|
2025-05-16 17:07:39 -07:00
|
|
|
}
|
|
|
|
|
|
2024-10-23 10:07:56 -07:00
|
|
|
static void replace_module(struct module *mod, struct module *new_mod)
|
|
|
|
|
{
|
|
|
|
|
MA_STATE(mas, &mod_area_mt, 0, module_tags.size);
|
|
|
|
|
struct module *val;
|
|
|
|
|
|
|
|
|
|
mas_lock(&mas);
|
|
|
|
|
mas_for_each(&mas, val, module_tags.size) {
|
|
|
|
|
if (val != mod)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
mas_store_gfp(&mas, new_mod, GFP_KERNEL);
|
|
|
|
|
break;
|
2024-03-21 09:36:35 -07:00
|
|
|
}
|
2024-10-23 10:07:56 -07:00
|
|
|
mas_unlock(&mas);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Set up the virtual area that module allocation tags are copied into.
 *
 * Maps MODULE_ALLOC_TAG_VMAP_SIZE bytes, allocates the zeroed page pointer
 * array covering the mapped area and records the area bounds in module_tags.
 *
 * Returns 0 on success or -ENOMEM when either step fails.
 */
static int __init alloc_mod_tags_mem(void)
{
	unsigned long base;
	size_t nr_slots;

	/* Map space to copy allocation tags */
	vm_module_tags = execmem_vmap(MODULE_ALLOC_TAG_VMAP_SIZE);
	if (!vm_module_tags) {
		pr_err("Failed to map %lu bytes for module allocation tags\n",
		       MODULE_ALLOC_TAG_VMAP_SIZE);
		module_tags.start_addr = 0;
		return -ENOMEM;
	}

	/* One struct page pointer per page of the mapped area. */
	nr_slots = get_vm_area_size(vm_module_tags) >> PAGE_SHIFT;
	vm_module_tags->pages = kmalloc_array(nr_slots, sizeof(struct page *),
					      GFP_KERNEL | __GFP_ZERO);
	if (!vm_module_tags->pages) {
		free_vm_area(vm_module_tags);
		return -ENOMEM;
	}

	base = (unsigned long)vm_module_tags->addr;
	module_tags.end_addr = base + MODULE_ALLOC_TAG_VMAP_SIZE;
	/* Ensure the base is alloc_tag aligned when required for indexing */
	module_tags.start_addr = alloc_tag_align(base);

	return 0;
}
|
2024-03-21 09:36:35 -07:00
|
|
|
|
2024-10-23 10:07:56 -07:00
|
|
|
/*
 * Tear down the module tag area: clear the recorded start address, release
 * the pages tracked in vm_module_tags, free the page pointer array and
 * finally free the virtual area itself.
 */
static void __init free_mod_tags_mem(void)
{
	release_pages_arg arg = {
		.pages = vm_module_tags->pages,
	};

	module_tags.start_addr = 0;

	release_pages(arg, vm_module_tags->nr_pages);
	kfree(vm_module_tags->pages);
	free_vm_area(vm_module_tags);
}
|
|
|
|
|
|
2024-10-23 10:07:56 -07:00
|
|
|
#else /* CONFIG_MODULES */
|
|
|
|
|
|
|
|
|
|
/* Stub for the other side of the CONFIG_MODULES conditional: always succeeds. */
static inline int alloc_mod_tags_mem(void)
{
	return 0;
}
|
|
|
|
|
/* Stub for the other side of the CONFIG_MODULES conditional: nothing to free. */
static inline void free_mod_tags_mem(void)
{
}
|
|
|
|
|
|
|
|
|
|
#endif /* CONFIG_MODULES */
|
|
|
|
|
|
2024-10-23 10:07:59 -07:00
|
|
|
/* See: Documentation/mm/allocation-profiling.rst */
|
2024-03-21 09:36:37 -07:00
|
|
|
static int __init setup_early_mem_profiling(char *str)
|
|
|
|
|
{
|
2024-10-23 10:07:59 -07:00
|
|
|
bool compressed = false;
|
2024-03-21 09:36:37 -07:00
|
|
|
bool enable;
|
|
|
|
|
|
|
|
|
|
if (!str || !str[0])
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
if (!strncmp(str, "never", 5)) {
|
|
|
|
|
enable = false;
|
|
|
|
|
mem_profiling_support = false;
|
2024-10-23 10:07:59 -07:00
|
|
|
pr_info("Memory allocation profiling is disabled!\n");
|
2024-03-21 09:36:37 -07:00
|
|
|
} else {
|
2024-10-23 10:07:59 -07:00
|
|
|
char *token = strsep(&str, ",");
|
|
|
|
|
|
|
|
|
|
if (kstrtobool(token, &enable))
|
|
|
|
|
return -EINVAL;
|
2024-03-21 09:36:37 -07:00
|
|
|
|
2024-10-23 10:07:59 -07:00
|
|
|
if (str) {
|
2024-03-21 09:36:37 -07:00
|
|
|
|
2024-10-23 10:07:59 -07:00
|
|
|
if (strcmp(str, "compressed"))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
compressed = true;
|
|
|
|
|
}
|
2024-03-21 09:36:37 -07:00
|
|
|
mem_profiling_support = true;
|
2024-10-23 10:07:59 -07:00
|
|
|
pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n",
|
2025-08-14 17:38:27 +08:00
|
|
|
compressed ? "with" : "without", str_on_off(enable));
|
2024-03-21 09:36:37 -07:00
|
|
|
}
|
|
|
|
|
|
2024-10-23 10:07:59 -07:00
|
|
|
if (enable != mem_alloc_profiling_enabled()) {
|
2024-03-21 09:36:37 -07:00
|
|
|
if (enable)
|
|
|
|
|
static_branch_enable(&mem_alloc_profiling_key);
|
|
|
|
|
else
|
|
|
|
|
static_branch_disable(&mem_alloc_profiling_key);
|
|
|
|
|
}
|
2024-10-23 10:07:59 -07:00
|
|
|
if (compressed != static_key_enabled(&mem_profiling_compressed)) {
|
|
|
|
|
if (compressed)
|
|
|
|
|
static_branch_enable(&mem_profiling_compressed);
|
|
|
|
|
else
|
|
|
|
|
static_branch_disable(&mem_profiling_compressed);
|
|
|
|
|
}
|
2024-03-21 09:36:37 -07:00
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
early_param("sysctl.vm.mem_profiling", setup_early_mem_profiling);
|
|
|
|
|
|
2024-03-21 09:36:36 -07:00
|
|
|
/*
 * .need callback of page_alloc_tagging_ops: returns false when the
 * mem_profiling_compressed key is enabled, otherwise the value of
 * mem_profiling_support.
 */
static __init bool need_page_alloc_tagging(void)
{
	return !static_key_enabled(&mem_profiling_compressed) &&
	       mem_profiling_support;
}
|
|
|
|
|
|
|
|
|
|
/* .init callback of page_alloc_tagging_ops; there is nothing to set up. */
static __init void init_page_alloc_tagging(void)
{
	/* intentionally empty */
}
|
|
|
|
|
|
|
|
|
|
struct page_ext_operations page_alloc_tagging_ops = {
|
|
|
|
|
.size = sizeof(union codetag_ref),
|
|
|
|
|
.need = need_page_alloc_tagging,
|
|
|
|
|
.init = init_page_alloc_tagging,
|
|
|
|
|
};
|
|
|
|
|
EXPORT_SYMBOL(page_alloc_tagging_ops);
|
|
|
|
|
|
2024-06-01 16:38:31 -07:00
|
|
|
#ifdef CONFIG_SYSCTL
|
2025-09-15 14:27:55 -07:00
|
|
|
/*
|
|
|
|
|
* Not using proc_do_static_key() directly to prevent enabling profiling
|
|
|
|
|
* after it was shut down.
|
|
|
|
|
*/
|
|
|
|
|
static int proc_mem_profiling_handler(const struct ctl_table *table, int write,
|
|
|
|
|
void *buffer, size_t *lenp, loff_t *ppos)
|
|
|
|
|
{
|
|
|
|
|
if (!mem_profiling_support && write)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
return proc_do_static_key(table, write, buffer, lenp, ppos);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2024-03-21 09:36:35 -07:00
|
|
|
static struct ctl_table memory_allocation_profiling_sysctls[] = {
|
|
|
|
|
{
|
|
|
|
|
.procname = "mem_profiling",
|
|
|
|
|
.data = &mem_alloc_profiling_key,
|
|
|
|
|
#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
|
|
|
|
|
.mode = 0444,
|
|
|
|
|
#else
|
|
|
|
|
.mode = 0644,
|
|
|
|
|
#endif
|
2025-09-15 14:27:55 -07:00
|
|
|
.proc_handler = proc_mem_profiling_handler,
|
2024-03-21 09:36:35 -07:00
|
|
|
},
|
|
|
|
|
};
|
|
|
|
|
|
2024-06-01 16:38:31 -07:00
|
|
|
/*
 * Register the memory allocation profiling sysctl table under "vm". The
 * entry is made read-only first when profiling is not supported.
 */
static void __init sysctl_init(void)
{
	struct ctl_table *entry = &memory_allocation_profiling_sysctls[0];

	if (!mem_profiling_support)
		entry->mode = 0444;

	register_sysctl_init("vm", memory_allocation_profiling_sysctls);
}
|
|
|
|
|
#else /* CONFIG_SYSCTL */
|
|
|
|
|
/* Without CONFIG_SYSCTL there is no sysctl table to register. */
static inline void sysctl_init(void)
{
}
|
|
|
|
|
#endif /* CONFIG_SYSCTL */
|
|
|
|
|
|
2024-03-21 09:36:35 -07:00
|
|
|
/*
 * Module-init entry point for memory allocation profiling.
 *
 * Registers the sysctl first, so it exists even when profiling is not
 * supported. With profiling supported it then creates the allocinfo proc
 * file, reserves the address space for module tags and registers the
 * alloc_tag codetag type. Each failure shuts profiling down and returns
 * the error. Returns 0 on success or when profiling is not supported.
 */
static int __init alloc_tag_init(void)
{
	const struct codetag_type_desc desc = {
		.section		= ALLOC_TAG_SECTION_NAME,
		.tag_size		= sizeof(struct alloc_tag),
#ifdef CONFIG_MODULES
		.needs_section_mem	= needs_section_mem,
		.alloc_section_mem	= reserve_module_tags,
		.free_section_mem	= release_module_tags,
		.module_load		= load_module,
		.module_replaced	= replace_module,
#endif
	};
	long reg_err;
	int res;

	sysctl_init();

	if (!mem_profiling_support) {
		pr_info("Memory allocation profiling is not supported!\n");
		return 0;
	}

	/*
	 * The per-open private data lets the codetag iterator stay alive
	 * between read() calls instead of being rebuilt and re-walked from
	 * the beginning on each one.
	 */
	if (!proc_create_seq_private(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op,
				     sizeof(struct allocinfo_private), NULL)) {
		pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME);
		shutdown_mem_profiling(false);
		return -ENOMEM;
	}

	res = alloc_mod_tags_mem();
	if (res) {
		pr_err("Failed to reserve address space for module tags, errno = %d\n", res);
		shutdown_mem_profiling(true);
		return res;
	}

	alloc_tag_cttype = codetag_register_type(&desc);
	if (!IS_ERR(alloc_tag_cttype))
		return 0;

	/* Registration failed: undo the module tag area set up above. */
	reg_err = PTR_ERR(alloc_tag_cttype);
	pr_err("Allocation tags registration failed, errno = %ld\n", reg_err);
	free_mod_tags_mem();
	shutdown_mem_profiling(true);
	return reg_err;
}
module_init(alloc_tag_init);
|