It was reported that the following panic may be triggered when
NUMA_BALANCING_MEMORY_TIERING is disabled at runtime:
BUG: unable to handle kernel paging request at ffffffff9670bae0
PGD 29a620c067 P4D 29a620c067 PUD 29a620d063 PMD 2f3c2ef063 PTE 800fffd6598f4062
Oops: 0000 [#1] SMP PTI
CPU: 14 PID: 518 Comm: systemd-journal Kdump: loaded Tainted: G E
4.19.91-001.ali4000_20210617_6287b9d5de_cbp.alios7.x86_64 #1
Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 8c24b4c 04/01/2014
RIP: 0010:should_numa_migrate_memory+0xc3/0x760
Code: ff 00 00 00 0f 84 89 03 00 00 41 0f b6 b5 f0 05 00 00 39 f1 0f 84 79 03 00 00 c1 f8
08 25 ff 01 00 00 48 8b 04 c5 20 37 16 96 <46> 3b 24 30 74 19 48 83 c4 38 31 c0 5b
5d 41 5c 41 5d 41 5e 41 5f
RSP: 0000:ffff9ff78d35bd08 EFLAGS: 00010206
RAX: ffffffff966ec000 RBX: ffffc96b10868980 RCX: 000000000000005c
RDX: 05169704c2022014 RSI: 0000000000000006 RDI: ffffc96b10868980
RBP: 0000000000000000 R08: ff80003fffffffff R09: ffff93c63ac01d10
R10: 0000000000000002 R11: 0000000000000000 R12: 0000000000000000
R13: ffff93c63af8c740 R14: 000000000001fae0 R15: 0000000000000001
FS: 00007f89d8602880(0000) GS:ffff93c641b80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffff9670bae0 CR3: 0000000f77c12006 CR4: 00000000003606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
mpol_misplaced+0x17e/0x200
do_numa_page+0x1e5/0x288
handle_mm_fault+0x91b/0x960
__do_page_fault+0x26b/0x4a0
do_page_fault+0x32/0x110
? async_page_fault+0x8/0x30
async_page_fault+0x1e/0x30
RIP: 0033:0x558e2f2e21e0
Code: 8b 83 c8 00 00 00 48 8b 48 70 48 c1 e9 04 48 85 c9 0f 84 9c 01 00 00 31 d2 48 89 e8
48 f7 f1 48 c1 e2 04 48 03 93 d0 00 00 00 <4c> 8b 3a 4
This happens because the page "cpupid" field is used to record the
CPU and PID when NUMA_BALANCING_MEMORY_TIERING is disabled, but is
used to record the NUMA balancing page scan time when
NUMA_BALANCING_MEMORY_TIERING is enabled. While
NUMA_BALANCING_MEMORY_TIERING is being disabled at runtime, the
"cpupid" value may therefore be invalid, which triggers the panic.
Fix this by checking the validity of "cpupid" before using it.
Signed-off-by: "Huang, Ying" <ying.huang(a)intel.com>
Reported-and-tested-by: zhong jiang <zhongjiang-ali(a)linux.alibaba.com>
---
include/linux/mm.h | 5 +++++
kernel/sched/fair.c | 20 +++++++++++++++++---
2 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a9ea778eafe0..327fe01b1ae1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1440,6 +1440,11 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
}
+static inline bool check_cpupid(int cpupid)
+{
+ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
+}
+
#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ca4ab033180..85a01906d796 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1597,6 +1597,14 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page
* page,
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ /*
+ * The cpupid may be invalid when NUMA_BALANCING_MEMORY_TIERING
+ * is disabled dynamically.
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(src_nid) && !check_cpupid(last_cpupid))
+ return false;
+
/*
* Allow first faults or private faults to migrate immediately early in
* the lifetime of a task. The magic number 4 is based on waiting for
@@ -2812,9 +2820,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int
flags)
if (!p->mm)
return;
- /* Numa faults statistics are unnecessary for the slow memory node */
- if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
- !node_is_toptier(mem_node))
+ /*
+ * NUMA faults statistics are unnecessary for the slow memory node.
+ *
+ * And, the cpupid may be invalid when NUMA_BALANCING_MEMORY_TIERING
+ * is disabled dynamically.
+ */
+ if (!node_is_toptier(mem_node) &&
+ (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
+ !check_cpupid(last_cpupid)))
return;
/* Allocate buffer to track faults on a per-node basis */
--
2.30.2
显示某日回复