On 2022/9/1 9:46 上午, Feng Tang wrote:
Hi Zhong Jiang,
Thanks for the review!
On Wed, Aug 31, 2022 at 07:20:21PM +0800, zhong jiang wrote:
On 2022/8/31 3:31 下午, Huaisheng Ye wrote:
From: Feng Tang <feng.tang(a)intel.com>
When a platform has PMEM nodes, those nodes are mostly pure memory nodes
without any CPU attached. To avoid performance/latency sensitive
kernel allocations on these nodes, make all cpu-less memory nodes
be enabled as nodes with a movable zone only.
And when users want such a node to be a normal node, appending "cpuless_node_normal"
to the kernel cmdline parameters can achieve that.
Signed-off-by: Feng Tang <feng.tang(a)intel.com>
Signed-off-by: Huaisheng Ye <huaisheng.ye(a)intel.com>
---
arch/x86/mm/srat.c | 5 +++++
drivers/base/node.c | 2 ++
include/linux/nodemask.h | 1 +
mm/page_alloc.c | 30 ++++++++++++++++++++++++++++++
4 files changed, 38 insertions(+)
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index dac07e4f5834..dcb568067d79 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -60,6 +60,9 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity
*pa)
node_set(node, numa_nodes_parsed);
printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
pxm, apic_id, node);
+
+ if (!node_state(node, N_POSSIBLE_CPU))
+ node_set_state(node, N_POSSIBLE_CPU);
}
/* Callback for Proximity Domain -> LAPIC mapping */
@@ -101,6 +104,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity
*pa)
node_set(node, numa_nodes_parsed);
printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
pxm, apic_id, node);
+ if (!node_state(node, N_POSSIBLE_CPU))
+ node_set_state(node, N_POSSIBLE_CPU);
}
IIUC, pmem memory is also stored in the SRAT table, so the above change seems to
fail to work because pmem memory has been marked as N_POSSIBLE_CPU.
A PMEM node will not be set in N_POSSIBLE_CPU; the above two functions
are only for parsing CPU entries with interrupt controllers.
IIUC, you are referring to the NUMA node info maintained by architectural
code (say arch/x86): node_possible_map / numa_nodes_parsed, where the PMEM
node will be set.
And the node map here, N_POSSIBLE_CPU, is from another set mainly for
mm/ code, and is used later in the memory init part.
This patch has been tested on QEMU and RE7P machines.
Thanks for your clarification.
Thanks,
Feng
> Am I missing something?
>
> Thanks,
>> int __init x86_acpi_numa_init(void)
>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>> index 619fb1e38dd2..39ee52c69609 100644
>> --- a/drivers/base/node.c
>> +++ b/drivers/base/node.c
>> @@ -990,6 +990,7 @@ static struct node_attr node_state_attr[] = {
>> #endif
>> [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
>> [N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
>> + [N_POSSIBLE_CPU] = _NODE_ATTR(has_possible_cpu, N_POSSIBLE_CPU),
>> };
>> static struct attribute *node_state_attrs[] = {
>> @@ -1001,6 +1002,7 @@ static struct attribute *node_state_attrs[] = {
>> #endif
>> &node_state_attr[N_MEMORY].attr.attr,
>> &node_state_attr[N_CPU].attr.attr,
>> + &node_state_attr[N_POSSIBLE_CPU].attr.attr,
>> NULL
>> };
>> diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
>> index 5a30ad594ccc..ad3100b06938 100644
>> --- a/include/linux/nodemask.h
>> +++ b/include/linux/nodemask.h
>> @@ -399,6 +399,7 @@ enum node_states {
>> #endif
>> N_MEMORY, /* The node has memory(regular, high, movable) */
>> N_CPU, /* The node has one or more cpus */
>> + N_POSSIBLE_CPU, /* possible nodes that have cpu attached, only used in
early boot phase */
>> NR_NODE_STATES
>> };
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 8cfce927fb9b..9570ee499c12 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -294,6 +294,7 @@ static unsigned long required_kernelcore_percent
__initdata;
>> static unsigned long required_movablecore __initdata;
>> static unsigned long required_movablecore_percent __initdata;
>> static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
>> +static bool enable_cpuless_memnode_normal_node __initdata;
>> static bool mirrored_kernelcore __meminitdata;
>> /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken
from */
>> @@ -7163,6 +7164,24 @@ static void __init
find_zone_movable_pfns_for_nodes(void)
>> required_movablecore = (totalpages * 100 * required_movablecore_percent) /
>> 10000UL;
>> + if (!required_kernelcore && !required_movablecore &&
>> + !nodes_empty(node_states[N_POSSIBLE_CPU]) &&
>> + !enable_cpuless_memnode_normal_node) {
>> +
>> + /* Put memory from cpu-less nodes into movable zones */
>> + for_each_memblock(memory, r) {
>> + nid = memblock_get_region_node(r);
>> +
>> + if (node_isset(nid, node_states[N_POSSIBLE_CPU]))
>> + continue;
>> +
>> + usable_startpfn = PFN_DOWN(r->base);
>> + zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
>> + min(usable_startpfn, zone_movable_pfn[nid]) :
>> + usable_startpfn;
>> + }
>> + }
>> +
>> /*
>> * If movablecore= was specified, calculate what size of
>> * kernelcore that corresponds so that memory usable for
>> @@ -7457,8 +7476,19 @@ static int __init cmdline_parse_movablecore(char *p)
>> &required_movablecore_percent);
>> }
>> +/*
>> + * cpuless memory nodes will be enabled to movable node by default,
>> + * add this cmdline to make it be enabled as a normal node
>> + */
>> +static int __init cmdline_parse_cpuless_memnode(char *str)
>> +{
>> + enable_cpuless_memnode_normal_node = true;
>> + return 0;
>> +}
>> +
>> early_param("kernelcore", cmdline_parse_kernelcore);
>> early_param("movablecore", cmdline_parse_movablecore);
>> +early_param("cpuless_node_normal", cmdline_parse_cpuless_memnode);
>> #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */