Baolin Wang <baolin.wang@linux.alibaba.com> writes:

> On 2/24/2022 9:26 AM, Huang, Ying wrote:
>> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>>
>>> On 2/23/2022 4:44 PM, Huang, Ying wrote:
>>>> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>>>>
>>>>> On 2/23/2022 3:51 PM, Huang, Ying wrote:
>>>>>> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>>>>>>
>>>>>>> ANBZ: #80
>>>>>>>
>>>>>>> Add a page promotion throttle statistic, which can be used to check
>>>>>>> how many cold pages were trying to be promoted to DRAM, and to help
>>>>>>> tune the latency threshold.
>>>>>> Can this be calculated via the following formula?
>>>>>> numa_hint_faults - pgpromote_candidate
>>>>>
>>>>> They are not the same, since numa_hint_faults does not contain file
>>>>> page promotion statistics, while should_numa_migrate_memory() will
>>>>> check the file pages' latency before promotion. Here are some
>>>>> statistics I observed:
>>>>>
>>>>> pgpromote_candidate 3890100
>>>>> pgpromote_cold_throttle 50342001
>>>>> numa_hint_faults 35341839
>>>> Then how about adding a counter for the unmapped page accesses we
>>>> checked?
>>>> And I think these kinds of counters are for debugging only, at least
>>>> for now. Should we merge them formally?
>>>> Do you have an example of how to use this new counter or a similar
>>>> one?
>>>
>>> We can add a counter for unmapped pages, but it would still be less
>>> clear than the cold throttling counter. Please also consider that
>>> numa_hint_faults will contain local faults, and some page promotion
>>> counts if the DRAM is free enough.
>>> I think the cold throttling counter is not only for debugging, but
>>> also for making decisions in the future. We can know how many cold
>>> pages are accessed, and we can measure the cold and hot memory
>>> distribution for this workload to decide whether the workload is
>>> suitable for tiered memory. So a clear and accurate cold throttling
>>> counter will be helpful. What do you think?
>> If our target is the page temperature distribution, then a histogram
>> may be even better. I have implemented one before. The histogram will
>> count the hint page fault number within each access latency range. And
>> we can output and clear the histogram every
>> sysctl_numa_balancing_scan_period_max.
>
> OK, that's great, do you have a link to share? Thanks.
The patch was written about 2 years ago, so it cannot be used now. For
the histogram only, it can be simplified greatly. You can work on it if
you have interest. The patch is pasted below.
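
For reference, here is a minimal userspace sketch of just that
histogram-and-threshold core. The bucket count and the "hottest 20%"
target follow the patch below; the sample data and the names
record_latency() and pick_threshold() are purely illustrative and are
not part of the patch:

#include <stdio.h>

#define NR_BUCKETS	16

struct latency_histogram {
	long nr;
	long buckets[NR_BUCKETS];
};

/* Map a fault latency to a bucket and account nr pages there. */
static void record_latency(struct latency_histogram *h, long latency,
			   long max_latency, long nr)
{
	long i = latency * NR_BUCKETS / max_latency;

	if (i > NR_BUCKETS - 1)
		i = NR_BUCKETS - 1;
	h->nr += nr;
	h->buckets[i] += nr;
}

/* Smallest bucket upper bound whose cumulative count exceeds "target". */
static long pick_threshold(struct latency_histogram *h, long target,
			   long max_latency)
{
	long acc = 0;
	int i;

	for (i = 0; i < NR_BUCKETS; i++) {
		acc += h->buckets[i];
		if (acc > target)
			break;
	}
	if (i > NR_BUCKETS - 1)
		i = NR_BUCKETS - 1;
	return (i + 1) * max_latency / NR_BUCKETS;
}

int main(void)
{
	struct latency_histogram h = { 0 };
	long max_latency = 1000;	/* the latency unit is arbitrary here */
	long latency;

	/* Synthetic samples: many hot (low latency) pages, a few cold ones. */
	for (latency = 10; latency < 1000; latency += 10)
		record_latency(&h, latency, max_latency,
			       latency < 100 ? 50 : 5);

	/* Pick the threshold that covers the hottest ~20% of sampled pages. */
	printf("threshold: %ld\n", pick_threshold(&h, h.nr / 5, max_latency));
	return 0;
}

Building this with gcc and running it prints the smallest bucket upper
bound that covers the hottest fifth of the sampled pages; the kernel
patch does the same per node in numa_histogram_latency().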
Best Regards,
Huang, Ying
-----------------------------------8<-----------------------------------------
From 7c5ddada5f3608e7ce3405eaf3b8028dcf5094bb Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 12 Aug 2019 10:51:06 +0800
Subject: [PATCH] autonuma, memory tiering: Adjust hot threshold automatically via latency histogram

It isn't easy for the administrator to determine the hot threshold.
So in this patch, a method to adjust the hot threshold automatically
is implemented.

The hint page fault latency of the pages in a slow memory node is
recorded in a histogram. Then the max latency of the N hottest pages
can be calculated with the histogram, and it will be used as the
threshold to promote hot pages from the slow memory node to the fast
memory node. Here "N" is 20% of all migratable pages in the node, or
the rate limit number for the histogram recording interval.

The hint page fault may not occur for really cold pages within a full
NUMA balancing scanning period. This could be detected during
scanning, and the hint page fault latency of these pages could be
considered as the maximum.

The sysctl knob kernel.numa_balancing_hot_threshold_ms becomes the
initial value and max value of the hot threshold.
Signed-off-by: "Huang, Ying" <ying.huang(a)intel.com>
Suggested-by: Fengguang Wu <fengguang.wu(a)intel.com>
---
 include/linux/mm_types.h             |  11 ++
 include/linux/mmzone.h               |   3 +-
 include/linux/sched/mm.h             |  12 ++
 include/linux/sched/numa_balancing.h |   7 +
 kernel/fork.c                        |   5 +
 kernel/sched/fair.c                  | 189 ++++++++++++++++++++-------
 mm/huge_memory.c                     |   9 +-
 mm/mprotect.c                        |  15 ++-
 mm/page_alloc.c                      |  11 ++
 9 files changed, 213 insertions(+), 49 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 59e2151734ab..fa5a580040bf 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -350,6 +350,15 @@ struct core_state {
struct completion startup;
};
+#define NUMA_LATENCY_HISTOGRAM_BUCKETS 16
+
+struct numa_latency_histogram
+{
+ int generation;
+ atomic_long_t nr;
+ atomic_long_t buckets[NUMA_LATENCY_HISTOGRAM_BUCKETS];
+};
+
struct kioctx_table;
struct mm_struct {
struct {
@@ -489,6 +498,8 @@ struct mm_struct {
int numa_scan_idx;
unsigned long numa_scan_jiffies[NUMA_SCAN_NR_HIST];
unsigned long numa_scan_starts[NUMA_SCAN_NR_HIST];
+
+ struct numa_latency_histogram *numa_latency_histograms;
#endif
/*
* An operation with batched TLB flushing is going on. Anything
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 203a19391f5d..d71c451f3250 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -777,8 +777,9 @@ typedef struct pglist_data {
unsigned long autonuma_jiffies;
unsigned long autonuma_try_migrate;
unsigned long autonuma_threshold_jiffies;
- unsigned long autonuma_threshold_try_migrate;
unsigned long autonuma_threshold;
+ spinlock_t autonuma_histogram_lock;
+ struct numa_latency_histogram autonuma_latency_histogram;
#endif
/* Fields commonly accessed by the page reclaim scanner */
struct lruvec lruvec;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 4a7944078cc3..5b9e32aed357 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -388,4 +388,16 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
}
#endif
+#ifdef CONFIG_NUMA_BALANCING
+int mm_init_hmem(struct mm_struct *mm);
+void mm_free_hmem(struct mm_struct *mm);
+#else
+static inline int mm_init_hmem(struct mm_struct *mm)
+{
+ return 0;
+}
+
+static inline void mm_free_hmem(struct mm_struct *mm) {}
+#endif
+
#endif /* _LINUX_SCHED_MM_H */
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 65dc8c6e8377..4c766aa19574 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -25,6 +25,8 @@ extern void task_numa_free(struct task_struct *p);
extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
int src_nid, int dst_cpu,
unsigned long addr, int flags);
+extern void numa_record_latency(struct mm_struct *mm, int nid, long latency,
+ int nr);
#else
static inline void task_numa_fault(int last_node, int node, int pages,
int flags)
@@ -47,6 +49,11 @@ static inline bool should_numa_migrate_memory(struct task_struct *p,
{
return true;
}
+
+static inline void numa_record_latency(struct mm_struct *mm, int nid,
+ long latency, int nr)
+{
+}
#endif
#endif /* _LINUX_SCHED_NUMA_BALANCING_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index fe83343da24b..312731179390 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1021,6 +1021,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->def_flags = 0;
}
+ if (mm_init_hmem(mm))
+ goto fail_nohmem;
+
if (mm_alloc_pgd(mm))
goto fail_nopgd;
@@ -1033,6 +1036,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
+ mm_free_hmem(mm);
+fail_nohmem:
free_mm(mm);
return NULL;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c85cf52149e0..ad9f9f596087 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1411,6 +1411,22 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
+int mm_init_hmem(struct mm_struct *mm)
+{
+ mm->numa_latency_histograms =
+ kvzalloc(nr_node_ids * sizeof(struct numa_latency_histogram),
+ GFP_KERNEL);
+ if (!mm->numa_latency_histograms)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mm_free_hmem(struct mm_struct *mm)
+{
+ kvfree(mm->numa_latency_histograms);
+}
+
static bool pgdat_free_space_enough(struct pglist_data *pgdat)
{
int z;
@@ -1468,6 +1484,19 @@ static long numa_hint_fault_latency(struct task_struct *p, unsigned long addr)
return latency;
}
+void numa_record_latency(struct mm_struct *mm, int nid, long latency, int nr)
+{
+ long max_latency;
+ struct numa_latency_histogram *hist;
+
+ max_latency = msecs_to_jiffies(sysctl_numa_balancing_hot_threshold);
+ latency = latency * NUMA_LATENCY_HISTOGRAM_BUCKETS / max_latency;
+ latency = min_t(long, latency, NUMA_LATENCY_HISTOGRAM_BUCKETS - 1);
+ hist = &mm->numa_latency_histograms[nid];
+ atomic_long_add(nr, &hist->nr);
+ atomic_long_add(nr, &hist->buckets[latency]);
+}
+
static bool numa_migration_check_rate_limit(struct pglist_data *pgdat,
unsigned long rate_limit, int nr)
{
@@ -1486,43 +1515,6 @@ static bool numa_migration_check_rate_limit(struct pglist_data *pgdat,
return true;
}
-#define NUMA_MIGRATION_ADJUST_STEPS 16
-
-static void numa_migration_adjust_threshold(struct pglist_data *pgdat,
- unsigned long rate_limit,
- unsigned long ref_threshold)
-{
- unsigned long now = jiffies, last_threshold_jiffies;
- unsigned long unit_threshold, threshold;
- unsigned long try_migrate, ref_try_migrate, mdiff;
-
- last_threshold_jiffies = pgdat->autonuma_threshold_jiffies;
- if (now > last_threshold_jiffies +
- msecs_to_jiffies(sysctl_numa_balancing_scan_period_max) &&
- cmpxchg(&pgdat->autonuma_threshold_jiffies,
- last_threshold_jiffies, now) == last_threshold_jiffies) {
-
- ref_try_migrate = rate_limit *
- sysctl_numa_balancing_scan_period_max / 1000;
- try_migrate = node_page_state(pgdat, NUMA_TRY_MIGRATE);
- mdiff = try_migrate - pgdat->autonuma_threshold_try_migrate;
- unit_threshold = ref_threshold / NUMA_MIGRATION_ADJUST_STEPS;
- threshold = pgdat->autonuma_threshold;
- if (!threshold)
- threshold = ref_threshold;
- if (mdiff > ref_try_migrate * 11 / 10)
- threshold = max(threshold - unit_threshold,
- unit_threshold);
- else if (mdiff < ref_try_migrate * 9 / 10)
- threshold = min(threshold + unit_threshold,
- ref_threshold);
- pgdat->autonuma_threshold_try_migrate = try_migrate;
- pgdat->autonuma_threshold = threshold;
- trace_autonuma_threshold(pgdat->node_id, mdiff, ref_try_migrate,
- threshold, 0, ref_threshold);
- }
-}
-
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu, unsigned long addr,
int flags)
@@ -1537,9 +1529,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
*/
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
next_promotion_node(src_nid) != -1) {
- struct pglist_data *pgdat;
+ struct pglist_data *pgdat, *src_pgdat;
unsigned long rate_limit, latency, threshold, def_threshold;
+ latency = numa_hint_fault_latency(p, addr);
+ numa_record_latency(p->mm, src_nid, latency, hpage_nr_pages(page));
+
pgdat = NODE_DATA(dst_nid);
if (pgdat_free_space_enough(pgdat))
return true;
@@ -1550,19 +1545,16 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
def_threshold = msecs_to_jiffies(
sysctl_numa_balancing_hot_threshold);
- rate_limit = sysctl_numa_balancing_rate_limit <<
- (20 - PAGE_SHIFT);
- numa_migration_adjust_threshold(pgdat, rate_limit,
- def_threshold);
-
- threshold = pgdat->autonuma_threshold;
+ src_pgdat = NODE_DATA(src_nid);
+ threshold = src_pgdat->autonuma_threshold;
threshold = threshold ? : def_threshold;
if (flags & TNF_WRITE)
threshold *= 2;
- latency = numa_hint_fault_latency(p, addr);
if (latency > threshold)
return false;
+ rate_limit = sysctl_numa_balancing_rate_limit <<
+ (20 - PAGE_SHIFT);
return numa_migration_check_rate_limit(pgdat, rate_limit,
hpage_nr_pages(page));
}
@@ -2597,6 +2589,111 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
p->numa_faults_locality[local] += pages;
}
+static long numa_histogram_latency(struct numa_latency_histogram *histogram,
+ long nr)
+{
+ int i;
+ long acc = 0, max_latency;
+
+ for (i = 0; i < NUMA_LATENCY_HISTOGRAM_BUCKETS; i++) {
+ acc += atomic_long_read(&histogram->buckets[i]);
+ if (acc > nr)
+ break;
+ }
+ i = min_t(int, i, NUMA_LATENCY_HISTOGRAM_BUCKETS - 1);
+ max_latency = msecs_to_jiffies(sysctl_numa_balancing_hot_threshold);
+ return (i + 1) * max_latency / NUMA_LATENCY_HISTOGRAM_BUCKETS;
+}
+
+static void numa_migration_update_threshold(struct pglist_data *pgdat,
+ unsigned long max_interval)
+{
+ int i;
+ long nr, nr_page;
+ unsigned long now = jiffies, rate_limit, interval;
+ struct numa_latency_histogram *node_histogram;
+
+ node_histogram = &pgdat->autonuma_latency_histogram;
+ nr = atomic_long_read(&node_histogram->nr);
+ nr_page = node_page_state(pgdat, NR_ANON_MAPPED) +
+ node_page_state(pgdat, NR_FILE_MAPPED);
+
+ if (nr_page <= 0)
+ return;
+
+ if (nr < nr_page * 4 / 5 &&
+ now <= READ_ONCE(pgdat->autonuma_threshold_jiffies) + max_interval)
+ return;
+
+ /* Too few data to update threshold */
+ if (nr < max(nr_page / 8, 10240L))
+ goto skip;
+
+ /*
+ * The hottest 20% of pages are promotion candidates.
+ * The rate limit is considered too.
+ */
+ interval = now - pgdat->autonuma_threshold_jiffies;
+ rate_limit = (sysctl_numa_balancing_rate_limit << (20 - PAGE_SHIFT))
+ * interval / HZ;
+ pgdat->autonuma_threshold = numa_histogram_latency(node_histogram,
+ min_t(long, nr / 5, rate_limit));
+ atomic_long_set(&node_histogram->nr, 0);
+ for (i = 0; i < NUMA_LATENCY_HISTOGRAM_BUCKETS; i++)
+ atomic_long_set(&node_histogram->buckets[i], 0);
+skip:
+ node_histogram->generation++;
+ WRITE_ONCE(pgdat->autonuma_threshold_jiffies, now);
+}
+
+static void aggregate_numa_latency_histogram(struct mm_struct *mm)
+{
+ int nid, i;
+ long nr;
+ unsigned long now = jiffies, max_interval;
+ struct pglist_data *pgdat;
+ struct numa_latency_histogram *mm_histogram, *node_histogram;
+
+ for_each_node(nid) {
+ mm_histogram = &mm->numa_latency_histograms[nid];
+ nr = atomic_long_read(&mm_histogram->nr);
+ if (!nr)
+ continue;
+ max_interval = msecs_to_jiffies(
+ sysctl_numa_balancing_scan_period_max);
+ pgdat = NODE_DATA(nid);
+ node_histogram = &pgdat->autonuma_latency_histogram;
+ atomic_long_set(&mm_histogram->nr, 0);
+ /*
+ * The histogram of mm has been aggregated into that
+ * of node in this period, skip.
+ */
+ if (node_histogram->generation == mm_histogram->generation) {
+ for (i = 0; i < NUMA_LATENCY_HISTOGRAM_BUCKETS; i++)
+ atomic_long_set(&mm_histogram->buckets[i], 0);
+
+ if (now < max_interval + \
+ READ_ONCE(pgdat->autonuma_threshold_jiffies))
+ continue;
+ spin_lock(&pgdat->autonuma_histogram_lock);
+ goto update_threshold;
+ }
+ spin_lock(&pgdat->autonuma_histogram_lock);
+ atomic_long_add(nr, &node_histogram->nr);
+ for (i = 0; i < NUMA_LATENCY_HISTOGRAM_BUCKETS; i++) {
+ atomic_long_add(
+ atomic_long_read(&mm_histogram->buckets[i]),
+ &node_histogram->buckets[i]);
+ atomic_long_set(&mm_histogram->buckets[i], 0);
+ }
+ mm_histogram->generation = node_histogram->generation;
+
+update_threshold:
+ numa_migration_update_threshold(pgdat, max_interval);
+ spin_unlock(&pgdat->autonuma_histogram_lock);
+ }
+}
+
static void reset_ptenuma_scan(struct task_struct *p)
{
/*
@@ -2609,6 +2706,8 @@ static void reset_ptenuma_scan(struct task_struct *p)
*/
WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
WRITE_ONCE(p->mm->numa_scan_offset, 0);
+
+ aggregate_numa_latency_histogram(p->mm);
}
/*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3b156759a963..5b5fe97e6779 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1952,6 +1952,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (is_huge_zero_pmd(*pmd))
goto unlock;
+ page = pmd_page(*pmd);
+
if (pmd_protnone(*pmd)) {
if (!(sysctl_numa_balancing_mode &
NUMA_BALANCING_MEMORY_TIERING))
@@ -1963,10 +1965,15 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (pmd_young(*pmd))
set_pmd_at(mm, addr, pmd, pmd_mkold(*pmd));
+ /*
+ * The page hasn't been accessed in the last scan
+ * period, so the access latency is the max.
+ */
+ numa_record_latency(mm, page_to_nid(page), INT_MAX,
+ HPAGE_PMD_NR);
goto unlock;
}
- page = pmd_page(*pmd);
/*
* Skip if normal numa balancing is disabled and no
* faster memory node to promote to
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4f209a7ffa50..4630c2e18440 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,6 +29,7 @@
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/numa_balancing.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
@@ -82,6 +83,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page;
int nid;
+ page = vm_normal_page(vma, addr, oldpte);
+ if (!page)
+ continue;
+
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte)) {
if (!(sysctl_numa_balancing_mode &
@@ -96,11 +101,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (pte_young(oldpte))
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(oldpte));
+ /*
+ * The page hasn't been accessed in the
+ * last scan period, so the access
+ * latency is the max.
+ */
+ numa_record_latency(vma->vm_mm,
+ page_to_nid(page), INT_MAX, 1);
continue;
}
- page = vm_normal_page(vma, addr, oldpte);
- if (!page || PageKsm(page))
+ if (PageKsm(page))
continue;
/* Also skip shared copy-on-write pages */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ebf33bc3db23..a6e0d20da832 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6600,12 +6600,23 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat)
static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
#endif
+#ifdef CONFIG_NUMA_BALANCING
+static void pgdat_init_autonuma(struct pglist_data *pgdat)
+{
+ spin_lock_init(&pgdat->autonuma_histogram_lock);
+ pgdat->autonuma_latency_histogram.generation = 1;
+}
+#else
+static void pgdat_init_autonuma(struct pglist_data *pgdat) {}
+#endif
+
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
pgdat_resize_init(pgdat);
pgdat_init_split_queue(pgdat);
pgdat_init_kcompactd(pgdat);
+ pgdat_init_autonuma(pgdat);
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
--
2.30.2
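
As a worked example of the threshold selection in
numa_migration_update_threshold() above, with all numbers made up for
illustration:

  kernel.numa_balancing_hot_threshold_ms = 1000, 16 buckets
    -> each bucket covers 1000 ms / 16 = 62.5 ms of hint fault latency
  nr recorded in the node histogram for the interval = 1,000,000 pages
  rate limit for the interval = 150,000 pages
  promotion target = min(nr / 5, rate_limit)
                   = min(200,000, 150,000) = 150,000 pages

The new threshold is the upper bound of the first bucket at which the
cumulative count exceeds 150,000 pages, e.g. (2 + 1) * 62.5 ms =
187.5 ms if that happens in the third bucket (index 2).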