From: Huang Ying <ying.huang@intel.com>
ANBZ: #80
cherry-picked from
https://git.kernel.org/pub/scm/linux/kernel/git/vishal/tiering.git/commit/?…
In a memory tiering system, the promotion threshold should be adjusted as soon
as possible so that it adapts quickly to changes in the workload's memory
access pattern.
In the current implementation, the memory tiering promotion threshold is
adjusted once per statistics period (60s by default) to keep the number of
pages that pass the promotion threshold less than or equal to the ratelimit per
statistics period. But if the number of pages that pass the promotion threshold
has already exceeded the ratelimit in the middle of a statistics period, we can
decrease the promotion threshold immediately instead of waiting for the period
to end. This accelerates the promotion threshold adjustment considerably.
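The core idea, stripped of the pgdat details and of the numa_lock that the real
code uses to serialize concurrent adjusters, looks roughly like the sketch
below. struct tier_stats and adjust_threshold() are illustrative names only,
not part of this patch; the actual logic is numa_migration_adjust_threshold()
in the diff:

/*
 * Minimal sketch of the early threshold adjustment, NOT the exact kernel
 * code.  All names here are illustrative; the real implementation is
 * numa_migration_adjust_threshold(), which also takes pgdat->numa_lock
 * (omitted here) to serialize concurrent adjusters.
 */
struct tier_stats {
	unsigned long last_adjust_ts;	/* time of the last adjustment */
	unsigned long last_nr_cand;	/* candidate pages at that time */
	unsigned long threshold;	/* current promotion threshold */
};

static void adjust_threshold(struct tier_stats *ts, unsigned long now,
			     unsigned long nr_cand, unsigned long ref_cand,
			     unsigned long period, unsigned long ref_th,
			     unsigned long unit_th)
{
	unsigned long diff_cand = nr_cand - ts->last_nr_cand;

	/*
	 * Adjust when the statistics period has expired, OR immediately
	 * once the candidate pages in this period already exceed the
	 * ratelimit (with a 10% margin), instead of waiting for the
	 * period to end.
	 */
	if (now <= ts->last_adjust_ts + period &&
	    diff_cand <= ref_cand * 11 / 10)
		return;

	ts->last_adjust_ts = now;
	ts->last_nr_cand = nr_cand;

	if (diff_cand > ref_cand * 11 / 10)	/* too many: tighten */
		ts->threshold = ts->threshold > 2 * unit_th ?
				ts->threshold - unit_th : unit_th;
	else if (diff_cand < ref_cand * 9 / 10)	/* too few: relax */
		ts->threshold = ts->threshold + unit_th < ref_th ?
				ts->threshold + unit_th : ref_th;
}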
In a test with the pmbench memory accessing benchmark on a 2-socket server with
Optane DCPMM, the time needed to adjust the threshold to the target value drops
from about 1000s to about 200s. The pmbench score increases by 3.8% with 11.8%
fewer pages promoted.
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
include/linux/mmzone.h | 1 +
kernel/sched/fair.c | 23 +++++++++++++++--------
mm/page_alloc.c | 3 +++
3 files changed, 19 insertions(+), 8 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9c2363f..a2b6636 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -760,6 +760,7 @@ struct hugepage_reclaim {
#endif
#ifdef CONFIG_NUMA_BALANCING
+ spinlock_t numa_lock;
unsigned long numa_ts;
unsigned long numa_nr_candidate;
unsigned long numa_threshold_ts;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a16a1a9..5806316 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2935,16 +2935,23 @@ static void numa_migration_adjust_threshold(struct pglist_data *pgdat,
{
unsigned long now = jiffies, last_th_ts, th_period;
unsigned long unit_th, th, oth;
- unsigned long nr_cand, ref_cand, diff_cand;
+ unsigned long last_nr_cand, nr_cand, ref_cand, diff_cand;
th_period = msecs_to_jiffies(sysctl_numa_balancing_scan_period_max);
last_th_ts = pgdat->numa_threshold_ts;
- if (now > last_th_ts + th_period &&
- cmpxchg(&pgdat->numa_threshold_ts, last_th_ts, now) == last_th_ts) {
- ref_cand = rate_limit *
- sysctl_numa_balancing_scan_period_max / 1000;
- nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
- diff_cand = nr_cand - pgdat->numa_threshold_nr_candidate;
+ ref_cand = rate_limit * sysctl_numa_balancing_scan_period_max / 1000;
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ last_nr_cand = pgdat->numa_threshold_nr_candidate;
+ diff_cand = nr_cand - last_nr_cand;
+ if ((now > last_th_ts + th_period || diff_cand > ref_cand * 11 / 10)) {
+ spin_lock(&pgdat->numa_lock);
+ if (pgdat->numa_threshold_ts != last_th_ts ||
+ pgdat->numa_threshold_nr_candidate != last_nr_cand) {
+ spin_unlock(&pgdat->numa_lock);
+ return;
+ }
+ pgdat->numa_threshold_ts = now;
+ pgdat->numa_threshold_nr_candidate = nr_cand;
unit_th = ref_th / NUMA_MIGRATION_ADJUST_STEPS;
oth = pgdat->numa_threshold;
th = oth ? : ref_th;
@@ -2952,8 +2959,8 @@ static void numa_migration_adjust_threshold(struct pglist_data *pgdat,
th = max(th - unit_th, unit_th);
else if (diff_cand < ref_cand * 9 / 10)
th = min(th + unit_th, ref_th);
- pgdat->numa_threshold_nr_candidate = nr_cand;
pgdat->numa_threshold = th;
+ spin_unlock(&pgdat->numa_lock);
trace_autonuma_threshold(pgdat->node_id, diff_cand, th);
mod_node_page_state(pgdat, PROMOTE_THRESHOLD, th - oth);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 912fd55..8cfce92 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6700,6 +6700,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
pgdat_page_ext_init(pgdat);
spin_lock_init(&pgdat->lru_lock);
lruvec_init(&pgdat->__lruvec);
+#ifdef CONFIG_NUMA_BALANCING
+ spin_lock_init(&pgdat->numa_lock);
+#endif
}
static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
--
1.8.3.1