From: Huang Ying <ying.huang(a)intel.com>
ANBZ: #80
cherry-picked from
https://git.kernel.org/pub/scm/linux/kernel/git/vishal/tiering.git/commit/?…
In a memory tiering system with DRAM and PMEM, if the memory access
pattern of the workloads stays stable for a long time, then as the
autonuma based page placement optimization algorithm runs, the DRAM
pages become hotter and hotter, while the PMEM pages become colder
and colder.  Eventually, the cold DRAM pages being demoted are as hot
as the hot PMEM pages being promoted, so the just demoted DRAM pages
will be promoted again.  That is, pages thrash between DRAM and PMEM.
To reduce the thrashing in the above situation, this patch implements
a rate limit control algorithm based on a pgpromote_demoted counter
(which counts the just demoted pages that are promoted again).  In a
statistics interval, if the number of just demoted pages that are
promoted is more than 1/4 of the total number of demoted pages, we try
to reduce the rate limit to reduce the number of pages that are
demoted/promoted, as sketched below.
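In rough pseudo-C, the per-interval adjustment is (a simplified
restatement of the fair.c hunk below; nr_pdemoted and nr_demoted are
the deltas of the promote-after-demote and demote counters over the
interval):

	if (nr_pdemoted > nr_demoted / 4) {
		/* Thrashing: shrink the limit by ~10%, keep it >= 1 page */
		rate_limit = min(rate_limit * 9 / 10, rate_limit - 1);
		rate_limit = max(rate_limit, 1UL);
	} else {
		/* Little thrashing: grow by ~10%, capped at the sysctl */
		unsigned long ref_rl = sysctl_numa_balancing_rate_limit <<
				       (20 - PAGE_SHIFT);

		rate_limit = max(rate_limit * 11 / 10, rate_limit + 1);
		rate_limit = min(rate_limit, ref_rl);
	}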
TODO: Test results
Signed-off-by: "Huang, Ying" <ying.huang(a)intel.com>
Signed-off-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
---
include/linux/mmzone.h | 3 +++
include/trace/events/sched.h | 12 ++++++++----
kernel/sched/fair.c | 25 ++++++++++++++++++++++---
3 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6b7cc7e..b053928 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -769,6 +769,9 @@ struct hugepage_reclaim {
unsigned long numa_threshold_nr_candidate;
unsigned long numa_threshold_try;
unsigned long numa_threshold;
+ unsigned long numa_threshold_demoted;
+ unsigned long numa_threshold_pdemoted;
+ unsigned long numa_rate_limit;
#endif
/* Fields commonly accessed by the page reclaim scanner */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index d36b9a3..a965d5e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -55,24 +55,28 @@
TP_PROTO(int nid,
long nr_candidate,
- int threshold),
+ int threshold,
+ int ratelimit),
- TP_ARGS(nid, nr_candidate, threshold),
+ TP_ARGS(nid, nr_candidate, threshold, ratelimit),
TP_STRUCT__entry(
__field( int, nid )
__field( long, nr_candidate )
__field( int, threshold )
+ __field( int, ratelimit )
),
TP_fast_assign(
__entry->nid = nid;
__entry->nr_candidate = nr_candidate;
__entry->threshold = threshold;
+ __entry->ratelimit = ratelimit;
),
- TP_printk("nid=%d nr_candidate=%ld threshold=%d",
- __entry->nid, __entry->nr_candidate, __entry->threshold)
+ TP_printk("nid=%d nr_candidate=%ld threshold=%d ratelimit=%d",
+ __entry->nid, __entry->nr_candidate, __entry->threshold,
+ __entry->ratelimit)
);
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8caeb72..6bc2bed 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2937,7 +2937,7 @@ static void numa_migration_adjust_threshold(struct pglist_data *pgdat,
unsigned long ref_th)
{
unsigned long now = jiffies, last_th_ts, th_period;
- unsigned long th, oth;
+ unsigned long th, oth, nr_demoted, nr_pdemoted;
unsigned long last_nr_cand, nr_cand, ref_cand, diff_cand;
th_period = msecs_to_jiffies(sysctl_numa_balancing_scan_period_max);
@@ -2964,11 +2964,29 @@ static void numa_migration_adjust_threshold(struct pglist_data *pgdat,
th = max(th * 11 / 10, th + 1);
th = min(th, ref_th * 2);
}
+ nr_demoted = node_page_state(pgdat, PGDEMOTE_KSWAPD) +
+ node_page_state(pgdat, PGDEMOTE_DIRECT);
+ nr_pdemoted = node_page_state(pgdat, PGPROMOTE_DEMOTED);
+ if (nr_pdemoted - pgdat->numa_threshold_pdemoted >
+ (nr_demoted - pgdat->numa_threshold_demoted) / 4) {
+ rate_limit = min(rate_limit * 9 / 10, rate_limit - 1);
+ rate_limit = max(rate_limit, 1UL);
+ } else {
+ unsigned long ref_rl;
+
+ ref_rl = sysctl_numa_balancing_rate_limit << (20 - PAGE_SHIFT);
+ rate_limit = max(rate_limit * 11 / 10, rate_limit + 1);
+ rate_limit = min(rate_limit, ref_rl);
+ }
+ pgdat->numa_rate_limit = rate_limit;
pgdat->numa_threshold = th;
pgdat->numa_threshold_try =
node_page_state(pgdat, PGPROMOTE_TRY);
+ pgdat->numa_threshold_demoted = nr_demoted;
+ pgdat->numa_threshold_pdemoted = nr_pdemoted;
spin_unlock(&pgdat->numa_lock);
- trace_autonuma_threshold(pgdat->node_id, diff_cand, th);
+ trace_autonuma_threshold(pgdat->node_id, diff_cand, th,
+ pgdat->numa_rate_limit);
mod_node_page_state(pgdat, PROMOTE_THRESHOLD, th - oth);
}
}
@@ -2997,7 +3015,8 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
}
def_th = sysctl_numa_balancing_hot_threshold;
- rate_limit =
+ rate_limit = pgdat->numa_rate_limit;
+ rate_limit = rate_limit ? :
sysctl_numa_balancing_rate_limit << (20 - PAGE_SHIFT);
numa_migration_adjust_threshold(pgdat, rate_limit, def_th);
--
1.8.3.1