From: Huang Ying <ying.huang(a)intel.com>
ANBZ: #80
cherry-picked from
https://git.kernel.org/pub/scm/linux/kernel/git/vishal/tiering.git/commit/?…
In AutoNUMA, the pages of the processes are scanned linearly. Because of
spatial locality, the distribution of the pages over time will not be even.
But now, the ratelimit is enforced per-second. That is, if the number of the
pages that pass the threshold in the last second has exceeded the ratelimit, no
pages will be promoted until the next second. The end result is that in the
busy second, the ratelimit is too small to promote all hot pages, while in the
idle second, not enough pages pass the threshold to be promoted, even if the
promotion threshold is accurate. To resolve this issue, we adjust the ratelimit
control algorithm.
Firstly, the number of pages that pass the ratelimit in a statistic period is
still kept less than the ratelimit/s * period. Secondly, the number of pages that
pass the ratelimit in one second can be a little more, say, "N * ratelimit/s" at
most. To identify the best "N", we have done some experiments and found that
"5" works well and seems reasonable.
In a test with the pmbench memory accessing benchmark on a 2-socket server
machine with Optane DCPMM, the patch improves the pmbench score by 10.5%, with
41.9% more pages promoted. But the pages promoted are still below the rate limit.
Signed-off-by: "Huang, Ying" <ying.huang(a)intel.com>
Signed-off-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
---
include/linux/mmzone.h | 2 ++
kernel/sched/fair.c | 13 ++++++++++---
mm/vmstat.c | 1 +
3 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a2b6636..c588570 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -211,6 +211,7 @@ enum node_stat_item {
PGPROMOTE_CANDIDATE, /* candidate pages to promote */
PROMOTE_THRESHOLD,
PGPROMOTE_FILE,
+ PGPROMOTE_TRY, /* pages to try to migrate via NUMA balancing */
#endif
NR_VM_NODE_STAT_ITEMS
};
@@ -765,6 +766,7 @@ struct hugepage_reclaim {
unsigned long numa_nr_candidate;
unsigned long numa_threshold_ts;
unsigned long numa_threshold_nr_candidate;
+ unsigned long numa_threshold_try;
unsigned long numa_threshold;
#endif
/* Fields commonly accessed by the page reclaim scanner */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 74dec2e..62c9953 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2894,8 +2894,8 @@ static int numa_hint_fault_latency(struct page *page)
static bool numa_migration_check_rate_limit(struct pglist_data *pgdat,
unsigned long rate_limit, int nr)
{
- unsigned long nr_candidate;
- unsigned long now = jiffies, last_ts;
+ unsigned long nr_candidate, try;
+ unsigned long now = jiffies, last_ts, dms;
mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
nr_candidate = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
@@ -2903,8 +2903,13 @@ static bool numa_migration_check_rate_limit(struct pglist_data
*pgdat,
if (now > last_ts + HZ &&
cmpxchg(&pgdat->numa_ts, last_ts, now) == last_ts)
pgdat->numa_nr_candidate = nr_candidate;
- if (nr_candidate - pgdat->numa_nr_candidate > rate_limit)
+ if (nr_candidate - pgdat->numa_nr_candidate > 5 * rate_limit)
return false;
+ try = node_page_state(pgdat, PGPROMOTE_TRY);
+ dms = jiffies_to_msecs(now - pgdat->numa_threshold_ts);
+ if (try - pgdat->numa_threshold_try > rate_limit * dms / 1000)
+ return false;
+ mod_node_page_state(pgdat, PGPROMOTE_TRY, nr);
return true;
}
@@ -2960,6 +2965,8 @@ static void numa_migration_adjust_threshold(struct pglist_data
*pgdat,
th = min(th, ref_th * 2);
}
pgdat->numa_threshold = th;
+ pgdat->numa_threshold_try =
+ node_page_state(pgdat, PGPROMOTE_TRY);
spin_unlock(&pgdat->numa_lock);
trace_autonuma_threshold(pgdat->node_id, diff_cand, th);
mod_node_page_state(pgdat, PROMOTE_THRESHOLD, th - oth);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5651e09..e1618da 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1191,6 +1191,7 @@ int fragmentation_index(struct zone *zone, unsigned int order)
"pgpromote_candidate",
"promote_threshold",
"pgpromote_file",
+ "pgpromote_try",
#endif
/* enum writeback_stat_item counters */
--
1.8.3.1