From: Huang Ying <ying.huang(a)intel.com>
In NUMA balancing memory tiering mode, hot pages in the slow memory
nodes can be promoted to the fast memory node via NUMA balancing. But
the promotion incurs some overhead too, so the workload performance
may sometimes be hurt. To avoid disturbing the workload too much in
these situations, we should make it possible to rate-limit the
promotion throughput.
So, in this patch, we implement a simple rate-limit algorithm as
follows. The number of candidate pages to be promoted to the fast
memory node via NUMA balancing is counted; if the count exceeds the
limit specified by the user, NUMA balancing promotion is stopped
until the next second.
A new sysctl knob, kernel.numa_balancing_rate_limit_mbps, is added
for users to specify the limit.
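For example, to limit promotion throughput to roughly 1GB/s per
target node (the value here is purely illustrative):
	# sysctl -w kernel.numa_balancing_rate_limit_mbps=1024
The default of 65536 MB/s is high enough to be effectively unlimited
on most systems.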
TODO: Add ABI document for new sysctl knob.
Signed-off-by: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: Wei Xu <weixugc(a)google.com>
Cc: osalvador <osalvador(a)suse.de>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Hasan Al Maruf <hasanalmaruf(a)fb.com>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Signed-off-by: zhongjiang-ali <zhongjiang-ali(a)linux.alibaba.com>
---
include/linux/mmzone.h | 5 +++++
include/linux/sched/sysctl.h | 1 +
kernel/sched/fair.c | 29 +++++++++++++++++++++++++++--
kernel/sysctl.c | 8 ++++++++
mm/vmstat.c | 1 +
5 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 48982e3..4463451 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -208,6 +208,7 @@ enum node_stat_item {
NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */
#ifdef CONFIG_NUMA_BALANCING
PGPROMOTE_SUCCESS, /* promote successfully */
+ PGPROMOTE_CANDIDATE, /* candidate pages to promote */
#endif
NR_VM_NODE_STAT_ITEMS
};
@@ -756,6 +757,10 @@ struct hugepage_reclaim {
struct deferred_split deferred_split_queue;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned long numa_ts;
+ unsigned long numa_nr_candidate;
+#endif
/* Fields commonly accessed by the page reclaim scanner */
/*
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 6be27b6..bda72b8 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,6 +39,7 @@ enum sched_tunable_scaling {
#ifdef CONFIG_NUMA_BALANCING
extern int sysctl_numa_balancing_mode;
extern unsigned int sysctl_numa_balancing_hot_threshold;
+extern unsigned int sysctl_numa_balancing_rate_limit;
#else
#define sysctl_numa_balancing_mode 0
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 41d6863..e102fab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2459,6 +2459,11 @@ static void update_curr_fair(struct rq *rq)
/* The page with hint page fault latency < threshold in ms is considered hot */
unsigned int sysctl_numa_balancing_hot_threshold = 1000;
+/*
+ * Restrict the NUMA migration throughput (MB/s) for each target node
+ * if there is not enough free space in the target node
+ */
+unsigned int sysctl_numa_balancing_rate_limit = 65536;
struct numa_group {
atomic_t refcount;
@@ -2882,6 +2887,23 @@ static int numa_hint_fault_latency(struct page *page)
return (time - last_time) & PAGE_ACCESS_TIME_MASK;
}
+static bool numa_migration_check_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long nr_candidate;
+ unsigned long now = jiffies, last_ts;
+
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_candidate = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ last_ts = pgdat->numa_ts;
+ if (now > last_ts + HZ &&
+ cmpxchg(&pgdat->numa_ts, last_ts, now) == last_ts)
+ pgdat->numa_nr_candidate = nr_candidate;
+ if (nr_candidate - pgdat->numa_nr_candidate > rate_limit)
+ return false;
+ return true;
+}
+
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
@@ -2896,7 +2918,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
!node_is_toptier(src_nid)) {
struct pglist_data *pgdat;
- unsigned long latency, th;
+ unsigned long rate_limit, latency, th;
pgdat = NODE_DATA(dst_nid);
if (pgdat_free_space_enough(pgdat))
@@ -2907,7 +2929,10 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
if (latency > th)
return false;
- return true;
+ rate_limit =
+ sysctl_numa_balancing_rate_limit << (20 - PAGE_SHIFT);
+ return numa_migration_check_rate_limit(pgdat, rate_limit,
+ 1UL << compound_order(page));
}
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ab063be..32e8218 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -463,6 +463,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "numa_balancing_rate_limit_mbps",
+ .data = &sysctl_numa_balancing_rate_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b87b8b2..d921c42d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1188,6 +1188,7 @@ int fragmentation_index(struct zone *zone, unsigned int order)
"", /* nr_indirectly_reclaimable */
#ifdef CONFIG_NUMA_BALANCING
"pgpromote_success",
+ "pgpromote_candidate",
#endif
/* enum writeback_stat_item counters */
--
1.8.3.1