From: Huang Ying <ying.huang@intel.com>
ANBZ: #80

Page demotion is implemented with the general page migration
mechanism.  The process is as follows:
1. unmap the old page
2. allocate the new page on the target node
3. remap the new page
4. free the old page

In step 3, instead of restoring the original page table mapping, we
can map the new page as PROT_NONE, that is, as if the page table
entry had just been scanned by the NUMA balancing scanning mechanism.
In this way, some page table entries get scanned almost without
overhead, which helps identify pages that were demoted wrongly.
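
The idea can be illustrated with a userspace analogue using
mprotect(): drop all access rights on a page, catch the fault on the
first touch, record the access time, and restore the mapping.  This
is only a demo of the hinting-fault technique, not the kernel code in
this patch; the handler and variable names are demo choices.

  /* Userspace analogue of the PROT_NONE hinting fault (demo only). */
  #include <signal.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <time.h>
  #include <unistd.h>

  static char *page;
  static long page_size;
  static struct timespec first_access;

  static void fault_handler(int sig, siginfo_t *si, void *ctx)
  {
  	/* First touch after "demotion": record time, restore access. */
  	clock_gettime(CLOCK_MONOTONIC, &first_access);
  	mprotect(page, page_size, PROT_READ | PROT_WRITE);
  }

  int main(void)
  {
  	struct sigaction sa = {
  		.sa_sigaction	= fault_handler,
  		.sa_flags	= SA_SIGINFO,
  	};

  	sigaction(SIGSEGV, &sa, NULL);
  	page_size = sysconf(_SC_PAGESIZE);
  	page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
  		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

  	mprotect(page, page_size, PROT_NONE);	/* "demote" the page */
  	page[0] = 1;		/* faults once; handler runs */
  	printf("first access at %ld.%09lds\n",
  	       (long)first_access.tv_sec, first_access.tv_nsec);
  	return 0;
  }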

More importantly, with reclaim-based page demotion, we lose the
refault information that could be used to guide the page reclaiming
algorithm.  With the mechanism in this patch, we can restore some of
that refault information.

When a demoted page is accessed for the first time, its refault
latency (access time - demotion time) is calculated.  If the refault
latency is less than the page promotion threshold, the page is
promoted.  Based on the number of demoted pages and the number of
demoted pages that get promoted, a rate limit control algorithm can
be built in kernel space or user space.
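
A minimal sketch of that decision in plain C (hypothetical helper
names, not the kernel implementation): compute the refault latency
from the recorded demotion time, promote when it is below the
threshold, and count both demoted and demoted-but-hot pages so a
rate-limit controller can use their ratio.

  #include <stdbool.h>
  #include <stdio.h>

  struct demo_stats {
  	unsigned long demoted;		/* demoted pages seen */
  	unsigned long demoted_hot;	/* demoted pages found hot again */
  };

  /* All times are in milliseconds. */
  static bool should_promote(unsigned int access_time,
  			   unsigned int demote_time,
  			   unsigned int threshold,
  			   struct demo_stats *stats)
  {
  	unsigned int refault_latency = access_time - demote_time;

  	stats->demoted++;
  	if (refault_latency >= threshold)
  		return false;		/* cold enough, leave it demoted */

  	stats->demoted_hot++;		/* feeds the rate-limit algorithm */
  	return true;
  }

  int main(void)
  {
  	struct demo_stats stats = { 0 };

  	/* Demoted at t=1000ms, touched at t=1200ms, threshold 1s. */
  	printf("promote: %d\n", should_promote(1200, 1000, 1000, &stats));
  	printf("demoted=%lu hot=%lu\n", stats.demoted, stats.demoted_hot);
  	return 0;
  }
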
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: zhong jiang <zhongjiang-ali@linux.alibaba.com>
Signed-off-by: zhongjiang-ali <zhongjiang-ali@linux.alibaba.com>
---
include/linux/mmzone.h | 1 +
include/linux/page-flags.h | 9 ++++++
include/linux/page_ext.h | 3 ++
include/linux/rmap.h | 1 +
include/linux/sched/numa_balancing.h | 62 ++++++++++++++++++++++++++++++++++++
include/linux/sched/sysctl.h | 2 ++
include/trace/events/mmflags.h | 8 ++++-
kernel/sched/fair.c | 8 +++++
kernel/sysctl.c | 8 +++++
mm/mempolicy.c | 2 ++
mm/migrate.c | 47 ++++++++++++++++++++++++---
mm/vmstat.c | 1 +
12 files changed, 147 insertions(+), 5 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c588570..e0bbfab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -212,6 +212,7 @@ enum node_stat_item {
PROMOTE_THRESHOLD,
PGPROMOTE_FILE,
PGPROMOTE_TRY, /* pages to try to migrate via NUMA balancing */
+ PGDEMOTED_HOT,
#endif
NR_VM_NODE_STAT_ITEMS
};
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d00efae..fc9f793 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -102,6 +102,9 @@ enum pageflags {
PG_young,
PG_idle,
#endif
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_64BIT)
+ PG_demoted,
+#endif
__NR_PAGEFLAGS,
/* Filesystems */
@@ -392,6 +395,12 @@ static inline bool set_hwpoison_free_buddy_page(struct page *page)
PAGEFLAG(Idle, idle, PF_ANY)
#endif
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_64BIT)
+TESTPAGEFLAG(Demoted, demoted, PF_NO_TAIL)
+SETPAGEFLAG(Demoted, demoted, PF_NO_TAIL)
+TESTCLEARFLAG(Demoted, demoted, PF_NO_TAIL)
+#endif
+
/*
* PageReported() is used to track reported free pages within the Buddy
* allocator. We can use the non-atomic version of the test and set
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index f84f167..93a45bd 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -23,6 +23,9 @@ enum page_ext_flags {
PAGE_EXT_YOUNG,
PAGE_EXT_IDLE,
#endif
+#if defined(CONFIG_NUMA_BALANCING) && !defined(CONFIG_64BIT)
+ PAGE_EXT_DEMOTED,
+#endif
};
/*
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 2c455f9..8222a3b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -243,6 +243,7 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
enum rmpte_flags {
RMPTE_LOCKED = 0x1,
+ RMPTE_PROT_NUMA = 0x2,
};
void remove_migration_ptes(struct page *old, struct page *new,
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index f5608ea..492a410 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -8,6 +8,8 @@
*/
#include <linux/sched.h>
+#include <linux/page-flags.h>
+#include <linux/page_ext.h>
#define TNF_MIGRATED 0x01
#define TNF_NO_GROUP 0x02
@@ -15,6 +17,7 @@
#define TNF_FAULT_LOCAL 0x08
#define TNF_MIGRATE_FAIL 0x10
#define TNF_WRITE 0x40
+#define TNF_DEMOTED 0x80
#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int last_node, int node, int pages, int flags);
@@ -23,6 +26,54 @@
extern void task_numa_free(struct task_struct *p, bool final);
extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
int src_nid, int dst_cpu, int flags);
+
+#ifdef CONFIG_64BIT
+static inline bool page_is_demoted(struct page *page)
+{
+ return PageDemoted(page);
+}
+
+static inline void set_page_demoted(struct page *page)
+{
+ SetPageDemoted(page);
+}
+
+static inline bool test_and_clear_page_demoted(struct page *page)
+{
+ return TestClearPageDemoted(page);
+}
+#else /* !CONFIG_64BIT */
+static inline bool page_is_demoted(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ if (unlikely(!page_ext))
+ return false;
+
+ return test_bit(PAGE_EXT_DEMOTED, &page_ext->flags);
+}
+
+static inline void set_page_demoted(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ if (unlikely(!page_ext))
+ return;
+
+ set_bit(PAGE_EXT_DEMOTED, &page_ext->flags);
+}
+
+static inline bool test_and_clear_page_demoted(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ if (unlikely(!page_ext))
+ return false;
+
+ return test_and_clear_bit(PAGE_EXT_DEMOTED, &page_ext->flags);
+}
+#endif /* !CONFIG_64BIT */
+
#else
static inline void task_numa_fault(int last_node, int node, int pages,
int flags)
@@ -43,6 +94,17 @@ static inline bool should_numa_migrate_memory(struct task_struct *p,
{
return true;
}
+static inline bool page_is_demoted(struct page *page)
+{
+ return false;
+}
+static inline void set_page_demoted(struct page *page)
+{
+}
+static inline bool test_and_clear_page_demoted(struct page *page)
+{
+ return false;
+}
#endif
#endif /* _LINUX_SCHED_NUMA_BALANCING_H */
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index ac11582..2ae6533 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -40,8 +40,10 @@ enum sched_tunable_scaling {
extern int sysctl_numa_balancing_mode;
extern unsigned int sysctl_numa_balancing_hot_threshold;
extern unsigned int sysctl_numa_balancing_rate_limit;
+extern unsigned int sysctl_numa_balancing_scan_demoted;
#else
#define sysctl_numa_balancing_mode 0
+#define sysctl_numa_balancing_scan_demoted 0
#endif
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index a1675d4..f058d6c 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -78,6 +78,11 @@
#else
#define IF_HAVE_PG_IDLE(flag,string)
#endif
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_64BIT)
+#define IF_HAVE_PG_DEMOTED(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_DEMOTED(flag,string)
+#endif
#define __def_pageflag_names \
{1UL << PG_locked, "locked" }, \
@@ -105,7 +110,8 @@
IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
IF_HAVE_PG_IDLE(PG_young, "young" ) \
-IF_HAVE_PG_IDLE(PG_idle, "idle" )
+IF_HAVE_PG_IDLE(PG_idle, "idle" ) \
+IF_HAVE_PG_DEMOTED(PG_demoted, "demoted" )
#define show_page_flags(flags) \
(flags) ? __print_flags(flags, "|", \
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0184145..eb78398 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2464,6 +2464,11 @@ static void update_curr_fair(struct rq *rq)
* if no enough free space in target node
*/
unsigned int sysctl_numa_balancing_rate_limit = 65536;
+/*
+ * Make just-demoted pages behave as if they had just been scanned
+ * by the NUMA balancing page table scanner.
+ */
+unsigned int sysctl_numa_balancing_scan_demoted;
struct numa_group {
atomic_t refcount;
@@ -3008,6 +3013,9 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
if (latency > th)
return false;
+ if (flags & TNF_DEMOTED)
+ mod_node_page_state(pgdat, PGDEMOTED_HOT,
+ hpage_nr_pages(page));
return numa_migration_check_rate_limit(pgdat, rate_limit,
1UL << compound_order(page));
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0fe1ea1..b5d05d8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -472,6 +472,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
},
+ {
+ .procname = "numa_balancing_scan_demoted",
+ .data = &sysctl_numa_balancing_scan_demoted,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 077a0eb..21c7606 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2400,6 +2400,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
int polnid = -1;
int ret = -1;
+ if (test_and_clear_page_demoted(page))
+ flags |= TNF_DEMOTED;
pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
diff --git a/mm/migrate.c b/mm/migrate.c
index e549a27..1193821 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -201,14 +201,20 @@ void putback_movable_pages(struct list_head *l)
}
}
+struct rmpte_params {
+ struct page *oldpage;
+ enum rmpte_flags flags;
+};
+
/*
* Restore a potential migration pte to a working pte entry
*/
static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, void *old)
+ unsigned long addr, void *params)
{
+ struct rmpte_params *rparams = params;
struct page_vma_mapped_walk pvmw = {
- .page = old,
+ .page = rparams->oldpage,
.vma = vma,
.address = addr,
.flags = PVMW_SYNC | PVMW_MIGRATION,
@@ -267,6 +273,16 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
} else
#endif
{
+#ifdef CONFIG_NUMA_BALANCING
+ if ((rparams->flags & RMPTE_PROT_NUMA) &&
+ page_is_demoted(page) && vma_migratable(vma)) {
+ bool writable = pte_write(pte);
+
+ pte = pte_modify(pte, PAGE_NONE);
+ if (writable)
+ pte = pte_mk_savedwrite(pte);
+ }
+#endif
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
if (PageAnon(new))
@@ -294,9 +310,13 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
void remove_migration_ptes(struct page *old, struct page *new,
enum rmpte_flags flags)
{
+ struct rmpte_params rp = {
+ .oldpage = old,
+ .flags = flags,
+ };
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
- .arg = old,
+ .arg = &rp,
};
if (flags & RMPTE_LOCKED)
@@ -732,10 +752,19 @@ void migrate_page_states(struct page *newpage, struct page *page)
bool f_toptier = node_is_toptier(page_to_nid(page));
bool t_toptier = node_is_toptier(page_to_nid(newpage));
+ /*
+ * Treat demoted pages as just scanned, so they can be
+ * promoted as soon as possible if hot
+ */
+ if (page_is_demoted(newpage)) {
+ xchg_page_access_time(newpage, jiffies_to_msecs(jiffies));
+ goto cpupid_done;
+ }
if (f_toptier != t_toptier)
cpupid = -1;
}
page_cpupid_xchg_last(newpage, cpupid);
+cpupid_done:
ksm_migrate_page(newpage, page);
/*
@@ -1145,7 +1174,9 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (page_was_mapped)
remove_migration_ptes(page,
- rc == MIGRATEPAGE_SUCCESS ? newpage : page, 0);
+ rc == MIGRATEPAGE_SUCCESS ? newpage : page,
+ rc == MIGRATEPAGE_SUCCESS && page_is_demoted(newpage) ?
+ RMPTE_PROT_NUMA : 0);
out_unlock_both:
unlock_page(newpage);
@@ -1238,6 +1269,14 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
if (!newpage)
return -ENOMEM;
+ /* TODO: check whether KSM pages can be demoted */
+ if (reason == MR_DEMOTION &&
+ sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ sysctl_numa_balancing_scan_demoted &&
+ !PageKsm(page)) {
+ set_page_demoted(newpage);
+ }
+
if (page_count(page) == 1) {
/* page was freed from under us. So we are done. */
ClearPageActive(page);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8ce048b..7f2c498 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1192,6 +1192,7 @@ int fragmentation_index(struct zone *zone, unsigned int order)
"promote_threshold",
"pgpromote_file",
"pgpromote_try",
+ "pgpromote_demoted",
#endif
/* enum writeback_stat_item counters */
--
1.8.3.1