From: Huang Ying <ying.huang@intel.com>
ANBZ: #80
cherry-picked from
https://git.kernel.org/pub/scm/linux/kernel/git/vishal/tiering.git/commit/?…
The write performance of PMEM is much worse than its read
performance. So even if a write-mostly page is colder than a
read-mostly page, it is usually better to put the write-mostly page
in DRAM and the read-mostly page in PMEM.

To give write-mostly pages more opportunity to be promoted to DRAM,
this patch doubles the hot threshold for write hint page faults, so
that such pages are easier to promote.
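
As an illustration only, here is a minimal sketch of the decision that
the kernel/sched/fair.c hunk below implements (the helper name
hot_enough_to_promote() is hypothetical; the per-node threshold
bookkeeping and numa_hint_fault_latency() are elided):

	/*
	 * Sketch: a page is considered hot enough to promote when its
	 * hint fault latency is within the node's threshold; a TNF_WRITE
	 * fault sees a doubled threshold, so write-mostly pages clear
	 * the bar more easily.
	 */
	static bool hot_enough_to_promote(unsigned int latency,
					  unsigned int threshold,
					  bool write_fault)
	{
		if (write_fault)
			threshold *= 2;
		return latency <= threshold;
	}
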
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
include/linux/mempolicy.h | 4 ++--
include/linux/migrate.h | 1 +
include/linux/sched/numa_balancing.h | 5 +++--
kernel/sched/fair.c | 4 +++-
mm/filemap.c | 3 ++-
mm/huge_memory.c | 2 +-
mm/memory.c | 5 ++++-
mm/mempolicy.c | 6 ++++--
mm/migrate.c | 3 ++-
9 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 9f08c25..5453883 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -202,7 +202,7 @@ static inline bool vma_migratable(struct vm_area_struct *vma)
return true;
}
-extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long, int);
extern void mpol_put_task_policy(struct task_struct *);
#else
@@ -300,7 +300,7 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
#endif
static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address, int flags)
{
return -1; /* no node preference */
}
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index c9bf8b2..8c94650 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -31,6 +31,7 @@ enum migrate_reason {
/* promote_file_page() flags */
#define PFP_LOCKED 0x1
+#define PFP_WRITE 0x2
/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */
extern char *migrate_reason_names[MR_TYPES];
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762..f5608ea 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -14,6 +14,7 @@
#define TNF_SHARED 0x04
#define TNF_FAULT_LOCAL 0x08
#define TNF_MIGRATE_FAIL 0x10
+#define TNF_WRITE 0x40
#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int last_node, int node, int pages, int flags);
@@ -21,7 +22,7 @@
extern void set_numabalancing_state(bool enabled);
extern void task_numa_free(struct task_struct *p, bool final);
extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
- int src_nid, int dst_cpu);
+ int src_nid, int dst_cpu, int flags);
#else
static inline void task_numa_fault(int last_node, int node, int pages,
int flags)
@@ -38,7 +39,7 @@ static inline void task_numa_free(struct task_struct *p, bool final)
{
}
static inline bool should_numa_migrate_memory(struct task_struct *p,
- struct page *page, int src_nid, int dst_cpu)
+ struct page *page, int src_nid, int dst_cpu, int flags)
{
return true;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 62c9953..0184145 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2974,7 +2974,7 @@ static void numa_migration_adjust_threshold(struct pglist_data *pgdat,
}
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
- int src_nid, int dst_cpu)
+ int src_nid, int dst_cpu, int flags)
{
struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
@@ -3002,6 +3002,8 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
numa_migration_adjust_threshold(pgdat, rate_limit, def_th);
th = pgdat->numa_threshold ? : def_th;
+ if (flags & TNF_WRITE)
+ th *= 2;
latency = numa_hint_fault_latency(page);
if (latency > th)
return false;
diff --git a/mm/filemap.c b/mm/filemap.c
index ad863b5..ebd32fd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3264,7 +3264,8 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
if (page) {
wait_for_stable_page(page);
- if (!promotion_tried && promote_file_page(page, PFP_LOCKED)) {
+ if (!promotion_tried &&
+ promote_file_page(page, PFP_LOCKED | PFP_WRITE)) {
promotion_tried = true;
goto find_page;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0a9126c..98ab428 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1853,7 +1853,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
* page_table_lock if at all possible
*/
page_locked = trylock_page(page);
- target_nid = mpol_misplaced(page, vma, haddr);
+ target_nid = mpol_misplaced(page, vma, haddr, flags);
if (target_nid == -1) {
/* If the page was locked, there are no parallel migrations */
if (page_locked)
diff --git a/mm/memory.c b/mm/memory.c
index 2100f65..c0ece4f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3860,7 +3860,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
*flags |= TNF_FAULT_LOCAL;
}
- return mpol_misplaced(page, vma, addr);
+ return mpol_misplaced(page, vma, addr, *flags);
}
static vm_fault_t do_numa_page(struct vm_fault *vmf)
@@ -3899,6 +3899,9 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ flags |= TNF_WRITE;
+
page = vm_normal_page(vma, vmf->address, pte);
if (!page) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1a1593c..077a0eb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2388,7 +2388,8 @@ static void sp_free(struct sp_node *n)
* Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*/
-int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr,
+ int flags)
{
struct mempolicy *pol;
struct zoneref *z;
@@ -2442,7 +2443,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
if (pol->flags & MPOL_F_MORON) {
polnid = thisnid;
- if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
+ if (!should_numa_migrate_memory(current, page, curnid, thiscpu,
+ flags))
goto out;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 7e6dcef..6d25ea0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -50,6 +50,7 @@
#include <linux/memory.h>
#include <linux/random.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/numa_balancing.h>
#include <asm/tlbflush.h>
@@ -1194,7 +1195,7 @@ bool promote_file_page(struct page *page, int flags)
node_is_toptier(nid))
return false;
- nid = mpol_misplaced(page, NULL, 0);
+ nid = mpol_misplaced(page, NULL, 0, (flags & PFP_WRITE) ? TNF_WRITE : 0);
if (nid == NUMA_NO_NODE)
return false;
--
1.8.3.1