From: Feng Tang <feng.tang(a)intel.com>
ANBZ: #80
cherry-picked from
https://git.kernel.org/pub/scm/linux/kernel/git/vishal/tiering.git/commit/?…
Originally, NUMA balancing could only optimize page placement between the
fast and slow memory nodes for anonymous pages and mapped file cache
pages. The same optimization is needed for unmapped file cache pages
too, so this patch implements it.
Because unmapped file cache pages are accessed via syscalls, we can
hook those accesses to record the access time in the struct page and
calculate the latency between two consecutive accesses.
Based on the page access latency measurement, we can select the hot
pages in the slow memory and promote them just like the mapped pages.
In fact, most of the code is shared between mapped and unmapped page
promotion.
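As a rough sketch of the idea (file_page_is_hot() and its threshold_ms
parameter are invented for illustration only; xchg_page_access_time(),
PAGE_ACCESS_TIME_MASK and jiffies_to_msecs() are the existing primitives
this patch actually uses):

        /* Sketch only: decide whether an unmapped file cache page looks hot. */
        static bool file_page_is_hot(struct page *page, unsigned int threshold_ms)
        {
                unsigned int now, last, latency;

                now = jiffies_to_msecs(jiffies);
                /* Store the new access time, get the previous one back. */
                last = xchg_page_access_time(page, now);

                /* First access: no latency can be computed yet. */
                if (last == PAGE_ACCESS_TIME_MASK)
                        return false;

                latency = (now - last) & PAGE_ACCESS_TIME_MASK;
                /* Two accesses close together mean the page is hot. */
                return latency < threshold_ms;
        }
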
The patch improves the read/write bandwidth of an fio benchmark with an
80:20 read/write ratio and a normal random distribution by 107% on a
2-socket Intel server with Optane DC Persistent Memory Model.
Signed-off-by: Feng Tang <feng.tang(a)intel.com>
Co-developed-by: "Huang, Ying" <ying.huang(a)intel.com>
Signed-off-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
---
Note: The file cache page read path in the Anolis branch is very different
from mainline; it does not support batch reads, so we promote pages
one by one in generic_file_buffered_read().
---
include/linux/migrate.h | 10 +++++++++-
kernel/sched/fair.c | 4 ++++
mm/filemap.c | 25 ++++++++++++++++++++++++-
mm/migrate.c | 27 +++++++++++++++++++++++++++
4 files changed, 64 insertions(+), 2 deletions(-)
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 26e6a11..c9bf8b2 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -29,6 +29,9 @@ enum migrate_reason {
MR_TYPES
};
+/* promote_file_page() flags */
+#define PFP_LOCKED 0x1
+
/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */
extern char *migrate_reason_names[MR_TYPES];
@@ -82,7 +85,7 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page,
struct buffer_head *head, enum migrate_mode mode,
int extra_count);
-
+extern bool promote_file_page(struct page *page, int flags);
extern bool numa_demotion_enabled;
#else
@@ -110,6 +113,11 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
return -ENOSYS;
}
+static inline bool promote_file_page(struct page *page, int flags)
+{
+ return false;
+}
+
#define numa_demotion_enabled false
#endif /* CONFIG_MIGRATION */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f938351..a16a1a9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2884,6 +2884,10 @@ static int numa_hint_fault_latency(struct page *page)
time = jiffies_to_msecs(jiffies);
last_time = xchg_page_access_time(page, time);
+ /* First access: no previous timestamp, report the maximum latency */
+ if (last_time == PAGE_ACCESS_TIME_MASK)
+ return PAGE_ACCESS_TIME_MASK;
+
return (time - last_time) & PAGE_ACCESS_TIME_MASK;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index f00cc7f..ad863b5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -40,6 +40,7 @@
#include <linux/psi.h>
#include <linux/pfn_t.h>
+#include <linux/migrate.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -2140,6 +2141,7 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
unsigned long offset; /* offset into pagecache page */
unsigned int prev_offset;
int error = 0;
+ bool promotion_tried = false;
if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
return 0;
@@ -2227,8 +2229,21 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
goto page_not_up_to_date_locked;
unlock_page(page);
}
+
page_ok:
/*
+ * We may lose the reference to the page in
+ * promote_file_page(), e.g. when the page is migrated;
+ * if so, we need to find the page again. The same
+ * applies to the file write path.
+ */
+ if (!promotion_tried) {
+ promotion_tried = true;
+ if (promote_file_page(page, 0))
+ goto find_page;
+ }
+
+ /*
* i_size must be checked after we know the page is Uptodate.
*
* Checking i_size after the check allows us to calculate
@@ -3238,15 +3253,23 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
{
struct page *page;
int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
+ bool promotion_tried = false;
if (flags & AOP_FLAG_NOFS)
fgp_flags |= FGP_NOFS;
+find_page:
page = pagecache_get_page(mapping, index, fgp_flags,
mapping_gfp_mask(mapping));
- if (page)
+ if (page) {
wait_for_stable_page(page);
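+ /*
+ * The page is locked here (FGP_LOCK). promote_file_page() unlocks
+ * it and consumes our reference if it migrates the page, so redo
+ * the lookup in that case.
+ */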
+ if (!promotion_tried && promote_file_page(page, PFP_LOCKED)) {
+ promotion_tried = true;
+ goto find_page;
+ }
+ }
+
return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
diff --git a/mm/migrate.c b/mm/migrate.c
index 15838b3..c7632c9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1179,6 +1179,33 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
#define ICE_noinline
#endif
+/*
+ * Return true if the caller has lost the reference to the page,
+ * otherwise return false.
+ */
+bool promote_file_page(struct page *page, int flags)
+{
+ int nid = page_to_nid(page);
+
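+ /*
+ * Only pages in slow (non-toptier) memory are candidates, and only
+ * when the memory tiering mode of NUMA balancing is enabled.
+ */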
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) ||
+ node_is_toptier(nid))
+ return false;
+
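+ /* Let the NUMA memory policy decide whether the page is misplaced */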
+ nid = mpol_misplaced(page, NULL, 0);
+ if (nid == NUMA_NO_NODE)
+ return false;
+
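+ /* Only LRU pages can be isolated and migrated */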
+ if (!PageLRU(page))
+ return false;
+
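+ /* The caller holds the page lock; drop it before migration */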
+ if (flags & PFP_LOCKED)
+ unlock_page(page);
+
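+ /* Consumes our page reference whether or not the migration succeeds */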
+ migrate_misplaced_page(page, NULL, nid);
+
+ return true;
+}
+
/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
--
1.8.3.1