From: Dave Hansen <dave.hansen(a)linux.intel.com>
commit 26aa2d199d6f2cfa6f2ef2a5dfe891f2250e71a0 upstream
This is mostly derived from a patch from Yang Shi:
https://lore.kernel.org/linux-mm/1560468577-101178-10-git-send-email-yang.s…
Add code to the reclaim path (shrink_page_list()) to "demote" data to
another NUMA node instead of discarding the data. This always avoids the
cost of I/O needed to read the page back in and sometimes avoids the
writeout cost when the page is dirty.
A second pass through shrink_page_list() will be made if any demotions
fail. This essentially falls back to normal reclaim behavior in the case
that demotions fail. Previous versions of this patch may have simply
failed to reclaim pages which were eligible for demotion but were unable
to be demoted in practice.
For some cases, for example, MADV_PAGEOUT, the pages are always discarded
instead of demoted to follow the kernel API definition. Because
MADV_PAGEOUT is defined as freeing specified pages regardless in which
tier they are.
Note: This just adds the start of infrastructure for migration. It is
actually disabled next to the FIXME in migrate_demote_page_ok().
[dave.hansen(a)linux.intel.com: v11]
Link:
https://lkml.kernel.org/r/20210715055145.195411-5-ying.huang@intel.com
Link:
https://lkml.kernel.org/r/20210721063926.3024591-4-ying.huang@intel.com
Link:
https://lkml.kernel.org/r/20210715055145.195411-5-ying.huang@intel.com
Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Signed-off-by: "Huang, Ying" <ying.huang(a)intel.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Reviewed-by: Wei Xu <weixugc(a)google.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Reviewed-by: Zi Yan <ziy(a)nvidia.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Greg Thelen <gthelen(a)google.com>
Cc: Keith Busch <kbusch(a)kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
---
Note: Just call alloc_migration_target() in alloc_demote_page()
---
include/linux/migrate.h | 10 ++++++
include/trace/events/migrate.h | 3 +-
mm/vmscan.c | 73 ++++++++++++++++++++++++++++++++++++++++++
3 files changed, 85 insertions(+), 1 deletion(-)
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 624e3a4..d1fad66 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -25,6 +25,7 @@ enum migrate_reason {
MR_MEMPOLICY_MBIND,
MR_NUMA_MISPLACED,
MR_CONTIG_RANGE,
+ MR_DEMOTION,
MR_TYPES
};
@@ -287,6 +288,15 @@ static inline int migrate_vma(const struct migrate_vma_ops *ops,
}
#endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
+int next_demotion_node(int node);
+
+#else /* CONFIG_MIGRATION disabled: */
+
+static inline int next_demotion_node(int node)
+{
+ return NUMA_NO_NODE;
+}
+
#endif /* CONFIG_MIGRATION */
#endif /* _LINUX_MIGRATE_H */
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 705b33d..861ef7d 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -20,7 +20,8 @@
EM( MR_SYSCALL, "syscall_or_cpuset") \
EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \
EM( MR_NUMA_MISPLACED, "numa_misplaced") \
- EMe(MR_CONTIG_RANGE, "contig_range")
+ EM(MR_CONTIG_RANGE, "contig_range") \
+ EMe(MR_DEMOTION, "demotion")
/*
* First define the enums in the above macros to be exported to userspace
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3dd3a83..d63504e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -43,6 +43,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
+#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
@@ -119,6 +120,9 @@ struct scan_control {
/* The file pages on the current node are dangerously low */
unsigned int file_is_tiny:1;
+ /* Always discard instead of demoting to lower tier memory */
+ unsigned int no_demotion:1;
+
/* Allocation order */
s8 order;
@@ -293,6 +297,17 @@ static bool writeback_throttling_sane(struct scan_control *sc)
}
#endif
+static bool can_demote(int nid, struct scan_control *sc)
+{
+ if (sc->no_demotion)
+ return false;
+ if (next_demotion_node(nid) == NUMA_NO_NODE)
+ return false;
+
+ // FIXME: actually enable this later in the series
+ return false;
+}
+
/*
* This misses isolated pages which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
@@ -1072,6 +1087,37 @@ static void page_check_dirty_writeback(struct page *page,
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}
+static struct page *alloc_demote_page(struct page *page, unsigned long node)
+{
+ return alloc_migration_target(page, node);
+}
+
+/*
+ * Take pages on @demote_list and attempt to demote them to
+ * another node. Pages which are not demoted are left on
+ * @demote_pages.
+ */
+static unsigned int demote_page_list(struct list_head *demote_pages,
+ struct pglist_data *pgdat)
+{
+ int target_nid = next_demotion_node(pgdat->node_id);
+ unsigned int nr_succeeded;
+ int err;
+
+ if (list_empty(demote_pages))
+ return 0;
+
+ if (target_nid == NUMA_NO_NODE)
+ return 0;
+
+ /* Demotion ignores all cpuset and mempolicy settings */
+ err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+ target_nid, MIGRATE_ASYNC, MR_DEMOTION,
+ &nr_succeeded);
+
+ return nr_succeeded;
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -1084,6 +1130,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
+ LIST_HEAD(demote_pages);
int pgactivate = 0;
unsigned nr_unqueued_dirty = 0;
unsigned nr_dirty = 0;
@@ -1094,11 +1141,14 @@ static unsigned long shrink_page_list(struct list_head
*page_list,
unsigned nr_ref_keep = 0;
unsigned nr_unmap_fail = 0;
struct lruvec *target_lruvec;
+ bool do_demote_pass;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
cond_resched();
+ do_demote_pass = can_demote(pgdat->node_id, sc);
+retry:
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
@@ -1250,6 +1300,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
/*
+ * Before reclaiming the page, try to relocate
+ * its contents to another node.
+ */
+ if (do_demote_pass &&
+ (thp_migration_supported() || !PageTransHuge(page))) {
+ list_add(&page->lru, &demote_pages);
+ unlock_page(page);
+ continue;
+ }
+
+ /*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
* Lazyfree page could be freed directly
@@ -1472,6 +1533,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
list_add(&page->lru, &ret_pages);
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
+ /* 'page_list' is always empty here */
+
+ /* Migrate pages selected for demotion */
+ nr_reclaimed += demote_page_list(&demote_pages, pgdat);
+ /* Pages that could not be demoted are still in @demote_pages */
+ if (!list_empty(&demote_pages)) {
+ /* Pages which failed to demoted go back on @page_list for retry: */
+ list_splice_init(&demote_pages, page_list);
+ do_demote_pass = false;
+ goto retry;
+ }
mem_cgroup_uncharge_list(&free_pages);
try_to_unmap_flush();
@@ -2158,6 +2230,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
.may_writepage = 1,
.may_unmap = 1,
.may_swap = 1,
+ .no_demotion = 1,
};
while (!list_empty(page_list)) {
--
1.8.3.1