Last active
October 25, 2019 08:20
-
-
Save laoar/90c6acb388cf541f297a7aca2f52b117 to your computer and use it in GitHub Desktop.
mm, memcg: introduce multiple level memory low protection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From f16bb9724a4a9b802a981231c7021d2aea9f4dc2 Mon Sep 17 00:00:00 2001 | |
From: Yafang Shao <[email protected]> | |
Date: Tue, 22 Oct 2019 22:17:15 -0400 | |
Subject: [PATCH] mm, memcg: introduce multiple level memory low protection | |
This patch introduces a new memory controller file memory.low.level, | |
which is used to set multiple levels of memory.low protection. | |
The valid value of memory.low.level is [0..3], meaning we support four | |
levels of protection now. This new controller file takes effect only when | |
memory.low is set. | |
If both memory.low and memory.low.level are set on many memcgs, under | |
memory pressure the reclaimer will reclaim the unprotected memory first, | |
then reclaim the protected memory with lower memory.low.level, and | |
finally reclaim the protected memory with the highest memory.low.level. | |
under_memory_pressure | |
reclaim_unprotected_memory | |
if (meet_the_request) | |
exit | |
reclaim_protected_memory_with_lowest_memory.low.level | |
if (meet_the_request) | |
exit | |
reclaim_protected_memory_with_higher_memory.low.level | |
if (meet_the_request) | |
exit | |
reclaim_protected_memory_with_highest_memory.low.level | |
An example of how this multi-level controller works: | |
target | |
/ \ | |
B C | |
/\ | |
B1 B2 | |
B/memory.low.level=2 | |
B1/memory.low.level=3 | |
B2/memory.low.level=0 | |
C/memory.low.level=1 | |
Then the reclaimer will reclaim from these cgroups in this order: B2->C->B1/B | |
Signed-off-by: Yafang Shao <[email protected]> | |
--- | |
Documentation/admin-guide/cgroup-v2.rst | 11 ++++++++ | |
include/linux/page_counter.h | 3 +++ | |
mm/memcontrol.c | 47 +++++++++++++++++++++++++++++++++ | |
mm/vmscan.c | 27 +++++++++++++++---- | |
4 files changed, 83 insertions(+), 5 deletions(-) | |
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst | |
index 5361ebe..08675b6 100644 | |
--- a/Documentation/admin-guide/cgroup-v2.rst | |
+++ b/Documentation/admin-guide/cgroup-v2.rst | |
@@ -1136,6 +1136,17 @@ PAGE_SIZE multiple when read back. | |
Putting more memory than generally available under this | |
protection is discouraged. | |
+ memory.low.level | |
+ A read-write single value file which exists on non-root | |
+ cgroups. The default is "0". The valid value is [0..3]. | |
+ | |
+ The controller file takes effect only after memory.low is set. | |
+ If both memory.low and memory.low.level are set on many memcgs, | |
+ then under memory pressure the reclaimer will reclaim the | |
+ unprotected memory first, then reclaim the protected memory | |
+ with lower memory.low.level, and finally reclaim the protected | |
+ memory with the highest memory.low.level. | |
+ | |
memory.high | |
A read-write single value file which exists on non-root | |
cgroups. The default is "max". | |
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h | |
index bab7e57..19bc589 100644 | |
--- a/include/linux/page_counter.h | |
+++ b/include/linux/page_counter.h | |
@@ -6,6 +6,7 @@ | |
#include <linux/kernel.h> | |
#include <asm/page.h> | |
+#define MEMCG_LOW_LEVEL_MAX 4 | |
struct page_counter { | |
atomic_long_t usage; | |
unsigned long min; | |
@@ -22,6 +23,8 @@ struct page_counter { | |
unsigned long elow; | |
atomic_long_t low_usage; | |
atomic_long_t children_low_usage; | |
+ unsigned long low_level; | |
+ unsigned long elow_level; | |
/* legacy */ | |
unsigned long watermark; | |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c | |
index 1c4c08b..4cbf765 100644 | |
--- a/mm/memcontrol.c | |
+++ b/mm/memcontrol.c | |
@@ -6064,6 +6064,37 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, | |
return nbytes; | |
} | |
+static int memory_low_level_show(struct seq_file *m, void *v) | |
+{ | |
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m); | |
+ | |
+ seq_printf(m, "%lu\n", memcg->memory.low_level); | |
+ | |
+ return 0; | |
+} | |
+ | |
+static ssize_t memory_low_level_write(struct kernfs_open_file *of, | |
+ char *buf, size_t nbytes, loff_t off) | |
+{ | |
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | |
+ int ret, low_level; | |
+ | |
+ buf = strstrip(buf); | |
+ if (!buf) | |
+ return -EINVAL; | |
+ | |
+ ret = kstrtoint(buf, 0, &low_level); | |
+ if (ret) | |
+ return ret; | |
+ | |
+ if (low_level < 0 || low_level >= MEMCG_LOW_LEVEL_MAX) | |
+ return -EINVAL; | |
+ | |
+ memcg->memory.low_level = low_level; | |
+ | |
+ return nbytes; | |
+} | |
+ | |
static int memory_high_show(struct seq_file *m, void *v) | |
{ | |
return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); | |
@@ -6237,6 +6268,12 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, | |
.write = memory_low_write, | |
}, | |
{ | |
+ .name = "low.level", | |
+ .flags = CFTYPE_NOT_ON_ROOT, | |
+ .seq_show = memory_low_level_show, | |
+ .write = memory_low_level_write, | |
+ }, | |
+ { | |
.name = "high", | |
.flags = CFTYPE_NOT_ON_ROOT, | |
.seq_show = memory_high_show, | |
@@ -6366,6 +6403,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, | |
struct mem_cgroup *parent; | |
unsigned long emin, parent_emin; | |
unsigned long elow, parent_elow; | |
+ unsigned long elow_level; | |
+ unsigned long parent_elow_level; | |
unsigned long usage; | |
if (mem_cgroup_disabled()) | |
@@ -6382,6 +6421,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, | |
emin = memcg->memory.min; | |
elow = memcg->memory.low; | |
+ elow_level = memcg->memory.low_level; | |
parent = parent_mem_cgroup(memcg); | |
/* No parent means a non-hierarchical mode on v1 memcg */ | |
@@ -6417,11 +6457,18 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, | |
if (low_usage && siblings_low_usage) | |
elow = min(elow, parent_elow * low_usage / | |
siblings_low_usage); | |
+ | |
+ parent_elow_level = READ_ONCE(parent->memory.elow_level); | |
+ elow_level = min(elow_level, parent_elow_level); | |
+ } else { | |
+ elow_level = 0; | |
} | |
+ | |
exit: | |
memcg->memory.emin = emin; | |
memcg->memory.elow = elow; | |
+ memcg->memory.elow_level = elow_level; | |
if (usage <= emin) | |
return MEMCG_PROT_MIN; | |
diff --git a/mm/vmscan.c b/mm/vmscan.c | |
index ee47bbc..82897fa 100644 | |
--- a/mm/vmscan.c | |
+++ b/mm/vmscan.c | |
@@ -93,8 +93,9 @@ struct scan_control { | |
* unless we threaten to OOM. If any cgroups are skipped due to | |
* memory.low and nothing was reclaimed, go back for memory.low. | |
*/ | |
- unsigned int memcg_low_reclaim:1; | |
unsigned int memcg_low_skipped:1; | |
+ unsigned int memcg_low_level:3; | |
+ unsigned int memcg_low_step:2; | |
unsigned int hibernation_mode:1; | |
@@ -2463,7 +2464,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); | |
protection = mem_cgroup_protection(memcg, | |
- sc->memcg_low_reclaim); | |
+ sc->memcg_low_level > memcg->memory.elow_level); | |
if (protection) { | |
/* | |
@@ -2768,6 +2769,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |
do { | |
unsigned long reclaimed; | |
unsigned long scanned; | |
+ unsigned long step; | |
switch (mem_cgroup_protected(root, memcg)) { | |
case MEMCG_PROT_MIN: | |
@@ -2783,10 +2785,16 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |
* there is an unprotected supply | |
* of reclaimable memory from other cgroups. | |
*/ | |
- if (!sc->memcg_low_reclaim) { | |
+ if (sc->memcg_low_level <= memcg->memory.elow_level) { | |
+ // should record the min step | |
+ // TODO: init the step to max. | |
+ step = memcg->memory.elow_level - sc->memcg_low_level; | |
+ if (step < sc->memcg_low_step) | |
+ sc->memcg_low_step = step; | |
sc->memcg_low_skipped = 1; | |
continue; | |
} | |
+ | |
memcg_memory_event(memcg, MEMCG_LOW); | |
break; | |
case MEMCG_PROT_NONE: | |
@@ -3066,6 +3074,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |
pg_data_t *last_pgdat; | |
struct zoneref *z; | |
struct zone *zone; | |
+ | |
+ sc->memcg_low_step = MEMCG_LOW_LEVEL_MAX - 1; | |
retry: | |
delayacct_freepages_start(); | |
@@ -3112,9 +3122,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |
return 1; | |
/* Untapped cgroup reserves? Don't OOM, retry. */ | |
- if (sc->memcg_low_skipped) { | |
+ if (sc->memcg_low_skipped && | |
+ sc->memcg_low_level < MEMCG_LOW_LEVEL_MAX) { | |
+ // TODO: if still set the prio to initial_priority, | |
+ // it may take long time to finish. | |
+ // needs to improve, | |
+ // sc->priority = (initial_priority >> memcg_low_level); ? | |
sc->priority = initial_priority; | |
- sc->memcg_low_reclaim = 1; | |
+ | |
+ sc->memcg_low_level += sc->memcg_low_step + 1; | |
+ // considering memcg_low_step | |
sc->memcg_low_skipped = 0; | |
goto retry; | |
} | |
-- | |
1.8.3.1 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment