summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c329
1 files changed, 284 insertions, 45 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 598fece89e2b..abec50f31fe6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -67,6 +67,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
+#include "swap.h"
#include <linux/uaccess.h>
@@ -89,7 +90,7 @@ static bool cgroup_memory_nokmem __ro_after_init;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
-bool cgroup_memory_noswap __ro_after_init;
+static bool cgroup_memory_noswap __ro_after_init;
#else
#define cgroup_memory_noswap 1
#endif
@@ -209,7 +210,6 @@ static struct move_charge_struct {
enum res_type {
_MEM,
_MEMSWAP,
- _OOM_TYPE,
_KMEM,
_TCP,
};
@@ -217,8 +217,6 @@ enum res_type {
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
-/* Used for OOM notifier */
-#define OOM_CONTROL (0)
/*
* Iteration constructs for visiting all cgroups (under a tree). If
@@ -1013,9 +1011,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (!root)
root = root_mem_cgroup;
- if (prev && !reclaim)
- pos = prev;
-
rcu_read_lock();
if (reclaim) {
@@ -1024,7 +1019,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
mz = root->nodeinfo[reclaim->pgdat->node_id];
iter = &mz->iter;
- if (prev && reclaim->generation != iter->generation)
+ /*
+ * On start, join the current reclaim iteration cycle.
+ * Exit when a concurrent walker completes it.
+ */
+ if (!prev)
+ reclaim->generation = iter->generation;
+ else if (reclaim->generation != iter->generation)
goto out_unlock;
while (1) {
@@ -1041,6 +1042,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
*/
(void)cmpxchg(&iter->position, pos, NULL);
}
+ } else if (prev) {
+ pos = prev;
}
if (pos)
@@ -1065,15 +1068,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
* is provided by the caller, so we know it's alive
* and kicking, and don't take an extra reference.
*/
- memcg = mem_cgroup_from_css(css);
-
- if (css == &root->css)
- break;
-
- if (css_tryget(css))
+ if (css == &root->css || css_tryget(css)) {
+ memcg = mem_cgroup_from_css(css);
break;
-
- memcg = NULL;
+ }
}
if (reclaim) {
@@ -1089,8 +1087,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (!memcg)
iter->generation++;
- else if (!prev)
- reclaim->generation = iter->generation;
}
out_unlock:
@@ -1402,6 +1398,10 @@ static const struct memory_stat memory_stats[] = {
{ "sock", MEMCG_SOCK },
{ "vmalloc", MEMCG_VMALLOC },
{ "shmem", NR_SHMEM },
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ { "zswap", MEMCG_ZSWAP_B },
+ { "zswapped", MEMCG_ZSWAPPED },
+#endif
{ "file_mapped", NR_FILE_MAPPED },
{ "file_dirty", NR_FILE_DIRTY },
{ "file_writeback", NR_WRITEBACK },
@@ -1436,6 +1436,7 @@ static int memcg_page_state_unit(int item)
{
switch (item) {
case MEMCG_PERCPU_B:
+ case MEMCG_ZSWAP_B:
case NR_SLAB_RECLAIMABLE_B:
case NR_SLAB_UNRECLAIMABLE_B:
case WORKINGSET_REFAULT_ANON:
@@ -1516,6 +1517,13 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
memcg_events(memcg, PGLAZYFREED));
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN),
+ memcg_events(memcg, ZSWPIN));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPOUT),
+ memcg_events(memcg, ZSWPOUT));
+#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
memcg_events(memcg, THP_FAULT_ALLOC));
@@ -2887,6 +2895,19 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
return page_memcg_check(folio_page(folio, 0));
}
+static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg = NULL;
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ objcg = NULL;
+ }
+ return objcg;
+}
+
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
struct obj_cgroup *objcg = NULL;
@@ -2900,15 +2921,32 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
memcg = active_memcg();
else
memcg = mem_cgroup_from_task(current);
-
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
- objcg = rcu_dereference(memcg->objcg);
- if (objcg && obj_cgroup_tryget(objcg))
- break;
- objcg = NULL;
- }
+ objcg = __get_obj_cgroup_from_memcg(memcg);
rcu_read_unlock();
+ return objcg;
+}
+
+struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
+{
+ struct obj_cgroup *objcg;
+
+ if (!memcg_kmem_enabled() || memcg_kmem_bypass())
+ return NULL;
+
+ if (PageMemcgKmem(page)) {
+ objcg = __folio_objcg(page_folio(page));
+ obj_cgroup_get(objcg);
+ } else {
+ struct mem_cgroup *memcg;
+ rcu_read_lock();
+ memcg = __folio_memcg(page_folio(page));
+ if (memcg)
+ objcg = __get_obj_cgroup_from_memcg(memcg);
+ else
+ objcg = NULL;
+ rcu_read_unlock();
+ }
return objcg;
}
@@ -3387,7 +3425,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
int loop = 0;
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
- unsigned long nr_scanned;
if (order > 0)
return 0;
@@ -3415,13 +3452,10 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
if (!mz)
break;
- nr_scanned = 0;
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
- gfp_mask, &nr_scanned);
+ gfp_mask, total_scanned);
nr_reclaimed += reclaimed;
- *total_scanned += nr_scanned;
spin_lock_irq(&mctz->lock);
- __mem_cgroup_remove_exceeded(mz, mctz);
/*
* If we failed to reclaim anything from this memory cgroup
@@ -4893,7 +4927,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
.name = "oom_control",
.seq_show = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
- .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
@@ -5151,6 +5184,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ memcg->zswap_max = PAGE_COUNTER_MAX;
+#endif
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
@@ -5649,10 +5685,14 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (pte_present(ptent))
page = mc_handle_present_pte(vma, addr, ptent);
+ else if (pte_none_mostly(ptent))
+ /*
+ * PTE markers should be treated as a none pte here, separated
+ * from other swap handling below.
+ */
+ page = mc_handle_file_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
- else if (pte_none(ptent))
- page = mc_handle_file_pte(vma, addr, ptent);
if (!page && !ent.val)
return ret;
@@ -6108,6 +6148,14 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
+static u64 memory_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)memcg->memory.watermark * PAGE_SIZE;
+}
+
static int memory_min_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -6365,6 +6413,46 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
+static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned long nr_to_reclaim, nr_reclaimed = 0;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "", &nr_to_reclaim);
+ if (err)
+ return err;
+
+ while (nr_reclaimed < nr_to_reclaim) {
+ unsigned long reclaimed;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ /*
+ * This is the final attempt, drain percpu lru caches in the
+ * hope of introducing more evictable pages for
+ * try_to_free_mem_cgroup_pages().
+ */
+ if (!nr_retries)
+ lru_add_drain_all();
+
+ reclaimed = try_to_free_mem_cgroup_pages(memcg,
+ nr_to_reclaim - nr_reclaimed,
+ GFP_KERNEL, true);
+
+ if (!reclaimed && !nr_retries--)
+ return -EAGAIN;
+
+ nr_reclaimed += reclaimed;
+ }
+
+ return nbytes;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -6372,6 +6460,11 @@ static struct cftype memory_files[] = {
.read_u64 = memory_current_read,
},
{
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = memory_peak_read,
+ },
+ {
.name = "min",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_min_show,
@@ -6423,6 +6516,11 @@ static struct cftype memory_files[] = {
.seq_show = memory_oom_group_show,
.write = memory_oom_group_write,
},
+ {
+ .name = "reclaim",
+ .flags = CFTYPE_NS_DELEGATABLE,
+ .write = memory_reclaim,
+ },
{ } /* terminate */
};
@@ -6593,9 +6691,6 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
return;
parent = parent_mem_cgroup(memcg);
- /* No parent means a non-hierarchical mode on v1 memcg */
- if (!parent)
- return;
if (parent == root) {
memcg->memory.emin = READ_ONCE(memcg->memory.min);
@@ -7125,17 +7220,17 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
}
/**
- * __mem_cgroup_try_charge_swap - try charging swap space for a page
- * @page: page being added to swap
+ * __mem_cgroup_try_charge_swap - try charging swap space for a folio
+ * @folio: folio being added to swap
* @entry: swap entry to charge
*
- * Try to charge @page's memcg for the swap space at @entry.
+ * Try to charge @folio's memcg for the swap space at @entry.
*
* Returns 0 on success, -ENOMEM on failure.
*/
-int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
{
- unsigned int nr_pages = thp_nr_pages(page);
+ unsigned int nr_pages = folio_nr_pages(folio);
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
@@ -7143,9 +7238,9 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
- memcg = page_memcg(page);
+ memcg = folio_memcg(folio);
- VM_WARN_ON_ONCE_PAGE(!memcg, page);
+ VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
if (!memcg)
return 0;
@@ -7168,7 +7263,7 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
if (nr_pages > 1)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
- VM_BUG_ON_PAGE(oldid, page);
+ VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
return 0;
@@ -7371,6 +7466,148 @@ static struct cftype memsw_files[] = {
{ }, /* terminate */
};
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+/**
+ * obj_cgroup_may_zswap - check if this cgroup can zswap
+ * @objcg: the object cgroup
+ *
+ * Check if the hierarchical zswap limit has been reached.
+ *
+ * This doesn't check for specific headroom, and it is not atomic
+ * either. But with zswap, the size of the allocation is only known
+ * once compression has occured, and this optimistic pre-check avoids
+ * spending cycles on compression when there is already no room left
+ * or zswap is disabled altogether somewhere in the hierarchy.
+ */
+bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
+{
+ struct mem_cgroup *memcg, *original_memcg;
+ bool ret = true;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return true;
+
+ original_memcg = get_mem_cgroup_from_objcg(objcg);
+ for (memcg = original_memcg; memcg != root_mem_cgroup;
+ memcg = parent_mem_cgroup(memcg)) {
+ unsigned long max = READ_ONCE(memcg->zswap_max);
+ unsigned long pages;
+
+ if (max == PAGE_COUNTER_MAX)
+ continue;
+ if (max == 0) {
+ ret = false;
+ break;
+ }
+
+ cgroup_rstat_flush(memcg->css.cgroup);
+ pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
+ if (pages < max)
+ continue;
+ ret = false;
+ break;
+ }
+ mem_cgroup_put(original_memcg);
+ return ret;
+}
+
+/**
+ * obj_cgroup_charge_zswap - charge compression backend memory
+ * @objcg: the object cgroup
+ * @size: size of compressed object
+ *
+ * This forces the charge after obj_cgroup_may_swap() allowed
+ * compression and storage in zwap for this cgroup to go ahead.
+ */
+void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
+{
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
+
+ /* PF_MEMALLOC context, charging must succeed */
+ if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
+ VM_WARN_ON_ONCE(1);
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
+ mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
+ rcu_read_unlock();
+}
+
+/**
+ * obj_cgroup_uncharge_zswap - uncharge compression backend memory
+ * @objcg: the object cgroup
+ * @size: size of compressed object
+ *
+ * Uncharges zswap memory on page in.
+ */
+void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
+{
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ obj_cgroup_uncharge(objcg, size);
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
+ mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
+ rcu_read_unlock();
+}
+
+static u64 zswap_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ cgroup_rstat_flush(css->cgroup);
+ return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
+}
+
+static int zswap_max_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
+}
+
+static ssize_t zswap_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &max);
+ if (err)
+ return err;
+
+ xchg(&memcg->zswap_max, max);
+
+ return nbytes;
+}
+
+static struct cftype zswap_files[] = {
+ {
+ .name = "zswap.current",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = zswap_current_read,
+ },
+ {
+ .name = "zswap.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = zswap_max_show,
+ .write = zswap_max_write,
+ },
+ { } /* terminate */
+};
+#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
+
/*
* If mem_cgroup_swap_init() is implemented as a subsys_initcall()
* instead of a core_initcall(), this could mean cgroup_memory_noswap still
@@ -7389,7 +7626,9 @@ static int __init mem_cgroup_swap_init(void)
WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
-
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
+#endif
return 0;
}
core_initcall(mem_cgroup_swap_init);