diff --git a/patches/0005-le9uo.patch b/patches/0005-le9uo.patch
new file mode 100644
index 0000000..080dd73
--- /dev/null
+++ b/patches/0005-le9uo.patch
@@ -0,0 +1,482 @@
+From 95eac316f9bb32c4e33e64616876c2ab1dfaf3f4 Mon Sep 17 00:00:00 2001
+From: Eric Naim
+Date: Fri, 15 Nov 2024 15:22:08 +0800
+Subject: [PATCH] le9uo
+
+Signed-off-by: Eric Naim
+---
+ Documentation/admin-guide/sysctl/vm.rst | 72 ++++++++++++
+ include/linux/mm.h | 8 ++
+ kernel/sysctl.c | 34 ++++++
+ mm/Kconfig | 63 +++++++++++
+ mm/mm_init.c | 1 +
+ mm/vmscan.c | 143 +++++++++++++++++++++++-
+ 6 files changed, 317 insertions(+), 4 deletions(-)
+
+diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
+index f48eaa98d22d..fc777c14cff6 100644
+--- a/Documentation/admin-guide/sysctl/vm.rst
++++ b/Documentation/admin-guide/sysctl/vm.rst
+@@ -25,6 +25,9 @@ files can be found in mm/swap.c.
+ Currently, these files are in /proc/sys/vm:
+
+ - admin_reserve_kbytes
++- anon_min_ratio
++- clean_low_ratio
++- clean_min_ratio
+ - compact_memory
+ - compaction_proactiveness
+ - compact_unevictable_allowed
+@@ -108,6 +111,67 @@ On x86_64 this is about 128MB.
+ Changing this takes effect whenever an application requests memory.
+
+
++anon_min_ratio
++==============
++
++This knob provides *hard* protection of anonymous pages. The anonymous pages
++on the current node won't be reclaimed under any conditions when their amount
++is below vm.anon_min_ratio.
++
++This knob may be used to prevent excessive swap thrashing when anonymous
++memory is low (for example, when memory is going to be overfilled by
++compressed data of zram module).
++
++Setting this value too high (close to 100) can result in inability to
++swap and can lead to early OOM under memory pressure.
++
++The unit of measurement is the percentage of the total memory of the node.
++
++The default value is 15.
++
++
++clean_low_ratio
++================
++
++This knob provides *best-effort* protection of clean file pages. The file pages
++on the current node won't be reclaimed under memory pressure when the amount of
++clean file pages is below vm.clean_low_ratio *unless* we threaten to OOM.
++
++Protection of clean file pages using this knob may be used when swapping is
++still possible to
++ - prevent disk I/O thrashing under memory pressure;
++ - improve performance in disk cache-bound tasks under memory pressure.
++
++Setting it to a high value may result in an early eviction of anonymous pages
++into the swap space by attempting to hold the protected amount of clean file
++pages in memory.
++
++The unit of measurement is the percentage of the total memory of the node.
++
++The default value is 0.
++
++
++clean_min_ratio
++================
++
++This knob provides *hard* protection of clean file pages. The file pages on the
++current node won't be reclaimed under memory pressure when the amount of clean
++file pages is below vm.clean_min_ratio.
++
++Hard protection of clean file pages using this knob may be used to
++ - prevent disk I/O thrashing under memory pressure even with no free swap space;
++ - improve performance in disk cache-bound tasks under memory pressure;
++ - avoid high latency and prevent livelock in near-OOM conditions.
++
++Setting it to a high value may result in an early out-of-memory condition due to
++the inability to reclaim the protected amount of clean file pages when other
++types of pages cannot be reclaimed.
++
++The unit of measurement is the percentage of the total memory of the node.
++ ++The default value is 15. ++ ++ + compact_memory + ============== + +@@ -964,6 +1028,14 @@ be 133 (x + 2x = 200, 2x = 133.33). + At 0, the kernel will not initiate swap until the amount of free and + file-backed pages is less than the high watermark in a zone. + ++This knob has no effect if the amount of clean file pages on the current ++node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case, ++only anonymous pages can be reclaimed. ++ ++If the number of anonymous pages on the current node is below ++vm.anon_min_ratio, then only file pages can be reclaimed with ++any vm.swappiness value. ++ + + unprivileged_userfaultfd + ======================== +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 61fff5d34ed5..18dc2544700b 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -205,6 +205,14 @@ static inline void __mm_zero_struct_page(struct page *page) + + extern int sysctl_max_map_count; + ++extern bool sysctl_workingset_protection; ++extern u8 sysctl_anon_min_ratio; ++extern u8 sysctl_clean_low_ratio; ++extern u8 sysctl_clean_min_ratio; ++int vm_workingset_protection_update_handler( ++ const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++ + extern unsigned long sysctl_user_reserve_kbytes; + extern unsigned long sysctl_admin_reserve_kbytes; + +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 676e89dc38c3..4a62ae02a2c0 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -2210,6 +2210,40 @@ static struct ctl_table vm_table[] = { + .extra1 = SYSCTL_ZERO, + }, + #endif ++ { ++ .procname = "workingset_protection", ++ .data = &sysctl_workingset_protection, ++ .maxlen = sizeof(bool), ++ .mode = 0644, ++ .proc_handler = &proc_dobool, ++ }, ++ { ++ .procname = "anon_min_ratio", ++ .data = &sysctl_anon_min_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, ++ { ++ .procname = "clean_low_ratio", ++ .data = &sysctl_clean_low_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, ++ { ++ .procname = "clean_min_ratio", ++ .data = &sysctl_clean_min_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, +diff --git a/mm/Kconfig b/mm/Kconfig +index 6bfea371341e..87cb63f7ca57 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -499,6 +499,69 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP + config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP + bool + ++config ANON_MIN_RATIO ++ int "Default value for vm.anon_min_ratio" ++ depends on SYSCTL ++ range 0 100 ++ default 15 ++ help ++ This option sets the default value for vm.anon_min_ratio sysctl knob. ++ ++ The vm.anon_min_ratio sysctl knob provides *hard* protection of ++ anonymous pages. The anonymous pages on the current node won't be ++ reclaimed under any conditions when their amount is below ++ vm.anon_min_ratio. This knob may be used to prevent excessive swap ++ thrashing when anonymous memory is low (for example, when memory is ++ going to be overfilled by compressed data of zram module). ++ ++ Setting this value too high (close to MemTotal) can result in ++ inability to swap and can lead to early OOM under memory pressure. 
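For reference, the sysctl table entries added above surface the new knobs as /proc/sys/vm/workingset_protection, /proc/sys/vm/anon_min_ratio, /proc/sys/vm/clean_low_ratio and /proc/sys/vm/clean_min_ratio on a kernel that carries this patch. Below is a minimal userspace sketch of how a test harness might set them at runtime; the helper name and the values are illustrative only (they simply restate the defaults), and writing these files requires root.

#include <stdio.h>

/* Write a value to one of the /proc/sys/vm knobs registered above.
 * Returns 0 on success, -1 on failure (not root, knob absent, etc.). */
static int set_vm_knob(const char *name, const char *value)
{
	char path[128];
	FILE *f;
	int ret = 0;

	snprintf(path, sizeof(path), "/proc/sys/vm/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	if (fputs(value, f) == EOF)
		ret = -1;
	if (fclose(f) != 0)
		ret = -1;
	return ret;
}

int main(void)
{
	/* Illustrative values only; see the documentation text above. */
	set_vm_knob("workingset_protection", "1");
	set_vm_knob("anon_min_ratio", "15");
	set_vm_knob("clean_low_ratio", "0");
	set_vm_knob("clean_min_ratio", "15");
	return 0;
}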
++
++config CLEAN_LOW_RATIO
++ int "Default value for vm.clean_low_ratio"
++ depends on SYSCTL
++ range 0 100
++ default 0
++ help
++ This option sets the default value for vm.clean_low_ratio sysctl knob.
++
++ The vm.clean_low_ratio sysctl knob provides *best-effort*
++ protection of clean file pages. The file pages on the current node
++ won't be reclaimed under memory pressure when the amount of clean file
++ pages is below vm.clean_low_ratio *unless* we threaten to OOM.
++ Protection of clean file pages using this knob may be used when
++ swapping is still possible to
++ - prevent disk I/O thrashing under memory pressure;
++ - improve performance in disk cache-bound tasks under memory
++ pressure.
++
++ Setting it to a high value may result in an early eviction of anonymous
++ pages into the swap space by attempting to hold the protected amount
++ of clean file pages in memory.
++
++config CLEAN_MIN_RATIO
++ int "Default value for vm.clean_min_ratio"
++ depends on SYSCTL
++ range 0 100
++ default 15
++ help
++ This option sets the default value for vm.clean_min_ratio sysctl knob.
++
++ The vm.clean_min_ratio sysctl knob provides *hard* protection of
++ clean file pages. The file pages on the current node won't be
++ reclaimed under memory pressure when the amount of clean file pages is
++ below vm.clean_min_ratio. Hard protection of clean file pages using
++ this knob may be used to
++ - prevent disk I/O thrashing under memory pressure even with no free
++ swap space;
++ - improve performance in disk cache-bound tasks under memory
++ pressure;
++ - avoid high latency and prevent livelock in near-OOM conditions.
++
++ Setting it to a high value may result in an early out-of-memory condition
++ due to the inability to reclaim the protected amount of clean file pages
++ when other types of pages cannot be reclaimed.
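All three ratio knobs are applied the same way at reclaim time: a percentage of the node's total pages becomes an absolute threshold, clean page cache is estimated as the file LRU pages minus dirty pages, and a protection flag is raised while the protected page type sits below its threshold; prepare_workingset_protection() later in this patch performs exactly this arithmetic. The following standalone sketch models it with hypothetical names and illustrative numbers, not the kernel's own helpers.

#include <stdint.h>
#include <stdio.h>

/* Simplified model of how the ratio knobs become per-node thresholds.
 * All quantities are in pages; the names here are illustrative only. */
struct protection_state {
	int anon_below_min;   /* anon pages under the hard threshold  */
	int clean_below_low;  /* clean file pages under the low mark  */
	int clean_below_min;  /* clean file pages under the hard mark */
};

static struct protection_state
evaluate_protection(uint64_t node_total, uint64_t anon, uint64_t file,
		    uint64_t dirty, unsigned int anon_min_ratio,
		    unsigned int clean_low_ratio, unsigned int clean_min_ratio)
{
	/* Clean page cache is estimated as file LRU pages minus dirty
	 * pages, clamped at zero since the counters can drift apart. */
	uint64_t clean = file > dirty ? file - dirty : 0;
	struct protection_state s = {
		.anon_below_min  = anon  < node_total * anon_min_ratio  / 100,
		.clean_below_low = clean < node_total * clean_low_ratio / 100,
		.clean_below_min = clean < node_total * clean_min_ratio / 100,
	};

	return s;
}

int main(void)
{
	/* A hypothetical 1,000,000-page node with the default 15/0/15 ratios. */
	struct protection_state s =
		evaluate_protection(1000000, 100000, 300000, 20000, 15, 0, 15);

	printf("anon_below_min=%d clean_below_low=%d clean_below_min=%d\n",
	       s.anon_below_min, s.clean_below_low, s.clean_below_min);
	return 0;
}

With these example numbers only the anonymous hard threshold is crossed, so reclaim would leave anonymous pages alone and fall back to file pages.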
++ + config HAVE_MEMBLOCK_PHYS_MAP + bool + +diff --git a/mm/mm_init.c b/mm/mm_init.c +index 4ba5607aaf19..0a83f8973ddf 100644 +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -2628,6 +2628,7 @@ static void __init mem_init_print_info(void) + , K(totalhigh_pages()) + #endif + ); ++ printk(KERN_INFO "le9 Unofficial (le9uo) working set protection 1.7 by Masahito Suzuki (forked from hakavlad's original le9 patch)"); + } + + /* +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 99568ccfb0fd..af10b7d3407a 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -148,6 +148,15 @@ struct scan_control { + /* The file folios on the current node are dangerously low */ + unsigned int file_is_tiny:1; + ++ /* The anonymous pages on the current node are below vm.anon_min_ratio */ ++ unsigned int anon_below_min:1; ++ ++ /* The clean file pages on the current node are below vm.clean_low_ratio */ ++ unsigned int clean_below_low:1; ++ ++ /* The clean file pages on the current node are below vm.clean_min_ratio */ ++ unsigned int clean_below_min:1; ++ + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + +@@ -197,6 +206,15 @@ struct scan_control { + #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) + #endif + ++bool sysctl_workingset_protection __read_mostly = true; ++u8 sysctl_anon_min_ratio __read_mostly = CONFIG_ANON_MIN_RATIO; ++u8 sysctl_clean_low_ratio __read_mostly = CONFIG_CLEAN_LOW_RATIO; ++u8 sysctl_clean_min_ratio __read_mostly = CONFIG_CLEAN_MIN_RATIO; ++static u64 sysctl_anon_min_ratio_kb __read_mostly = 0; ++static u64 sysctl_clean_low_ratio_kb __read_mostly = 0; ++static u64 sysctl_clean_min_ratio_kb __read_mostly = 0; ++static u64 workingset_protection_prev_totalram __read_mostly = 0; ++ + /* + * From 0 .. MAX_SWAPPINESS. Higher means more swappy. + */ +@@ -1094,6 +1112,10 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, + folio_mapped(folio) && folio_test_referenced(folio)) + goto keep_locked; + ++ if (folio_is_file_lru(folio) ? sc->clean_below_min : ++ (sc->anon_below_min && !sc->clean_below_min)) ++ goto keep_locked; ++ + /* + * The number of dirty pages determines if a node is marked + * reclaim_congested. kswapd will stall and start writing +@@ -2419,6 +2441,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + goto out; + } + ++ /* ++ * Force-scan anon if clean file pages is under vm.clean_low_ratio ++ * or vm.clean_min_ratio. ++ */ ++ if (sc->clean_below_low || sc->clean_below_min) { ++ scan_balance = SCAN_ANON; ++ goto out; ++ } ++ + /* + * If there is enough inactive page cache, we do not reclaim + * anything from the anonymous working right now. +@@ -2563,6 +2594,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + BUG(); + } + ++ /* ++ * Hard protection of the working set. ++ * Don't reclaim anon/file pages when the amount is ++ * below the watermark of the same type. ++ */ ++ if (file ? 
sc->clean_below_min : sc->anon_below_min) ++ scan = 0; ++ + nr[lru] = scan; + } + } +@@ -3996,11 +4035,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc + } + + /* to protect the working set of the last N jiffies */ +-#ifdef CONFIG_CACHY +-static unsigned long lru_gen_min_ttl __read_mostly = 1000; +-#else + static unsigned long lru_gen_min_ttl __read_mostly; +-#endif + + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { +@@ -4038,6 +4073,96 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + } + } + ++int vm_workingset_protection_update_handler(const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ workingset_protection_prev_totalram = 0; ++ ++ return 0; ++} ++ ++static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc) ++{ ++ unsigned long node_mem_total; ++ struct sysinfo i; ++ ++ if (!(sysctl_workingset_protection)) { ++ sc->anon_below_min = 0; ++ sc->clean_below_low = 0; ++ sc->clean_below_min = 0; ++ return; ++ } ++ ++ if (likely(sysctl_anon_min_ratio || ++ sysctl_clean_low_ratio || ++ sysctl_clean_min_ratio)) { ++#ifdef CONFIG_NUMA ++ si_meminfo_node(&i, pgdat->node_id); ++#else //CONFIG_NUMA ++ si_meminfo(&i); ++#endif //CONFIG_NUMA ++ node_mem_total = i.totalram; ++ ++ if (unlikely(workingset_protection_prev_totalram != node_mem_total)) { ++ sysctl_anon_min_ratio_kb = ++ node_mem_total * sysctl_anon_min_ratio / 100; ++ sysctl_clean_low_ratio_kb = ++ node_mem_total * sysctl_clean_low_ratio / 100; ++ sysctl_clean_min_ratio_kb = ++ node_mem_total * sysctl_clean_min_ratio / 100; ++ workingset_protection_prev_totalram = node_mem_total; ++ } ++ } ++ ++ /* ++ * Check the number of anonymous pages to protect them from ++ * reclaiming if their amount is below the specified. ++ */ ++ if (sysctl_anon_min_ratio) { ++ unsigned long reclaimable_anon; ++ ++ reclaimable_anon = ++ node_page_state(pgdat, NR_ACTIVE_ANON) + ++ node_page_state(pgdat, NR_INACTIVE_ANON) + ++ node_page_state(pgdat, NR_ISOLATED_ANON); ++ ++ sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb; ++ } else ++ sc->anon_below_min = 0; ++ ++ /* ++ * Check the number of clean file pages to protect them from ++ * reclaiming if their amount is below the specified. ++ */ ++ if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) { ++ unsigned long reclaimable_file, dirty, clean; ++ ++ reclaimable_file = ++ node_page_state(pgdat, NR_ACTIVE_FILE) + ++ node_page_state(pgdat, NR_INACTIVE_FILE) + ++ node_page_state(pgdat, NR_ISOLATED_FILE); ++ dirty = node_page_state(pgdat, NR_FILE_DIRTY); ++ /* ++ * node_page_state() sum can go out of sync since ++ * all the values are not read at once. 
++ */ ++ if (likely(reclaimable_file > dirty)) ++ clean = reclaimable_file - dirty; ++ else ++ clean = 0; ++ ++ sc->clean_below_low = clean < sysctl_clean_low_ratio_kb; ++ sc->clean_below_min = clean < sysctl_clean_min_ratio_kb; ++ } else { ++ sc->clean_below_low = 0; ++ sc->clean_below_min = 0; ++ } ++} ++ + /****************************************************************************** + * rmap/PT walk feedback + ******************************************************************************/ +@@ -4536,6 +4661,12 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw + */ + if (!swappiness) + type = LRU_GEN_FILE; ++ else if (sc->clean_below_min) ++ type = LRU_GEN_ANON; ++ else if (sc->anon_below_min) ++ type = LRU_GEN_FILE; ++ else if (sc->clean_below_low) ++ type = LRU_GEN_ANON; + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) + type = LRU_GEN_ANON; + else if (swappiness == 1) +@@ -4815,6 +4946,8 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + ++ prepare_workingset_protection(pgdat, sc); ++ + /* lru_gen_age_node() called mem_cgroup_calculate_protection() */ + if (mem_cgroup_below_min(NULL, memcg)) + return MEMCG_LRU_YOUNG; +@@ -5962,6 +6095,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + + prepare_scan_control(pgdat, sc); + ++ prepare_workingset_protection(pgdat, sc); ++ + shrink_node_memcgs(pgdat, sc); + + flush_reclaim_state(sc); +-- +2.47.0
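Taken together, the vmscan.c hunks above bias reclaim in a fixed priority order: hard clean-file protection forces anonymous scanning, hard anonymous protection forces file scanning, and best-effort clean-file protection again prefers anonymous pages; otherwise the usual swappiness-driven balance applies. The sketch below condenses the order used by the isolate_folios() and get_scan_count() changes; it is a model of the logic, not the kernel code itself.

#include <stdio.h>

enum scan_type { SCAN_FILE_ONLY, SCAN_ANON_ONLY, SCAN_BOTH };

/* Condensed priority order implied by the hunks above: hard clean-file
 * protection wins, then hard anon protection, then best-effort
 * clean-file protection; otherwise both LRU types stay eligible. */
static enum scan_type pick_scan_type(int anon_below_min,
				     int clean_below_low, int clean_below_min)
{
	if (clean_below_min)
		return SCAN_ANON_ONLY;	/* clean file pages are hard-protected */
	if (anon_below_min)
		return SCAN_FILE_ONLY;	/* anon pages are hard-protected */
	if (clean_below_low)
		return SCAN_ANON_ONLY;	/* best-effort file protection */
	return SCAN_BOTH;		/* normal swappiness-driven balance */
}

int main(void)
{
	printf("%d\n", pick_scan_type(1, 0, 0)); /* anon protected -> file only */
	printf("%d\n", pick_scan_type(0, 1, 0)); /* clean low -> prefer anon */
	printf("%d\n", pick_scan_type(0, 0, 0)); /* no protection active */
	return 0;
}

In the patch itself the hard-protected type is additionally skipped outright in shrink_folio_list() and given a zero scan target in get_scan_count(), so protected pages are also left alone once they have already been isolated for reclaim.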