diff --git a/VERSION b/VERSION index 5f6c086..021c940 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -6.8.1 +6.8.3 diff --git a/config b/config index 53b954e..489c008 100644 --- a/config +++ b/config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 6.8.1 Kernel Configuration +# Linux/x86 6.8.3 Kernel Configuration # CONFIG_CC_VERSION_TEXT="gcc (GCC) 13.2.1 20230801" CONFIG_CC_IS_GCC=y @@ -24,6 +24,7 @@ CONFIG_PAHOLE_VERSION=126 CONFIG_IRQ_WORK=y CONFIG_BUILDTIME_TABLE_SORT=y CONFIG_THREAD_INFO_IN_TASK=y +# CONFIG_ECHO_SCHED is not set # # General setup @@ -518,7 +519,6 @@ CONFIG_X86_DIRECT_GBPAGES=y CONFIG_X86_CPA_STATISTICS=y CONFIG_X86_MEM_ENCRYPT=y CONFIG_AMD_MEM_ENCRYPT=y -# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set CONFIG_NUMA=y CONFIG_AMD_NUMA=y CONFIG_X86_64_ACPI_NUMA=y @@ -561,6 +561,7 @@ CONFIG_HZ_300=y # CONFIG_HZ_500 is not set # CONFIG_HZ_600 is not set # CONFIG_HZ_750 is not set +# CONFIG_HZ_625 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=300 CONFIG_SCHED_HRTICK=y @@ -836,6 +837,8 @@ CONFIG_AS_SHA1_NI=y CONFIG_AS_SHA256_NI=y CONFIG_AS_TPAUSE=y CONFIG_AS_GFNI=y +CONFIG_AS_VAES=y +CONFIG_AS_VPCLMULQDQ=y CONFIG_AS_WRUSS=y # @@ -1024,6 +1027,7 @@ CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_MODULE_SIG=y # CONFIG_MODULE_SIG_FORCE is not set CONFIG_MODULE_SIG_ALL=y +# CONFIG_MODULE_SIG_SHA1 is not set # CONFIG_MODULE_SIG_SHA256 is not set # CONFIG_MODULE_SIG_SHA384 is not set CONFIG_MODULE_SIG_SHA512=y @@ -2053,7 +2057,6 @@ CONFIG_BT_BNEP_MC_FILTER=y CONFIG_BT_BNEP_PROTO_FILTER=y CONFIG_BT_CMTP=m CONFIG_BT_HIDP=m -# CONFIG_BT_HS is not set CONFIG_BT_LE=y CONFIG_BT_LE_L2CAP_ECRED=y CONFIG_BT_6LOWPAN=m @@ -6867,6 +6870,7 @@ CONFIG_DRM_AMD_DC=y CONFIG_DRM_AMD_DC_FP=y CONFIG_DRM_AMD_DC_SI=y CONFIG_DRM_AMD_SECURE_DISPLAY=y +CONFIG_AMD_PRIVATE_COLOR=y # end of Display Engine Configuration CONFIG_HSA_AMD=y @@ -10944,7 +10948,8 @@ CONFIG_ASYNC_XOR=m CONFIG_ASYNC_PQ=m CONFIG_ASYNC_RAID6_RECOV=m CONFIG_CRYPTO=y - +CONFIG_NTSYNC=m +CONFIG_ACPI_CALL=m # # Crypto core or helper # @@ -11808,6 +11813,7 @@ CONFIG_ASYNC_RAID6_TEST=m # CONFIG_TEST_OBJPOOL is not set CONFIG_ARCH_USE_MEMTEST=y CONFIG_MEMTEST=y + # CONFIG_HYPERV_TESTING is not set # end of Kernel Testing and Coverage diff --git a/patches/cachyos/0001-bore-cachy.patch b/patches/cachyos/0001-bore-cachy.patch index 8aeeab5..cf2571b 100644 --- a/patches/cachyos/0001-bore-cachy.patch +++ b/patches/cachyos/0001-bore-cachy.patch @@ -1,24 +1,24 @@ -From 1ab81cfa061f454316364a32761ce45a7479e616 Mon Sep 17 00:00:00 2001 +From 37fd243d8f075b558f54a36fc85887269310709c Mon Sep 17 00:00:00 2001 From: Piotr Gorski -Date: Thu, 7 Mar 2024 22:28:47 +0100 +Date: Tue, 26 Mar 2024 08:11:18 +0100 Subject: [PATCH] bore-cachy Signed-off-by: Piotr Gorski --- - include/linux/sched.h | 12 ++ - init/Kconfig | 19 +++ - kernel/sched/core.c | 148 +++++++++++++++++++ - kernel/sched/debug.c | 61 +++++++- - kernel/sched/fair.c | 319 ++++++++++++++++++++++++++++++++++++---- + include/linux/sched.h | 10 ++ + init/Kconfig | 17 +++ + kernel/sched/core.c | 144 +++++++++++++++++++++++++ + kernel/sched/debug.c | 60 ++++++++++- + kernel/sched/fair.c | 231 +++++++++++++++++++++++++++++++++++++--- kernel/sched/features.h | 4 + - kernel/sched/sched.h | 7 + - 7 files changed, 542 insertions(+), 28 deletions(-) + kernel/sched/sched.h | 7 ++ + 7 files changed, 457 insertions(+), 16 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h -index ffe8f618a..7ac6163f9 100644 +index ffe8f618a..0ab0b0424 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -547,6 +547,18 @@ struct sched_entity { +@@ -547,6 +547,16 @@ struct sched_entity { u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; @@ -28,8 +28,6 @@ index ffe8f618a..7ac6163f9 100644 + u8 curr_burst_penalty; + u8 burst_penalty; + u8 burst_score; -+ u32 burst_load; -+ bool on_cfs_rq; + u8 child_burst; + u32 child_burst_cnt; + u64 child_burst_last_cached; @@ -38,10 +36,10 @@ index ffe8f618a..7ac6163f9 100644 u64 slice; diff --git a/init/Kconfig b/init/Kconfig -index 47671886d..c99132cf6 100644 +index 9ea39297f..f9bb5401f 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -1299,6 +1299,25 @@ config CHECKPOINT_RESTORE +@@ -1299,6 +1299,23 @@ config CHECKPOINT_RESTORE If unsure, say N here. @@ -60,18 +58,16 @@ index 47671886d..c99132cf6 100644 + With a little impact to scheduling fairness, it may improve + responsiveness especially under heavy background workload. + -+ You can turn it off by setting the sysctl kernel.sched_bore = 0. -+ + If unsure, say Y here. + config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 9116bcc90..43e4311db 100644 +index 9116bcc90..fc3d7b48e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4507,6 +4507,143 @@ int wake_up_state(struct task_struct *p, unsigned int state) +@@ -4507,6 +4507,139 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } @@ -86,18 +82,14 @@ index 9116bcc90..43e4311db 100644 + init_task.se.curr_burst_penalty = 0; + init_task.se.burst_penalty = 0; + init_task.se.burst_score = 0; -+ init_task.se.on_cfs_rq = false; + init_task.se.child_burst_last_cached = 0; -+ init_task.se.burst_load = 0; +} + +void inline sched_fork_bore(struct task_struct *p) { + p->se.burst_time = 0; + p->se.curr_burst_penalty = 0; + p->se.burst_score = 0; -+ p->se.on_cfs_rq = false; + p->se.child_burst_last_cached = 0; -+ p->se.burst_load = 0; +} + +static u32 count_child_tasks(struct task_struct *p) { @@ -206,7 +198,7 @@ index 9116bcc90..43e4311db 100644 +} + +static void sched_post_fork_bore(struct task_struct *p) { -+ if (p->sched_class == &fair_sched_class && likely(sched_bore)) ++ if (p->sched_class == &fair_sched_class) + inherit_burst(p); + p->se.burst_penalty = p->se.prev_burst_penalty; +} @@ -215,7 +207,7 @@ index 9116bcc90..43e4311db 100644 /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. -@@ -4523,6 +4660,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4523,6 +4656,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; @@ -225,7 +217,7 @@ index 9116bcc90..43e4311db 100644 p->se.vlag = 0; p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node); -@@ -4839,6 +4979,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +@@ -4839,6 +4975,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) void sched_post_fork(struct task_struct *p) { @@ -235,20 +227,20 @@ index 9116bcc90..43e4311db 100644 uclamp_post_fork(p); } -@@ -9910,6 +10053,11 @@ void __init sched_init(void) +@@ -9910,6 +10049,11 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 4.5.2 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.0.3 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 8d5d98a58..a565363fd 100644 +index 8d5d98a58..b17861261 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -167,7 +167,52 @@ static const struct file_operations sched_feat_fops = { @@ -344,19 +336,18 @@ index 8d5d98a58..a565363fd 100644 #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif -@@ -1068,6 +1123,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, +@@ -1068,6 +1123,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.load.weight); #ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_BORE -+ P(se.burst_load); + P(se.burst_score); +#endif // CONFIG_SCHED_BORE P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index fc0a9de42..3ee4e7e70 100644 +index fc0a9de42..ae55f46a8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -369,7 +360,7 @@ index fc0a9de42..3ee4e7e70 100644 */ #include #include -@@ -64,28 +67,128 @@ +@@ -64,28 +67,125 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * @@ -412,14 +403,12 @@ index fc0a9de42..3ee4e7e70 100644 + +#ifdef CONFIG_SCHED_BORE +u8 __read_mostly sched_bore = 1; -+u8 __read_mostly sched_burst_score_rounding = 0; +u8 __read_mostly sched_burst_smoothness_long = 1; +u8 __read_mostly sched_burst_smoothness_short = 0; +u8 __read_mostly sched_burst_fork_atavistic = 2; +u8 __read_mostly sched_burst_penalty_offset = 22; +uint __read_mostly sched_burst_penalty_scale = 1280; +uint __read_mostly sched_burst_cache_lifetime = 60000000; -+u8 __read_mostly sched_vlag_deviation_limit = 11; +static int __maybe_unused thirty_two = 32; +static int __maybe_unused sixty_four = 64; +static int __maybe_unused maxval_12_bits = 4095; @@ -456,20 +445,19 @@ index fc0a9de42..3ee4e7e70 100644 + return __unscale_slice(delta, se->burst_score); +} + -+static void avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se); -+static void avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se); ++void reweight_task(struct task_struct *p, int prio); + +static void update_burst_score(struct sched_entity *se) { -+ struct cfs_rq *cfs_rq = cfs_rq_of(se); -+ u8 prev_score = se->burst_score; -+ u32 penalty = se->burst_penalty; -+ if (sched_burst_score_rounding) penalty += 0x2U; -+ se->burst_score = penalty >> 2; ++ if (!entity_is_task(se)) return; ++ struct task_struct *p = task_of(se); ++ u8 prio = p->static_prio - MAX_RT_PRIO; ++ u8 prev_prio = min(39, prio + se->burst_score); + -+ if ((se->burst_score != prev_score) && se->on_cfs_rq) { -+ avg_vruntime_sub(cfs_rq, se); -+ avg_vruntime_add(cfs_rq, se); -+ } ++ se->burst_score = se->burst_penalty >> 2; ++ ++ u8 new_prio = min(39, prio + se->burst_score); ++ if (new_prio != prev_prio) ++ reweight_task(p, new_prio); +} + +static void update_burst_penalty(struct sched_entity *se) { @@ -509,7 +497,7 @@ index fc0a9de42..3ee4e7e70 100644 int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) -@@ -136,12 +239,8 @@ int __weak arch_asym_cpu_priority(int cpu) +@@ -136,12 +236,8 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ @@ -522,7 +510,7 @@ index fc0a9de42..3ee4e7e70 100644 #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ -@@ -150,6 +249,87 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +@@ -150,6 +246,69 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { @@ -533,16 +521,7 @@ index fc0a9de42..3ee4e7e70 100644 + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, -+ { -+ .procname = "sched_burst_score_rounding", -+ .data = &sched_burst_score_rounding, -+ .maxlen = sizeof(u8), -+ .mode = 0644, -+ .proc_handler = proc_dou8vec_minmax, -+ .extra1 = SYSCTL_ZERO, ++ .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, + { @@ -597,20 +576,11 @@ index fc0a9de42..3ee4e7e70 100644 + .mode = 0644, + .proc_handler = proc_douintvec, + }, -+ { -+ .procname = "sched_vlag_deviation_limit", -+ .data = &sched_vlag_deviation_limit, -+ .maxlen = sizeof(u8), -+ .mode = 0644, -+ .proc_handler = proc_dou8vec_minmax, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = &thirty_two, -+ }, +#endif // CONFIG_SCHED_BORE #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", -@@ -208,6 +388,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) +@@ -208,6 +367,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ @@ -624,7 +594,7 @@ index fc0a9de42..3ee4e7e70 100644 static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); -@@ -238,6 +425,7 @@ static void update_sysctl(void) +@@ -238,6 +404,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } @@ -632,130 +602,17 @@ index fc0a9de42..3ee4e7e70 100644 void __init sched_init_granularity(void) { -@@ -311,6 +499,9 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = __calc_delta(delta, NICE_0_LOAD, &se->load); - -+#ifdef CONFIG_SCHED_BORE -+ if (likely(sched_bore)) delta = scale_slice(delta, se); -+#endif // CONFIG_SCHED_BORE - return delta; - } - -@@ -637,10 +828,26 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) - * - * As measured, the max (key * weight) value was ~44 bits for a kernel build. - */ -+#if !defined(CONFIG_SCHED_BORE) -+#define entity_weight(se) scale_load_down(se->load.weight) -+#else // CONFIG_SCHED_BORE -+static unsigned long entity_weight(struct sched_entity *se) { -+ unsigned long weight = se->load.weight; -+ if (likely(sched_bore)) weight = unscale_slice(weight, se); -+#ifdef CONFIG_64BIT -+ weight >>= SCHED_FIXEDPOINT_SHIFT - 3; -+#endif // CONFIG_64BIT -+ return weight; -+} -+#endif // CONFIG_SCHED_BORE -+ - static void - avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) - { -- unsigned long weight = scale_load_down(se->load.weight); -+ unsigned long weight = entity_weight(se); -+#ifdef CONFIG_SCHED_BORE -+ se->burst_load = weight; -+#endif // CONFIG_SCHED_BORE - s64 key = entity_key(cfs_rq, se); - - cfs_rq->avg_vruntime += key * weight; -@@ -650,7 +857,12 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) - static void - avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) - { -- unsigned long weight = scale_load_down(se->load.weight); -+#if !defined(CONFIG_SCHED_BORE) -+ unsigned long weight = entity_weight(se); -+#else // CONFIG_SCHED_BORE -+ unsigned long weight = se->burst_load; -+ se->burst_load = 0; -+#endif // CONFIG_SCHED_BORE - s64 key = entity_key(cfs_rq, se); - - cfs_rq->avg_vruntime -= key * weight; -@@ -670,14 +882,14 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) - * Specifically: avg_runtime() + 0 must result in entity_eligible() := true - * For this to be so, the result of this function must have a left bias. - */ --u64 avg_vruntime(struct cfs_rq *cfs_rq) -+static u64 avg_key(struct cfs_rq *cfs_rq) - { - struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; - - if (curr && curr->on_rq) { -- unsigned long weight = scale_load_down(curr->load.weight); -+ unsigned long weight = entity_weight(curr); - - avg += entity_key(cfs_rq, curr) * weight; - load += weight; -@@ -687,12 +899,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) - /* sign flips effective floor / ceil */ - if (avg < 0) - avg -= (load - 1); -- avg = div_s64(avg, load); -+ avg = div64_s64(avg, load); - } - -- return cfs_rq->min_vruntime + avg; -+ return avg; - } - -+u64 avg_vruntime(struct cfs_rq *cfs_rq) { -+ return cfs_rq->min_vruntime + avg_key(cfs_rq); -+} - /* - * lag_i = S - s_i = w_i * (V - v_i) - * -@@ -717,6 +932,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -717,6 +884,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) lag = avg_vruntime(cfs_rq) - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); +#ifdef CONFIG_SCHED_BORE -+ if (likely(sched_bore)) limit >>= 1; ++ limit >>= 1; +#endif // CONFIG_SCHED_BORE se->vlag = clamp(lag, -limit, limit); } -@@ -744,7 +962,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) - long load = cfs_rq->avg_load; - - if (curr && curr->on_rq) { -- unsigned long weight = scale_load_down(curr->load.weight); -+ unsigned long weight = entity_weight(curr); - - avg += entity_key(cfs_rq, curr) * weight; - load += weight; -@@ -840,10 +1058,16 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) - se->min_vruntime = se->vruntime; - rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, - __entity_less, &min_vruntime_cb); -+#ifdef CONFIG_SCHED_BORE -+ se->on_cfs_rq = true; -+#endif // CONFIG_SCHED_BORE - } - - static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) - { -+#ifdef CONFIG_SCHED_BORE -+ se->on_cfs_rq = false; -+#endif // CONFIG_SCHED_BORE - rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, - &min_vruntime_cb); - avg_vruntime_sub(cfs_rq, se); -@@ -968,6 +1192,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +@@ -968,6 +1138,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ #ifdef CONFIG_SMP @@ -763,7 +620,7 @@ index fc0a9de42..3ee4e7e70 100644 int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); -@@ -979,6 +1204,7 @@ int sched_update_scaling(void) +@@ -979,6 +1150,7 @@ int sched_update_scaling(void) return 0; } @@ -771,7 +628,7 @@ index fc0a9de42..3ee4e7e70 100644 #endif #endif -@@ -1178,7 +1404,13 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -1178,7 +1350,13 @@ static void update_curr(struct cfs_rq *cfs_rq) if (unlikely(delta_exec <= 0)) return; @@ -785,54 +642,17 @@ index fc0a9de42..3ee4e7e70 100644 update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); -@@ -5170,8 +5402,8 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - { -- u64 vslice, vruntime = avg_vruntime(cfs_rq); -- s64 lag = 0; -+ s64 lag = 0, key = avg_key(cfs_rq); -+ u64 vslice, vruntime = cfs_rq->min_vruntime + key; - - se->slice = sysctl_sched_base_slice; - vslice = calc_delta_fair(se->slice, se); -@@ -5184,6 +5416,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5184,6 +5362,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ +#ifdef CONFIG_SCHED_BORE -+ if (unlikely(!sched_bore) || se->vlag) ++ if (se->vlag) +#endif // CONFIG_SCHED_BORE if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; -@@ -5244,12 +5479,22 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - */ - load = cfs_rq->avg_load; - if (curr && curr->on_rq) -- load += scale_load_down(curr->load.weight); -+ load += entity_weight(curr); - -- lag *= load + scale_load_down(se->load.weight); -+ lag *= load + entity_weight(se); -+#if !defined(CONFIG_SCHED_BORE) - if (WARN_ON_ONCE(!load)) -+#else // CONFIG_SCHED_BORE -+ if (unlikely(!load)) -+#endif // CONFIG_SCHED_BORE - load = 1; -- lag = div_s64(lag, load); -+ lag = div64_s64(lag, load); -+#ifdef CONFIG_SCHED_BORE -+ if (likely(sched_bore)) { -+ s64 limit = vslice << sched_vlag_deviation_limit; -+ lag = clamp(lag, -limit, limit); -+ } -+#endif // CONFIG_SCHED_BORE - } - - se->vruntime = vruntime - lag; -@@ -6816,6 +7061,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6816,6 +6997,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) bool was_sched_idle = sched_idle_rq(rq); util_est_dequeue(&rq->cfs, p); @@ -847,7 +667,7 @@ index fc0a9de42..3ee4e7e70 100644 for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); -@@ -8565,16 +8818,25 @@ static void yield_task_fair(struct rq *rq) +@@ -8565,16 +8754,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ @@ -873,7 +693,7 @@ index fc0a9de42..3ee4e7e70 100644 /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() -@@ -12664,6 +12926,9 @@ static void task_fork_fair(struct task_struct *p) +@@ -12664,6 +12862,9 @@ static void task_fork_fair(struct task_struct *p) curr = cfs_rq->curr; if (curr) update_curr(cfs_rq); diff --git a/patches/cachyos/0001-cachyos-base-all.patch b/patches/cachyos/0001-cachyos-base-all.patch index 148a8a7..84d0f67 100644 --- a/patches/cachyos/0001-cachyos-base-all.patch +++ b/patches/cachyos/0001-cachyos-base-all.patch @@ -1,7 +1,1204 @@ -From 8f03bb4df21c5746b9f1c3e399faa3c932737e4f Mon Sep 17 00:00:00 2001 +From 2b7dc22b0a950292985c4d5118c5eeaa51ea2918 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 15 Mar 2024 20:08:47 +0100 -Subject: [PATCH 1/7] amd-pstate +Date: Wed, 3 Apr 2024 17:06:09 +0200 +Subject: [PATCH 1/8] aex-xts + +Signed-off-by: Peter Jung +--- + arch/x86/Kconfig.assembler | 10 + + arch/x86/crypto/Makefile | 3 +- + arch/x86/crypto/aes-xts-avx-x86_64.S | 838 +++++++++++++++++++++++++++ + arch/x86/crypto/aesni-intel_glue.c | 270 ++++++++- + 4 files changed, 1118 insertions(+), 3 deletions(-) + create mode 100644 arch/x86/crypto/aes-xts-avx-x86_64.S + +diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler +index 8ad41da301e5..59aedf32c4ea 100644 +--- a/arch/x86/Kconfig.assembler ++++ b/arch/x86/Kconfig.assembler +@@ -25,6 +25,16 @@ config AS_GFNI + help + Supported by binutils >= 2.30 and LLVM integrated assembler + ++config AS_VAES ++ def_bool $(as-instr,vaesenc %ymm0$(comma)%ymm1$(comma)%ymm2) ++ help ++ Supported by binutils >= 2.30 and LLVM integrated assembler ++ ++config AS_VPCLMULQDQ ++ def_bool $(as-instr,vpclmulqdq \$0x10$(comma)%ymm0$(comma)%ymm1$(comma)%ymm2) ++ help ++ Supported by binutils >= 2.30 and LLVM integrated assembler ++ + config AS_WRUSS + def_bool $(as-instr,wrussq %rax$(comma)(%rbx)) + help +diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile +index 9aa46093c91b..9c5ce5613738 100644 +--- a/arch/x86/crypto/Makefile ++++ b/arch/x86/crypto/Makefile +@@ -48,7 +48,8 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o + + obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o + aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o +-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ++aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \ ++ aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o + + obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o + sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o +diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S +new file mode 100644 +index 000000000000..b8005d0205f8 +--- /dev/null ++++ b/arch/x86/crypto/aes-xts-avx-x86_64.S +@@ -0,0 +1,838 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++/* ++ * AES-XTS for modern x86_64 CPUs ++ * ++ * Copyright 2024 Google LLC ++ * ++ * Author: Eric Biggers ++ */ ++ ++/* ++ * This file implements AES-XTS for modern x86_64 CPUs. To handle the ++ * complexities of coding for x86 SIMD, e.g. where every vector length needs ++ * different code, it uses a macro to generate several implementations that ++ * share similar source code but are targeted at different CPUs, listed below: ++ * ++ * AES-NI + AVX ++ * - 128-bit vectors (1 AES block per vector) ++ * - VEX-coded instructions ++ * - xmm0-xmm15 ++ * - This is for older CPUs that lack VAES but do have AVX. ++ * ++ * VAES + VPCLMULQDQ + AVX2 ++ * - 256-bit vectors (2 AES blocks per vector) ++ * - VEX-coded instructions ++ * - ymm0-ymm15 ++ * - This is for CPUs that have VAES but lack AVX512 or AVX10, ++ * e.g. Intel's Alder Lake and AMD's Zen 3. ++ * ++ * VAES + VPCLMULQDQ + AVX10/256 + BMI2 ++ * - 256-bit vectors (2 AES blocks per vector) ++ * - EVEX-coded instructions ++ * - ymm0-ymm31 ++ * - This is for CPUs that have AVX512 but where using zmm registers causes ++ * downclocking, and for CPUs that have AVX10/256 but not AVX10/512. ++ * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256. ++ * To avoid confusion with 512-bit, we just write AVX10/256. ++ * ++ * VAES + VPCLMULQDQ + AVX10/512 + BMI2 ++ * - Same as the previous one, but upgrades to 512-bit vectors ++ * (4 AES blocks per vector) in zmm0-zmm31. ++ * - This is for CPUs that have good AVX512 or AVX10/512 support. ++ * ++ * This file doesn't have an implementation for AES-NI alone (without AVX), as ++ * the lack of VEX would make all the assembly code different. ++ * ++ * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of ++ * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be ++ * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might ++ * need to start also providing an implementation using VAES alone. ++ * ++ * The AES-XTS implementations in this file support everything required by the ++ * crypto API, including support for arbitrary input lengths and multi-part ++ * processing. However, they are most heavily optimized for the common case of ++ * power-of-2 length inputs that are processed in a single part (disk sectors). ++ */ ++ ++#include ++#include ++ ++.section .rodata ++.p2align 4 ++.Lgf_poly: ++ // The low 64 bits of this value represent the polynomial x^7 + x^2 + x ++ // + 1. It is the value that must be XOR'd into the low 64 bits of the ++ // tweak each time a 1 is carried out of the high 64 bits. ++ // ++ // The high 64 bits of this value is just the internal carry bit that ++ // exists when there's a carry out of the low 64 bits of the tweak. ++ .quad 0x87, 1 ++ ++ // This table contains constants for vpshufb and vpblendvb, used to ++ // handle variable byte shifts and blending during ciphertext stealing ++ // on CPUs that don't support AVX10-style masking. ++.Lcts_permute_table: ++ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 ++ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 ++ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ++ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f ++ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 ++ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 ++.text ++ ++// Function parameters ++.set KEY, %rdi // Initially points to crypto_aes_ctx, then is ++ // advanced to point directly to the round keys ++.set SRC, %rsi // Pointer to next source data ++.set DST, %rdx // Pointer to next destination data ++.set LEN, %rcx // Remaining length in bytes ++.set TWEAK, %r8 // Pointer to next tweak ++ ++// %r9d holds the AES key length in bytes. ++.set KEYLEN, %r9d ++ ++// %rax and %r10-r11 are available as temporaries. ++ ++.macro _define_Vi i ++.if VL == 16 ++ .set V\i, %xmm\i ++.elseif VL == 32 ++ .set V\i, %ymm\i ++.elseif VL == 64 ++ .set V\i, %zmm\i ++.else ++ .error "Unsupported Vector Length (VL)" ++.endif ++.endm ++ ++.macro _define_aliases ++ // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers ++ // are available, that map to the xmm, ymm, or zmm registers according ++ // to the selected Vector Length (VL). ++ _define_Vi 0 ++ _define_Vi 1 ++ _define_Vi 2 ++ _define_Vi 3 ++ _define_Vi 4 ++ _define_Vi 5 ++ _define_Vi 6 ++ _define_Vi 7 ++ _define_Vi 8 ++ _define_Vi 9 ++ _define_Vi 10 ++ _define_Vi 11 ++ _define_Vi 12 ++ _define_Vi 13 ++ _define_Vi 14 ++ _define_Vi 15 ++.if USE_AVX10 ++ _define_Vi 16 ++ _define_Vi 17 ++ _define_Vi 18 ++ _define_Vi 19 ++ _define_Vi 20 ++ _define_Vi 21 ++ _define_Vi 22 ++ _define_Vi 23 ++ _define_Vi 24 ++ _define_Vi 25 ++ _define_Vi 26 ++ _define_Vi 27 ++ _define_Vi 28 ++ _define_Vi 29 ++ _define_Vi 30 ++ _define_Vi 31 ++.endif ++ ++ // V0-V3 hold the data blocks during the main loop, or temporary values ++ // otherwise. V4-V5 hold temporary values. ++ ++ // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak. ++ .set TWEAK0_XMM, %xmm6 ++ .set TWEAK0, V6 ++ .set TWEAK1_XMM, %xmm7 ++ .set TWEAK1, V7 ++ .set TWEAK2, V8 ++ .set TWEAK3, V9 ++ ++ // V10-V13 are used for computing the next values of TWEAK[0-3]. ++ .set NEXT_TWEAK0, V10 ++ .set NEXT_TWEAK1, V11 ++ .set NEXT_TWEAK2, V12 ++ .set NEXT_TWEAK3, V13 ++ ++ // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes. ++ .set GF_POLY_XMM, %xmm14 ++ .set GF_POLY, V14 ++ ++ // V15 holds the first AES round key, copied to all 128-bit lanes. ++ .set KEY0_XMM, %xmm15 ++ .set KEY0, V15 ++ ++ // If 32 SIMD registers are available, then V16-V29 hold the remaining ++ // AES round keys, copied to all 128-bit lanes. ++.if USE_AVX10 ++ .set KEY1_XMM, %xmm16 ++ .set KEY1, V16 ++ .set KEY2_XMM, %xmm17 ++ .set KEY2, V17 ++ .set KEY3_XMM, %xmm18 ++ .set KEY3, V18 ++ .set KEY4_XMM, %xmm19 ++ .set KEY4, V19 ++ .set KEY5_XMM, %xmm20 ++ .set KEY5, V20 ++ .set KEY6_XMM, %xmm21 ++ .set KEY6, V21 ++ .set KEY7_XMM, %xmm22 ++ .set KEY7, V22 ++ .set KEY8_XMM, %xmm23 ++ .set KEY8, V23 ++ .set KEY9_XMM, %xmm24 ++ .set KEY9, V24 ++ .set KEY10_XMM, %xmm25 ++ .set KEY10, V25 ++ .set KEY11_XMM, %xmm26 ++ .set KEY11, V26 ++ .set KEY12_XMM, %xmm27 ++ .set KEY12, V27 ++ .set KEY13_XMM, %xmm28 ++ .set KEY13, V28 ++ .set KEY14_XMM, %xmm29 ++ .set KEY14, V29 ++.endif ++ // V30-V31 are currently unused. ++.endm ++ ++// Move a vector between memory and a register. ++.macro _vmovdqu src, dst ++.if VL < 64 ++ vmovdqu \src, \dst ++.else ++ vmovdqu8 \src, \dst ++.endif ++.endm ++ ++// Broadcast a 128-bit value into a vector. ++.macro _vbroadcast128 src, dst ++.if VL == 16 && !USE_AVX10 ++ vmovdqu \src, \dst ++.elseif VL == 32 && !USE_AVX10 ++ vbroadcasti128 \src, \dst ++.else ++ vbroadcasti32x4 \src, \dst ++.endif ++.endm ++ ++// XOR two vectors together. ++.macro _vpxor src1, src2, dst ++.if USE_AVX10 ++ vpxord \src1, \src2, \dst ++.else ++ vpxor \src1, \src2, \dst ++.endif ++.endm ++ ++// XOR three vectors together. ++.macro _xor3 src1, src2, src3_and_dst ++.if USE_AVX10 ++ // vpternlogd with immediate 0x96 is a three-argument XOR. ++ vpternlogd $0x96, \src1, \src2, \src3_and_dst ++.else ++ vpxor \src1, \src3_and_dst, \src3_and_dst ++ vpxor \src2, \src3_and_dst, \src3_and_dst ++.endif ++.endm ++ ++// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak ++// (by multiplying by the polynomial 'x') and write it to \dst. ++.macro _next_tweak src, tmp, dst ++ vpshufd $0x13, \src, \tmp ++ vpaddq \src, \src, \dst ++ vpsrad $31, \tmp, \tmp ++ vpand GF_POLY_XMM, \tmp, \tmp ++ vpxor \tmp, \dst, \dst ++.endm ++ ++// Given the XTS tweak(s) in the vector \src, compute the next vector of ++// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst. ++// ++// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute ++// all tweaks in the vector in parallel. If VL=16, we just do the regular ++// computation without vpclmulqdq, as it's the faster method for a single tweak. ++.macro _next_tweakvec src, tmp1, tmp2, dst ++.if VL == 16 ++ _next_tweak \src, \tmp1, \dst ++.else ++ vpsrlq $64 - VL/16, \src, \tmp1 ++ vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2 ++ vpslldq $8, \tmp1, \tmp1 ++ vpsllq $VL/16, \src, \dst ++ _xor3 \tmp1, \tmp2, \dst ++.endif ++.endm ++ ++// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and ++// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5. ++.macro _compute_first_set_of_tweaks ++ vmovdqu (TWEAK), TWEAK0_XMM ++ _vbroadcast128 .Lgf_poly(%rip), GF_POLY ++.if VL == 16 ++ // With VL=16, multiplying by x serially is fastest. ++ _next_tweak TWEAK0, %xmm0, TWEAK1 ++ _next_tweak TWEAK1, %xmm0, TWEAK2 ++ _next_tweak TWEAK2, %xmm0, TWEAK3 ++.else ++.if VL == 32 ++ // Compute the second block of TWEAK0. ++ _next_tweak TWEAK0_XMM, %xmm0, %xmm1 ++ vinserti128 $1, %xmm1, TWEAK0, TWEAK0 ++.elseif VL == 64 ++ // Compute the remaining blocks of TWEAK0. ++ _next_tweak TWEAK0_XMM, %xmm0, %xmm1 ++ _next_tweak %xmm1, %xmm0, %xmm2 ++ _next_tweak %xmm2, %xmm0, %xmm3 ++ vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0 ++ vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0 ++ vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0 ++.endif ++ // Compute TWEAK[1-3] from TWEAK0. ++ vpsrlq $64 - 1*VL/16, TWEAK0, V0 ++ vpsrlq $64 - 2*VL/16, TWEAK0, V2 ++ vpsrlq $64 - 3*VL/16, TWEAK0, V4 ++ vpclmulqdq $0x01, GF_POLY, V0, V1 ++ vpclmulqdq $0x01, GF_POLY, V2, V3 ++ vpclmulqdq $0x01, GF_POLY, V4, V5 ++ vpslldq $8, V0, V0 ++ vpslldq $8, V2, V2 ++ vpslldq $8, V4, V4 ++ vpsllq $1*VL/16, TWEAK0, TWEAK1 ++ vpsllq $2*VL/16, TWEAK0, TWEAK2 ++ vpsllq $3*VL/16, TWEAK0, TWEAK3 ++.if USE_AVX10 ++ vpternlogd $0x96, V0, V1, TWEAK1 ++ vpternlogd $0x96, V2, V3, TWEAK2 ++ vpternlogd $0x96, V4, V5, TWEAK3 ++.else ++ vpxor V0, TWEAK1, TWEAK1 ++ vpxor V2, TWEAK2, TWEAK2 ++ vpxor V4, TWEAK3, TWEAK3 ++ vpxor V1, TWEAK1, TWEAK1 ++ vpxor V3, TWEAK2, TWEAK2 ++ vpxor V5, TWEAK3, TWEAK3 ++.endif ++.endif ++.endm ++ ++// Do one step in computing the next set of tweaks using the method of just ++// multiplying by x repeatedly (the same method _next_tweak uses). ++.macro _tweak_step_mulx i ++.if \i == 0 ++ .set PREV_TWEAK, TWEAK3 ++ .set NEXT_TWEAK, NEXT_TWEAK0 ++.elseif \i == 5 ++ .set PREV_TWEAK, NEXT_TWEAK0 ++ .set NEXT_TWEAK, NEXT_TWEAK1 ++.elseif \i == 10 ++ .set PREV_TWEAK, NEXT_TWEAK1 ++ .set NEXT_TWEAK, NEXT_TWEAK2 ++.elseif \i == 15 ++ .set PREV_TWEAK, NEXT_TWEAK2 ++ .set NEXT_TWEAK, NEXT_TWEAK3 ++.endif ++.if \i < 20 && \i % 5 == 0 ++ vpshufd $0x13, PREV_TWEAK, V5 ++.elseif \i < 20 && \i % 5 == 1 ++ vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK ++.elseif \i < 20 && \i % 5 == 2 ++ vpsrad $31, V5, V5 ++.elseif \i < 20 && \i % 5 == 3 ++ vpand GF_POLY, V5, V5 ++.elseif \i < 20 && \i % 5 == 4 ++ vpxor V5, NEXT_TWEAK, NEXT_TWEAK ++.elseif \i == 1000 ++ vmovdqa NEXT_TWEAK0, TWEAK0 ++ vmovdqa NEXT_TWEAK1, TWEAK1 ++ vmovdqa NEXT_TWEAK2, TWEAK2 ++ vmovdqa NEXT_TWEAK3, TWEAK3 ++.endif ++.endm ++ ++// Do one step in computing the next set of tweaks using the VPCLMULQDQ method ++// (the same method _next_tweakvec uses for VL > 16). This means multiplying ++// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8 ++// when VL > 16 (which it is here), the needed shift amounts are byte-aligned, ++// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts. ++.macro _tweak_step_pclmul i ++.if \i == 2 ++ vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 ++.elseif \i == 4 ++ vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1 ++.elseif \i == 6 ++ vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2 ++.elseif \i == 8 ++ vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3 ++.elseif \i == 10 ++ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0 ++.elseif \i == 12 ++ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1 ++.elseif \i == 14 ++ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2 ++.elseif \i == 16 ++ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3 ++.elseif \i == 1000 ++ vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0 ++ vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1 ++ vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2 ++ vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3 ++ _vpxor NEXT_TWEAK0, TWEAK0, TWEAK0 ++ _vpxor NEXT_TWEAK1, TWEAK1, TWEAK1 ++ _vpxor NEXT_TWEAK2, TWEAK2, TWEAK2 ++ _vpxor NEXT_TWEAK3, TWEAK3, TWEAK3 ++.endif ++.endm ++ ++// _tweak_step does one step of the computation of the next set of tweaks from ++// TWEAK[0-3]. To complete all steps, this must be invoked with \i values 0 ++// through at least 19, then 1000 which signals the last step. ++// ++// This is used to interleave the computation of the next set of tweaks with the ++// AES en/decryptions, which increases performance in some cases. ++.macro _tweak_step i ++.if VL == 16 ++ _tweak_step_mulx \i ++.else ++ _tweak_step_pclmul \i ++.endif ++.endm ++ ++// Load the round keys: just the first one if !USE_AVX10, otherwise all of them. ++.macro _load_round_keys ++ _vbroadcast128 0*16(KEY), KEY0 ++.if USE_AVX10 ++ _vbroadcast128 1*16(KEY), KEY1 ++ _vbroadcast128 2*16(KEY), KEY2 ++ _vbroadcast128 3*16(KEY), KEY3 ++ _vbroadcast128 4*16(KEY), KEY4 ++ _vbroadcast128 5*16(KEY), KEY5 ++ _vbroadcast128 6*16(KEY), KEY6 ++ _vbroadcast128 7*16(KEY), KEY7 ++ _vbroadcast128 8*16(KEY), KEY8 ++ _vbroadcast128 9*16(KEY), KEY9 ++ _vbroadcast128 10*16(KEY), KEY10 ++ // Note: if it's AES-128 or AES-192, the last several round keys won't ++ // be used. We do the loads anyway to save a conditional jump. ++ _vbroadcast128 11*16(KEY), KEY11 ++ _vbroadcast128 12*16(KEY), KEY12 ++ _vbroadcast128 13*16(KEY), KEY13 ++ _vbroadcast128 14*16(KEY), KEY14 ++.endif ++.endm ++ ++// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0) ++// on the block(s) in \data using the round key(s) in \key. The register length ++// determines the number of AES blocks en/decrypted. ++.macro _vaes enc, last, key, data ++.if \enc ++.if \last ++ vaesenclast \key, \data, \data ++.else ++ vaesenc \key, \data, \data ++.endif ++.else ++.if \last ++ vaesdeclast \key, \data, \data ++.else ++ vaesdec \key, \data, \data ++.endif ++.endif ++.endm ++ ++// Do a single round of AES en/decryption on the block(s) in \data, using the ++// same key for all block(s). The round key is loaded from the appropriate ++// register or memory location for round \i. May clobber V4. ++.macro _vaes_1x enc, last, i, xmm_suffix, data ++.if USE_AVX10 ++ _vaes \enc, \last, KEY\i\xmm_suffix, \data ++.else ++.ifnb \xmm_suffix ++ _vaes \enc, \last, \i*16(KEY), \data ++.else ++ _vbroadcast128 \i*16(KEY), V4 ++ _vaes \enc, \last, V4, \data ++.endif ++.endif ++.endm ++ ++// Do a single round of AES en/decryption on the blocks in registers V0-V3, ++// using the same key for all blocks. The round key is loaded from the ++// appropriate register or memory location for round \i. In addition, does step ++// \i of the computation of the next set of tweaks. May clobber V4. ++.macro _vaes_4x enc, last, i ++.if USE_AVX10 ++ _tweak_step (2*(\i-1)) ++ _vaes \enc, \last, KEY\i, V0 ++ _vaes \enc, \last, KEY\i, V1 ++ _tweak_step (2*(\i-1) + 1) ++ _vaes \enc, \last, KEY\i, V2 ++ _vaes \enc, \last, KEY\i, V3 ++.else ++ _vbroadcast128 \i*16(KEY), V4 ++ _tweak_step (2*(\i-1)) ++ _vaes \enc, \last, V4, V0 ++ _vaes \enc, \last, V4, V1 ++ _tweak_step (2*(\i-1) + 1) ++ _vaes \enc, \last, V4, V2 ++ _vaes \enc, \last, V4, V3 ++.endif ++.endm ++ ++// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt, ++// then XOR with \tweak again) of the block(s) in \data. To process a single ++// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of ++// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4. ++.macro _aes_crypt enc, xmm_suffix, tweak, data ++ _xor3 KEY0\xmm_suffix, \tweak, \data ++ _vaes_1x \enc, 0, 1, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 2, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 3, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 4, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 5, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 6, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 7, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 8, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 9, \xmm_suffix, \data ++ cmp $24, KEYLEN ++ jle .Laes_128_or_192\@ ++ _vaes_1x \enc, 0, 10, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 11, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 12, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 13, \xmm_suffix, \data ++ _vaes_1x \enc, 1, 14, \xmm_suffix, \data ++ jmp .Laes_done\@ ++.Laes_128_or_192\@: ++ je .Laes_192\@ ++ _vaes_1x \enc, 1, 10, \xmm_suffix, \data ++ jmp .Laes_done\@ ++.Laes_192\@: ++ _vaes_1x \enc, 0, 10, \xmm_suffix, \data ++ _vaes_1x \enc, 0, 11, \xmm_suffix, \data ++ _vaes_1x \enc, 1, 12, \xmm_suffix, \data ++.Laes_done\@: ++ _vpxor \tweak, \data, \data ++.endm ++ ++.macro _aes_xts_crypt enc ++ _define_aliases ++ ++ // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256). ++ movl 480(KEY), KEYLEN ++ ++ // If decrypting, advance KEY to the decryption round keys. ++.if !\enc ++ add $240, KEY ++.endif ++ ++ // Check whether the data length is a multiple of the AES block length. ++ test $15, LEN ++ jnz .Lneed_cts\@ ++.Lxts_init\@: ++ ++ // Cache as many round keys as possible. ++ _load_round_keys ++ ++ // Compute the first set of tweaks TWEAK[0-3]. ++ _compute_first_set_of_tweaks ++ ++ sub $4*VL, LEN ++ jl .Lhandle_remainder\@ ++ ++.Lmain_loop\@: ++ // This is the main loop, en/decrypting 4*VL bytes per iteration. ++ ++ // XOR each source block with its tweak and the first round key. ++.if USE_AVX10 ++ vmovdqu8 0*VL(SRC), V0 ++ vmovdqu8 1*VL(SRC), V1 ++ vmovdqu8 2*VL(SRC), V2 ++ vmovdqu8 3*VL(SRC), V3 ++ vpternlogd $0x96, TWEAK0, KEY0, V0 ++ vpternlogd $0x96, TWEAK1, KEY0, V1 ++ vpternlogd $0x96, TWEAK2, KEY0, V2 ++ vpternlogd $0x96, TWEAK3, KEY0, V3 ++.else ++ vpxor 0*VL(SRC), KEY0, V0 ++ vpxor 1*VL(SRC), KEY0, V1 ++ vpxor 2*VL(SRC), KEY0, V2 ++ vpxor 3*VL(SRC), KEY0, V3 ++ vpxor TWEAK0, V0, V0 ++ vpxor TWEAK1, V1, V1 ++ vpxor TWEAK2, V2, V2 ++ vpxor TWEAK3, V3, V3 ++.endif ++ // Do all the AES rounds on the data blocks, interleaved with ++ // the computation of the next set of tweaks. ++ _vaes_4x \enc, 0, 1 ++ _vaes_4x \enc, 0, 2 ++ _vaes_4x \enc, 0, 3 ++ _vaes_4x \enc, 0, 4 ++ _vaes_4x \enc, 0, 5 ++ _vaes_4x \enc, 0, 6 ++ _vaes_4x \enc, 0, 7 ++ _vaes_4x \enc, 0, 8 ++ _vaes_4x \enc, 0, 9 ++ // Try to optimize for AES-256 by keeping the code for AES-128 and ++ // AES-192 out-of-line. ++ cmp $24, KEYLEN ++ jle .Lencrypt_4x_aes_128_or_192\@ ++ _vaes_4x \enc, 0, 10 ++ _vaes_4x \enc, 0, 11 ++ _vaes_4x \enc, 0, 12 ++ _vaes_4x \enc, 0, 13 ++ _vaes_4x \enc, 1, 14 ++.Lencrypt_4x_done\@: ++ ++ // XOR in the tweaks again. ++ _vpxor TWEAK0, V0, V0 ++ _vpxor TWEAK1, V1, V1 ++ _vpxor TWEAK2, V2, V2 ++ _vpxor TWEAK3, V3, V3 ++ ++ // Store the destination blocks. ++ _vmovdqu V0, 0*VL(DST) ++ _vmovdqu V1, 1*VL(DST) ++ _vmovdqu V2, 2*VL(DST) ++ _vmovdqu V3, 3*VL(DST) ++ ++ // Finish computing the next set of tweaks. ++ _tweak_step 1000 ++ ++ add $4*VL, SRC ++ add $4*VL, DST ++ sub $4*VL, LEN ++ jge .Lmain_loop\@ ++ ++ // Check for the uncommon case where the data length isn't a multiple of ++ // 4*VL. Handle it out-of-line in order to optimize for the common ++ // case. In the common case, just fall through to the ret. ++ test $4*VL-1, LEN ++ jnz .Lhandle_remainder\@ ++.Ldone\@: ++ // Store the next tweak back to *TWEAK to support continuation calls. ++ vmovdqu TWEAK0_XMM, (TWEAK) ++.if VL > 16 ++ vzeroupper ++.endif ++ RET ++ ++.Lhandle_remainder\@: ++ add $4*VL, LEN // Undo the extra sub from earlier. ++ ++ // En/decrypt any remaining full blocks, one vector at a time. ++.if VL > 16 ++ sub $VL, LEN ++ jl .Lvec_at_a_time_done\@ ++.Lvec_at_a_time\@: ++ _vmovdqu (SRC), V0 ++ _aes_crypt \enc, , TWEAK0, V0 ++ _vmovdqu V0, (DST) ++ _next_tweakvec TWEAK0, V0, V1, TWEAK0 ++ add $VL, SRC ++ add $VL, DST ++ sub $VL, LEN ++ jge .Lvec_at_a_time\@ ++.Lvec_at_a_time_done\@: ++ add $VL-16, LEN // Undo the extra sub from earlier. ++.else ++ sub $16, LEN ++.endif ++ ++ // En/decrypt any remaining full blocks, one at a time. ++ jl .Lblock_at_a_time_done\@ ++.Lblock_at_a_time\@: ++ vmovdqu (SRC), %xmm0 ++ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 ++ vmovdqu %xmm0, (DST) ++ _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM ++ add $16, SRC ++ add $16, DST ++ sub $16, LEN ++ jge .Lblock_at_a_time\@ ++.Lblock_at_a_time_done\@: ++ add $16, LEN // Undo the extra sub from earlier. ++ ++.Lfull_blocks_done\@: ++ // Now 0 <= LEN <= 15. If LEN is nonzero, do ciphertext stealing to ++ // process the last 16 + LEN bytes. If LEN is zero, we're done. ++ test LEN, LEN ++ jnz .Lcts\@ ++ jmp .Ldone\@ ++ ++ // Out-of-line handling of AES-128 and AES-192 ++.Lencrypt_4x_aes_128_or_192\@: ++ jz .Lencrypt_4x_aes_192\@ ++ _vaes_4x \enc, 1, 10 ++ jmp .Lencrypt_4x_done\@ ++.Lencrypt_4x_aes_192\@: ++ _vaes_4x \enc, 0, 10 ++ _vaes_4x \enc, 0, 11 ++ _vaes_4x \enc, 1, 12 ++ jmp .Lencrypt_4x_done\@ ++ ++.Lneed_cts\@: ++ // The data length isn't a multiple of the AES block length, so ++ // ciphertext stealing (CTS) will be needed. Subtract one block from ++ // LEN so that the main loop doesn't process the last full block. The ++ // CTS step will process it specially along with the partial block. ++ sub $16, LEN ++ jmp .Lxts_init\@ ++ ++.Lcts\@: ++ // Do ciphertext stealing (CTS) to en/decrypt the last full block and ++ // the partial block. CTS needs two tweaks. TWEAK0_XMM contains the ++ // next tweak; compute the one after that. Decryption uses these two ++ // tweaks in reverse order, so also define aliases to handle that. ++ _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM ++.if \enc ++ .set CTS_TWEAK0, TWEAK0_XMM ++ .set CTS_TWEAK1, TWEAK1_XMM ++.else ++ .set CTS_TWEAK0, TWEAK1_XMM ++ .set CTS_TWEAK1, TWEAK0_XMM ++.endif ++ ++ // En/decrypt the last full block. ++ vmovdqu (SRC), %xmm0 ++ _aes_crypt \enc, _XMM, CTS_TWEAK0, %xmm0 ++ ++.if USE_AVX10 ++ // Create a mask that has the first LEN bits set. ++ mov $-1, %rax ++ bzhi LEN, %rax, %rax ++ kmovq %rax, %k1 ++ ++ // Swap the first LEN bytes of the above result with the partial block. ++ // Note that to support in-place en/decryption, the load from the src ++ // partial block must happen before the store to the dst partial block. ++ vmovdqa %xmm0, %xmm1 ++ vmovdqu8 16(SRC), %xmm0{%k1} ++ vmovdqu8 %xmm1, 16(DST){%k1} ++.else ++ lea .Lcts_permute_table(%rip), %rax ++ ++ // Load the src partial block, left-aligned. Note that to support ++ // in-place en/decryption, this must happen before the store to the dst ++ // partial block. ++ vmovdqu (SRC, LEN, 1), %xmm1 ++ ++ // Shift the first LEN bytes of the en/decryption of the last full block ++ // to the end of a register, then store it to DST+LEN. This stores the ++ // dst partial block. It also writes to the second part of the dst last ++ // full block, but that part is overwritten later. ++ vpshufb (%rax, LEN, 1), %xmm0, %xmm2 ++ vmovdqu %xmm2, (DST, LEN, 1) ++ ++ // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...]. ++ sub LEN, %rax ++ vmovdqu 32(%rax), %xmm3 ++ ++ // Shift the src partial block to the beginning of its register. ++ vpshufb %xmm3, %xmm1, %xmm1 ++ ++ // Do a blend to generate the src partial block followed by the second ++ // part of the en/decryption of the last full block. ++ vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 ++.endif ++ // En/decrypt again and store the last full block. ++ _aes_crypt \enc, _XMM, CTS_TWEAK1, %xmm0 ++ vmovdqu %xmm0, (DST) ++ jmp .Ldone\@ ++.endm ++ ++// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, ++// u8 iv[AES_BLOCK_SIZE]); ++SYM_FUNC_START(aes_xts_encrypt_iv) ++ vmovdqu (%rsi), %xmm0 ++ vpxor 0*16(%rdi), %xmm0, %xmm0 ++ vaesenc 1*16(%rdi), %xmm0, %xmm0 ++ vaesenc 2*16(%rdi), %xmm0, %xmm0 ++ vaesenc 3*16(%rdi), %xmm0, %xmm0 ++ vaesenc 4*16(%rdi), %xmm0, %xmm0 ++ vaesenc 5*16(%rdi), %xmm0, %xmm0 ++ vaesenc 6*16(%rdi), %xmm0, %xmm0 ++ vaesenc 7*16(%rdi), %xmm0, %xmm0 ++ vaesenc 8*16(%rdi), %xmm0, %xmm0 ++ vaesenc 9*16(%rdi), %xmm0, %xmm0 ++ cmpl $24, 480(%rdi) ++ jle .Lencrypt_iv_aes_128_or_192 ++ vaesenc 10*16(%rdi), %xmm0, %xmm0 ++ vaesenc 11*16(%rdi), %xmm0, %xmm0 ++ vaesenc 12*16(%rdi), %xmm0, %xmm0 ++ vaesenc 13*16(%rdi), %xmm0, %xmm0 ++ vaesenclast 14*16(%rdi), %xmm0, %xmm0 ++.Lencrypt_iv_done: ++ vmovdqu %xmm0, (%rsi) ++ RET ++ ++ // Out-of-line handling of AES-128 and AES-192 ++.Lencrypt_iv_aes_128_or_192: ++ jz .Lencrypt_iv_aes_192 ++ vaesenclast 10*16(%rdi), %xmm0, %xmm0 ++ jmp .Lencrypt_iv_done ++.Lencrypt_iv_aes_192: ++ vaesenc 10*16(%rdi), %xmm0, %xmm0 ++ vaesenc 11*16(%rdi), %xmm0, %xmm0 ++ vaesenclast 12*16(%rdi), %xmm0, %xmm0 ++ jmp .Lencrypt_iv_done ++SYM_FUNC_END(aes_xts_encrypt_iv) ++ ++// Below are the actual AES-XTS encryption and decryption functions, ++// instantiated from the above macro. They all have the following prototype: ++// ++// void (*xts_asm_func)(const struct crypto_aes_ctx *key, ++// const u8 *src, u8 *dst, size_t len, ++// u8 tweak[AES_BLOCK_SIZE]); ++// ++// |key| is the data key. |tweak| contains the next tweak; the encryption of ++// the original IV with the tweak key was already done. This function supports ++// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and ++// |len| must be a multiple of 16 except on the last call. If |len| is a ++// multiple of 16, then this function updates |tweak| to contain the next tweak. ++ ++.set VL, 16 ++.set USE_AVX10, 0 ++SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx) ++ _aes_xts_crypt 1 ++SYM_FUNC_END(aes_xts_encrypt_aesni_avx) ++SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx) ++ _aes_xts_crypt 0 ++SYM_FUNC_END(aes_xts_decrypt_aesni_avx) ++ ++#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) ++.set VL, 32 ++.set USE_AVX10, 0 ++SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2) ++ _aes_xts_crypt 1 ++SYM_FUNC_END(aes_xts_encrypt_vaes_avx2) ++SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2) ++ _aes_xts_crypt 0 ++SYM_FUNC_END(aes_xts_decrypt_vaes_avx2) ++ ++.set VL, 32 ++.set USE_AVX10, 1 ++SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256) ++ _aes_xts_crypt 1 ++SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256) ++SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256) ++ _aes_xts_crypt 0 ++SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256) ++ ++.set VL, 64 ++.set USE_AVX10, 1 ++SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512) ++ _aes_xts_crypt 1 ++SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512) ++SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512) ++ _aes_xts_crypt 0 ++SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512) ++#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ +diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c +index b1d90c25975a..0855ace8659c 100644 +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -1137,7 +1137,264 @@ static struct skcipher_alg aesni_xctr = { + }; + + static struct simd_skcipher_alg *aesni_simd_xctr; +-#endif /* CONFIG_X86_64 */ ++ ++asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, ++ u8 iv[AES_BLOCK_SIZE]); ++ ++typedef void (*xts_asm_func)(const struct crypto_aes_ctx *key, ++ const u8 *src, u8 *dst, size_t len, ++ u8 tweak[AES_BLOCK_SIZE]); ++ ++/* This handles cases where the source and/or destination span pages. */ ++static noinline int ++xts_crypt_slowpath(struct skcipher_request *req, xts_asm_func asm_func) ++{ ++ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); ++ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); ++ int tail = req->cryptlen % AES_BLOCK_SIZE; ++ struct scatterlist sg_src[2], sg_dst[2]; ++ struct skcipher_request subreq; ++ struct skcipher_walk walk; ++ struct scatterlist *src, *dst; ++ int err; ++ ++ /* ++ * If the message length isn't divisible by the AES block size, then ++ * separate off the last full block and the partial block. This ensures ++ * that they are processed in the same call to the assembly function, ++ * which is required for ciphertext stealing. ++ */ ++ if (tail) { ++ skcipher_request_set_tfm(&subreq, tfm); ++ skcipher_request_set_callback(&subreq, ++ skcipher_request_flags(req), ++ NULL, NULL); ++ skcipher_request_set_crypt(&subreq, req->src, req->dst, ++ req->cryptlen - tail - AES_BLOCK_SIZE, ++ req->iv); ++ req = &subreq; ++ } ++ ++ err = skcipher_walk_virt(&walk, req, false); ++ ++ while (walk.nbytes) { ++ unsigned int nbytes = walk.nbytes; ++ ++ if (nbytes < walk.total) ++ nbytes = round_down(nbytes, AES_BLOCK_SIZE); ++ ++ kernel_fpu_begin(); ++ (*asm_func)(&ctx->crypt_ctx, walk.src.virt.addr, ++ walk.dst.virt.addr, nbytes, req->iv); ++ kernel_fpu_end(); ++ err = skcipher_walk_done(&walk, walk.nbytes - nbytes); ++ } ++ ++ if (err || !tail) ++ return err; ++ ++ /* Do ciphertext stealing with the last full block and partial block. */ ++ ++ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); ++ if (req->dst != req->src) ++ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); ++ ++ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, ++ req->iv); ++ ++ err = skcipher_walk_virt(&walk, req, false); ++ if (err) ++ return err; ++ ++ kernel_fpu_begin(); ++ (*asm_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr, ++ walk.nbytes, req->iv); ++ kernel_fpu_end(); ++ ++ return skcipher_walk_done(&walk, 0); ++} ++ ++/* __always_inline to avoid indirect call in fastpath */ ++static __always_inline int ++xts_crypt2(struct skcipher_request *req, xts_asm_func asm_func) ++{ ++ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); ++ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); ++ const unsigned int cryptlen = req->cryptlen; ++ struct scatterlist *src = req->src; ++ struct scatterlist *dst = req->dst; ++ ++ if (unlikely(cryptlen < AES_BLOCK_SIZE)) ++ return -EINVAL; ++ ++ kernel_fpu_begin(); ++ aes_xts_encrypt_iv(&ctx->tweak_ctx, req->iv); ++ ++ /* ++ * In practice, virtually all XTS plaintexts and ciphertexts are either ++ * 512 or 4096 bytes, aligned such that they don't span page boundaries. ++ * To optimize the performance of these cases, and also any other case ++ * where no page boundary is spanned, the below fast-path handles ++ * single-page sources and destinations as efficiently as possible. ++ */ ++ if (likely(src->length >= cryptlen && dst->length >= cryptlen && ++ src->offset + cryptlen <= PAGE_SIZE && ++ dst->offset + cryptlen <= PAGE_SIZE)) { ++ struct page *src_page = sg_page(src); ++ struct page *dst_page = sg_page(dst); ++ void *src_virt = kmap_local_page(src_page) + src->offset; ++ void *dst_virt = kmap_local_page(dst_page) + dst->offset; ++ ++ (*asm_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen, ++ req->iv); ++ kunmap_local(dst_virt); ++ kunmap_local(src_virt); ++ kernel_fpu_end(); ++ return 0; ++ } ++ kernel_fpu_end(); ++ return xts_crypt_slowpath(req, asm_func); ++} ++ ++#define DEFINE_XTS_ALG(suffix, driver_name, priority) \ ++ \ ++asmlinkage void aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, \ ++ const u8 *src, u8 *dst, size_t len, \ ++ u8 tweak[AES_BLOCK_SIZE]); \ ++asmlinkage void aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, \ ++ const u8 *src, u8 *dst, size_t len, \ ++ u8 tweak[AES_BLOCK_SIZE]); \ ++ \ ++static int xts_encrypt_##suffix(struct skcipher_request *req) \ ++{ \ ++ return xts_crypt2(req, aes_xts_encrypt_##suffix); \ ++} \ ++ \ ++static int xts_decrypt_##suffix(struct skcipher_request *req) \ ++{ \ ++ return xts_crypt2(req, aes_xts_decrypt_##suffix); \ ++} \ ++ \ ++static struct skcipher_alg aes_xts_alg_##suffix = { \ ++ .base = { \ ++ .cra_name = "__xts(aes)", \ ++ .cra_driver_name = "__" driver_name, \ ++ .cra_priority = priority, \ ++ .cra_flags = CRYPTO_ALG_INTERNAL, \ ++ .cra_blocksize = AES_BLOCK_SIZE, \ ++ .cra_ctxsize = XTS_AES_CTX_SIZE, \ ++ .cra_module = THIS_MODULE, \ ++ }, \ ++ .min_keysize = 2 * AES_MIN_KEY_SIZE, \ ++ .max_keysize = 2 * AES_MAX_KEY_SIZE, \ ++ .ivsize = AES_BLOCK_SIZE, \ ++ .walksize = 2 * AES_BLOCK_SIZE, \ ++ .setkey = xts_aesni_setkey, \ ++ .encrypt = xts_encrypt_##suffix, \ ++ .decrypt = xts_decrypt_##suffix, \ ++}; \ ++ \ ++static struct simd_skcipher_alg *aes_xts_simdalg_##suffix ++ ++DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500); ++#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) ++DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600); ++DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700); ++DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800); ++#endif ++ ++/* ++ * This is a list of CPU models that are known to suffer from downclocking when ++ * zmm registers (512-bit vectors) are used. On these CPUs, the AES-XTS ++ * implementation with zmm registers won't be used by default. An ++ * implementation with ymm registers (256-bit vectors) will be used instead. ++ */ ++static const struct x86_cpu_id zmm_exclusion_list[] = { ++ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_SKYLAKE_X }, ++ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_X }, ++ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_D }, ++ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE }, ++ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_L }, ++ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_NNPI }, ++ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE_L }, ++ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE }, ++ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */ ++ /* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). */ ++ {}, ++}; ++ ++static int __init register_xts_algs(void) ++{ ++ int err; ++ ++ if (!boot_cpu_has(X86_FEATURE_AVX)) ++ return 0; ++ err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1, ++ &aes_xts_simdalg_aesni_avx); ++ if (err) ++ return err; ++#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) ++ if (!boot_cpu_has(X86_FEATURE_AVX2) || ++ !boot_cpu_has(X86_FEATURE_VAES) || ++ !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) || ++ !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || ++ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) ++ return 0; ++ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1, ++ &aes_xts_simdalg_vaes_avx2); ++ if (err) ++ return err; ++ ++ if (!boot_cpu_has(X86_FEATURE_AVX512BW) || ++ !boot_cpu_has(X86_FEATURE_AVX512VL) || ++ !boot_cpu_has(X86_FEATURE_BMI2) || ++ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | ++ XFEATURE_MASK_AVX512, NULL)) ++ return 0; ++ ++ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1, ++ &aes_xts_simdalg_vaes_avx10_256); ++ if (err) ++ return err; ++ ++ if (x86_match_cpu(zmm_exclusion_list)) ++ aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; ++ ++ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1, ++ &aes_xts_simdalg_vaes_avx10_512); ++ if (err) ++ return err; ++#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ ++ return 0; ++} ++ ++static void unregister_xts_algs(void) ++{ ++ if (aes_xts_simdalg_aesni_avx) ++ simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, ++ &aes_xts_simdalg_aesni_avx); ++#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) ++ if (aes_xts_simdalg_vaes_avx2) ++ simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1, ++ &aes_xts_simdalg_vaes_avx2); ++ if (aes_xts_simdalg_vaes_avx10_256) ++ simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1, ++ &aes_xts_simdalg_vaes_avx10_256); ++ if (aes_xts_simdalg_vaes_avx10_512) ++ simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1, ++ &aes_xts_simdalg_vaes_avx10_512); ++#endif ++} ++#else /* CONFIG_X86_64 */ ++static int __init register_xts_algs(void) ++{ ++ return 0; ++} ++ ++static void unregister_xts_algs(void) ++{ ++} ++#endif /* !CONFIG_X86_64 */ + + #ifdef CONFIG_X86_64 + static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, +@@ -1276,13 +1533,21 @@ static int __init aesni_init(void) + goto unregister_aeads; + #endif /* CONFIG_X86_64 */ + ++ err = register_xts_algs(); ++ if (err) ++ goto unregister_xts; ++ + return 0; + ++unregister_xts: ++ unregister_xts_algs(); + #ifdef CONFIG_X86_64 ++ if (aesni_simd_xctr) ++ simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); + unregister_aeads: ++#endif /* CONFIG_X86_64 */ + simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), + aesni_simd_aeads); +-#endif /* CONFIG_X86_64 */ + + unregister_skciphers: + simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), +@@ -1303,6 +1568,7 @@ static void __exit aesni_exit(void) + if (boot_cpu_has(X86_FEATURE_AVX)) + simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); + #endif /* CONFIG_X86_64 */ ++ unregister_xts_algs(); + } + + late_initcall(aesni_init); +-- +2.44.0 + +From 4a47b09deb67c3854ac102bcb18ef0df00aae437 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Wed, 3 Apr 2024 17:06:20 +0200 +Subject: [PATCH 2/8] amd-pstate Signed-off-by: Peter Jung --- @@ -9,19 +1206,18 @@ Signed-off-by: Peter Jung Documentation/admin-guide/pm/amd-pstate.rst | 70 ++- arch/x86/Kconfig | 5 +- arch/x86/include/asm/msr-index.h | 2 + - arch/x86/kernel/acpi/cppc.c | 2 +- drivers/acpi/cppc_acpi.c | 17 +- drivers/acpi/processor_driver.c | 6 + drivers/cpufreq/acpi-cpufreq.c | 2 - drivers/cpufreq/amd-pstate-ut.c | 2 +- - drivers/cpufreq/amd-pstate.c | 501 +++++++++++++++--- + drivers/cpufreq/amd-pstate.c | 499 +++++++++++++++--- include/acpi/cppc_acpi.h | 5 + include/linux/amd-pstate.h | 32 +- include/linux/cpufreq.h | 1 + - 13 files changed, 562 insertions(+), 88 deletions(-) + 12 files changed, 560 insertions(+), 86 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 73062d47a462..a493d93e0d2c 100644 +index d2150bd3acc5..71ed7f1b0f9b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -374,6 +374,11 @@ @@ -132,7 +1328,7 @@ index 9eb26014d34b..82fbd01da658 100644 =============================================== diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 637e337c332e..de39c296ea3f 100644 +index 184730705650..70732a76171f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1054,8 +1054,9 @@ config SCHED_MC @@ -160,19 +1356,6 @@ index d1b5edaf6c34..bfe139eb75b6 100644 /* K6 MSRs */ #define MSR_K6_WHCR 0xc0000082 -diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c -index 8d8752b44f11..ff8f25faca3d 100644 ---- a/arch/x86/kernel/acpi/cppc.c -+++ b/arch/x86/kernel/acpi/cppc.c -@@ -20,7 +20,7 @@ bool cpc_supported_by_cpu(void) - (boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f))) - return true; - else if (boot_cpu_data.x86 == 0x17 && -- boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f) -+ boot_cpu_data.x86_model >= 0x30 && boot_cpu_data.x86_model <= 0x7f) - return true; - return boot_cpu_has(X86_FEATURE_CPPC); - } diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index d155a86a8614..e23a84f4a50a 100644 --- a/drivers/acpi/cppc_acpi.c @@ -260,7 +1443,7 @@ index f04ae67dda37..b3601b0e6dd3 100644 (policy->max == cpudata->nominal_freq)) amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 1791d37fbc53..651055df1710 100644 +index 07f341995439..651055df1710 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -37,6 +37,7 @@ @@ -455,7 +1638,7 @@ index 1791d37fbc53..651055df1710 100644 if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) amd_pstate_update_min_max_limit(policy); -@@ -564,13 +635,12 @@ static void amd_pstate_adjust_perf(unsigned int cpu, +@@ -564,7 +635,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu, cap_perf = READ_ONCE(cpudata->highest_perf); lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); @@ -463,13 +1646,6 @@ index 1791d37fbc53..651055df1710 100644 des_perf = cap_perf; if (target_perf < capacity) - des_perf = DIV_ROUND_UP(cap_perf * target_perf, capacity); - -- min_perf = READ_ONCE(cpudata->highest_perf); -+ min_perf = READ_ONCE(cpudata->lowest_perf); - if (_min_perf < capacity) - min_perf = DIV_ROUND_UP(cap_perf * _min_perf, capacity); - @@ -582,8 +652,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu, max_perf = min_perf; @@ -629,7 +1805,8 @@ index 1791d37fbc53..651055df1710 100644 - nominal_perf = READ_ONCE(cpudata->nominal_perf); + if (boot_cpu_has(X86_FEATURE_CPPC)) { + u64 cap1; -+ + +- if (highest_perf <= nominal_perf) + ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); + if (ret) + return ret; @@ -652,12 +1829,13 @@ index 1791d37fbc53..651055df1710 100644 +{ + int ret, prio; + u32 highest_perf; - -- if (highest_perf <= nominal_perf) ++ + ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf); + if (ret) -+ return; -+ + return; + +- cpudata->boost_supported = true; +- current_pstate_driver->boost_enabled = true; + cpudata->hw_prefcore = true; + /* check if CPPC preferred core feature is enabled*/ + if (highest_perf < CPPC_MAX_PERF) @@ -669,10 +1847,8 @@ index 1791d37fbc53..651055df1710 100644 + } + + if (!amd_pstate_prefcore) - return; - -- cpudata->boost_supported = true; -- current_pstate_driver->boost_enabled = true; ++ return; ++ + /* + * The priorities can be set regardless of whether or not + * sched_set_itmt_support(true) has been called and it is valid to @@ -1193,7 +2369,7 @@ index 6ad02ad9c7b4..e89cf1249715 100644 + #endif /* _LINUX_AMD_PSTATE_H */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h -index afda5f24d3dd..9bebeec24abb 100644 +index 320fab7d2e94..3129411fa978 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -263,6 +263,7 @@ static inline bool cpufreq_supports_freq_invariance(void) @@ -1207,10 +2383,10 @@ index afda5f24d3dd..9bebeec24abb 100644 -- 2.44.0 -From 93aefd5f98b793e9447e64dcbaa69221102e304a Mon Sep 17 00:00:00 2001 +From 7f2e4860d7405f71337e99ea74b84ebcd2c3b90c Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 26 Feb 2024 15:46:58 +0100 -Subject: [PATCH 2/7] bbr3 +Date: Wed, 3 Apr 2024 17:06:31 +0200 +Subject: [PATCH 3/8] bbr3 Signed-off-by: Peter Jung --- @@ -1564,7 +2740,7 @@ index ae8b15e6896f..beb040e80b6f 100644 .undo_cwnd = bpf_tcp_ca_undo_cwnd, .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index c82dc42f57c6..1bc25bc01a8d 100644 +index a4f418592314..58469fe5195e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3089,6 +3089,7 @@ int tcp_disconnect(struct sock *sk, int flags) @@ -4375,10 +5551,10 @@ index df7b13f0e5e0..8415aa41524e 100644 tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c -index 9e85f2a0bddd..914a75bb0734 100644 +index 0ecc7311dc6c..82622782486a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c -@@ -464,6 +464,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) +@@ -460,6 +460,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; @@ -4593,10 +5769,10 @@ index d1ad20ce1c8c..ef74f33c7905 100644 -- 2.44.0 -From fb681aa9768aa30b3b17152a221868238394dd64 Mon Sep 17 00:00:00 2001 +From 71b4361aff469d7e31d2260c0f689a976a1a89d0 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 26 Feb 2024 15:47:11 +0100 -Subject: [PATCH 3/7] block +Date: Wed, 3 Apr 2024 17:06:41 +0200 +Subject: [PATCH 4/8] block Signed-off-by: Peter Jung --- @@ -4864,7 +6040,7 @@ index 467e8cfc41a2..f44f5d4ec2f4 100644 * bic associated with the task issuing current bio for * merging. This and the next field are used as a support to diff --git a/block/mq-deadline.c b/block/mq-deadline.c -index f958e79277b8..1b0de4fc3958 100644 +index 02a916ba62ee..8bf621316a9e 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -79,10 +79,24 @@ struct dd_per_prio { @@ -4986,7 +6162,7 @@ index f958e79277b8..1b0de4fc3958 100644 return rq; } -@@ -706,6 +764,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) +@@ -705,6 +763,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) eq->elevator_data = dd; @@ -5000,7 +6176,7 @@ index f958e79277b8..1b0de4fc3958 100644 for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; -@@ -722,8 +787,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) +@@ -721,8 +786,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; @@ -5009,7 +6185,7 @@ index f958e79277b8..1b0de4fc3958 100644 /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); -@@ -779,7 +842,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, +@@ -778,7 +841,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, struct request *free = NULL; bool ret; @@ -5030,7 +6206,7 @@ index f958e79277b8..1b0de4fc3958 100644 ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); spin_unlock(&dd->lock); -@@ -792,10 +867,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, +@@ -791,10 +866,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, /* * add rq to rbtree and fifo */ @@ -5042,7 +6218,7 @@ index f958e79277b8..1b0de4fc3958 100644 struct deadline_data *dd = q->elevator->elevator_data; const enum dd_data_dir data_dir = rq_data_dir(rq); u16 ioprio = req_get_ioprio(rq); -@@ -867,19 +941,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, +@@ -866,19 +940,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; @@ -5068,7 +6244,7 @@ index f958e79277b8..1b0de4fc3958 100644 } /* Callback from inside blk_mq_rq_ctx_init(). */ -@@ -958,6 +1026,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) +@@ -957,6 +1025,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio prio; @@ -5082,14 +6258,15 @@ index f958e79277b8..1b0de4fc3958 100644 -- 2.44.0 -From 4f371ea8a1f8a47e624592a91f9e961080aec2eb Mon Sep 17 00:00:00 2001 +From b667355ece89a997a7b8508e6d6f1b5be46d3833 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 26 Feb 2024 15:47:21 +0100 -Subject: [PATCH 4/7] cachy +Date: Wed, 3 Apr 2024 17:06:52 +0200 +Subject: [PATCH 5/8] cachy Signed-off-by: Peter Jung --- .../admin-guide/kernel-parameters.txt | 9 + + Documentation/admin-guide/sysctl/vm.rst | 72 ++ Makefile | 162 ++++- arch/arm/Makefile | 56 +- arch/x86/Kconfig.cpu | 426 +++++++++++- @@ -5102,6 +6279,15 @@ Signed-off-by: Peter Jung block/elevator.c | 10 + drivers/ata/ahci.c | 23 +- drivers/cpufreq/Kconfig.x86 | 2 - + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 + + drivers/gpu/drm/amd/display/Kconfig | 6 + + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- + .../amd/display/amdgpu_dm/amdgpu_dm_color.c | 2 +- + .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 6 +- + .../amd/display/amdgpu_dm/amdgpu_dm_plane.c | 6 +- + drivers/gpu/drm/amd/pm/amdgpu_pm.c | 3 + + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 14 +- drivers/i2c/busses/Kconfig | 9 + drivers/i2c/busses/Makefile | 1 + drivers/i2c/busses/i2c-nct6775.c | 648 ++++++++++++++++++ @@ -5114,7 +6300,7 @@ Signed-off-by: Peter Jung drivers/platform/x86/Kconfig | 14 + drivers/platform/x86/Makefile | 3 + drivers/platform/x86/steamdeck.c | 523 ++++++++++++++ - include/linux/mm.h | 2 +- + include/linux/mm.h | 10 +- include/linux/pagemap.h | 2 +- include/linux/user_namespace.h | 4 + init/Kconfig | 26 + @@ -5122,26 +6308,27 @@ Signed-off-by: Peter Jung kernel/fork.c | 14 + kernel/sched/fair.c | 13 + kernel/sched/sched.h | 2 +- - kernel/sysctl.c | 12 + + kernel/sysctl.c | 46 ++ kernel/user_namespace.c | 7 + - mm/Kconfig | 2 +- + mm/Kconfig | 65 +- mm/compaction.c | 4 + mm/huge_memory.c | 4 + + mm/mm_init.c | 1 + mm/page-writeback.c | 8 + mm/page_alloc.c | 27 +- mm/swap.c | 5 + mm/vmpressure.c | 4 + - mm/vmscan.c | 8 + - 43 files changed, 2639 insertions(+), 165 deletions(-) + mm/vmscan.c | 178 ++++- + 54 files changed, 3020 insertions(+), 182 deletions(-) create mode 100644 drivers/i2c/busses/i2c-nct6775.c create mode 100644 drivers/pci/controller/intel-nvme-remap.c create mode 100644 drivers/platform/x86/steamdeck.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index a493d93e0d2c..8d6a2ce37f8f 100644 +index 71ed7f1b0f9b..fbfaea49cbed 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4396,6 +4396,15 @@ +@@ -4394,6 +4394,15 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. @@ -5157,8 +6344,105 @@ index a493d93e0d2c..8d6a2ce37f8f 100644 noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. +diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst +index c59889de122b..468ae7dec1e1 100644 +--- a/Documentation/admin-guide/sysctl/vm.rst ++++ b/Documentation/admin-guide/sysctl/vm.rst +@@ -25,6 +25,9 @@ files can be found in mm/swap.c. + Currently, these files are in /proc/sys/vm: + + - admin_reserve_kbytes ++- anon_min_ratio ++- clean_low_ratio ++- clean_min_ratio + - compact_memory + - compaction_proactiveness + - compact_unevictable_allowed +@@ -106,6 +109,67 @@ On x86_64 this is about 128MB. + Changing this takes effect whenever an application requests memory. + + ++anon_min_ratio ++============== ++ ++This knob provides *hard* protection of anonymous pages. The anonymous pages ++on the current node won't be reclaimed under any conditions when their amount ++is below vm.anon_min_ratio. ++ ++This knob may be used to prevent excessive swap thrashing when anonymous ++memory is low (for example, when memory is going to be overfilled by ++compressed data of zram module). ++ ++Setting this value too high (close to 100) can result in inability to ++swap and can lead to early OOM under memory pressure. ++ ++The unit of measurement is the percentage of the total memory of the node. ++ ++The default value is 15. ++ ++ ++clean_low_ratio ++================ ++ ++This knob provides *best-effort* protection of clean file pages. The file pages ++on the current node won't be reclaimed under memory pressure when the amount of ++clean file pages is below vm.clean_low_ratio *unless* we threaten to OOM. ++ ++Protection of clean file pages using this knob may be used when swapping is ++still possible to ++ - prevent disk I/O thrashing under memory pressure; ++ - improve performance in disk cache-bound tasks under memory pressure. ++ ++Setting it to a high value may result in a early eviction of anonymous pages ++into the swap space by attempting to hold the protected amount of clean file ++pages in memory. ++ ++The unit of measurement is the percentage of the total memory of the node. ++ ++The default value is 0. ++ ++ ++clean_min_ratio ++================ ++ ++This knob provides *hard* protection of clean file pages. The file pages on the ++current node won't be reclaimed under memory pressure when the amount of clean ++file pages is below vm.clean_min_ratio. ++ ++Hard protection of clean file pages using this knob may be used to ++ - prevent disk I/O thrashing under memory pressure even with no free swap space; ++ - improve performance in disk cache-bound tasks under memory pressure; ++ - avoid high latency and prevent livelock in near-OOM conditions. ++ ++Setting it to a high value may result in a early out-of-memory condition due to ++the inability to reclaim the protected amount of clean file pages when other ++types of pages cannot be reclaimed. ++ ++The unit of measurement is the percentage of the total memory of the node. ++ ++The default value is 15. ++ ++ + compact_memory + ============== + +@@ -910,6 +974,14 @@ be 133 (x + 2x = 200, 2x = 133.33). + At 0, the kernel will not initiate swap until the amount of free and + file-backed pages is less than the high watermark in a zone. + ++This knob has no effect if the amount of clean file pages on the current ++node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case, ++only anonymous pages can be reclaimed. ++ ++If the number of anonymous pages on the current node is below ++vm.anon_min_ratio, then only file pages can be reclaimed with ++any vm.swappiness value. ++ + + unprivileged_userfaultfd + ======================== diff --git a/Makefile b/Makefile -index 95b320ada47c..0b7d42037c3e 100644 +index a78379891d22..e58a4e647e7d 100644 --- a/Makefile +++ b/Makefile @@ -808,9 +808,164 @@ endif # need-config @@ -6230,10 +7514,10 @@ index 5ff093cb3cf8..1c93fe91b006 100644 /* diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c -index 682ff550ccfb..67f17fd94144 100644 +index df3fd6474bf2..4303eb5fe11b 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c -@@ -1560,7 +1560,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) +@@ -1547,7 +1547,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) } #endif @@ -6242,7 +7526,7 @@ index 682ff550ccfb..67f17fd94144 100644 struct ahci_host_priv *hpriv) { int i; -@@ -1573,7 +1573,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, +@@ -1560,7 +1560,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, pci_resource_len(pdev, bar) < SZ_512K || bar != AHCI_PCI_BAR_STANDARD || !(readl(hpriv->mmio + AHCI_VSCAP) & 1)) @@ -6251,7 +7535,7 @@ index 682ff550ccfb..67f17fd94144 100644 cap = readq(hpriv->mmio + AHCI_REMAP_CAP); for (i = 0; i < AHCI_MAX_REMAP; i++) { -@@ -1588,18 +1588,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, +@@ -1575,18 +1575,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, } if (!hpriv->remapped_nvme) @@ -6274,7 +7558,7 @@ index 682ff550ccfb..67f17fd94144 100644 } static int ahci_get_irq_vector(struct ata_host *host, int port) -@@ -1819,7 +1812,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +@@ -1806,7 +1799,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar]; /* detect remapped nvme devices */ @@ -6305,6 +7589,195 @@ index 438c9e75a04d..1bbfeca5f01e 100644 help This driver adds a CPUFreq driver which utilizes a fine grain processor performance frequency control range instead of legacy +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +index 79827a6dcd7f..ee85a2352771 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -153,6 +153,7 @@ struct amdgpu_watchdog_timer + */ + extern int amdgpu_modeset; + extern unsigned int amdgpu_vram_limit; ++extern int amdgpu_ignore_min_pcap; + extern int amdgpu_vis_vram_limit; + extern int amdgpu_gart_size; + extern int amdgpu_gtt_size; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +index 586f4d03039d..a2524615b696 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +@@ -132,6 +132,7 @@ enum AMDGPU_DEBUG_MASK { + }; + + unsigned int amdgpu_vram_limit = UINT_MAX; ++int amdgpu_ignore_min_pcap = 0; /* do not ignore by default */ + int amdgpu_vis_vram_limit; + int amdgpu_gart_size = -1; /* auto */ + int amdgpu_gtt_size = -1; /* auto */ +@@ -241,6 +242,15 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = { + .period = 0x0, /* default to 0x0 (timeout disable) */ + }; + ++/** ++ * DOC: ignore_min_pcap (int) ++ * Ignore the minimum power cap. ++ * Useful on graphics cards where the minimum power cap is very high. ++ * The default is 0 (Do not ignore). ++ */ ++MODULE_PARM_DESC(ignore_min_pcap, "Ignore the minimum power cap"); ++module_param_named(ignore_min_pcap, amdgpu_ignore_min_pcap, int, 0600); ++ + /** + * DOC: vramlimit (int) + * Restrict the total amount of VRAM in MiB for testing. The default is 0 (Use full VRAM). +diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig +index 901d1961b739..05c49141f580 100644 +--- a/drivers/gpu/drm/amd/display/Kconfig ++++ b/drivers/gpu/drm/amd/display/Kconfig +@@ -51,4 +51,10 @@ config DRM_AMD_SECURE_DISPLAY + This option enables the calculation of crc of specific region via + debugfs. Cooperate with specific DMCU FW. + ++config AMD_PRIVATE_COLOR ++ bool "Enable KMS color management by AMD for AMD" ++ default n ++ help ++ This option extends the KMS color management API with AMD driver-specific properties to enhance the color management support on AMD Steam Deck. ++ + endmenu +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index 1eb0f82e9dfa..5e0c551759ab 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -4072,7 +4072,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) + return r; + } + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + if (amdgpu_dm_create_color_properties(adev)) + return -ENOMEM; + #endif +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +index c87b64e464ed..6fe07243adc3 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +@@ -97,7 +97,7 @@ static inline struct fixed31_32 amdgpu_dm_fixpt_from_s3132(__u64 x) + return val; + } + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + /* Pre-defined Transfer Functions (TF) + * + * AMD driver supports pre-defined mathematical functions for transferring +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +index 6e715ef3a556..11c7199ec3b3 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +@@ -290,7 +290,7 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc) + } + #endif + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + /** + * dm_crtc_additional_color_mgmt - enable additional color properties + * @crtc: DRM CRTC +@@ -372,7 +372,7 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { + #if defined(CONFIG_DEBUG_FS) + .late_register = amdgpu_dm_crtc_late_register, + #endif +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + .atomic_set_property = amdgpu_dm_atomic_crtc_set_property, + .atomic_get_property = amdgpu_dm_atomic_crtc_get_property, + #endif +@@ -551,7 +551,7 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, + + drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES); + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + dm_crtc_additional_color_mgmt(&acrtc->base); + #endif + return 0; +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +index 8a4c40b4c27e..779880c64575 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +@@ -1468,7 +1468,7 @@ static void amdgpu_dm_plane_drm_plane_destroy_state(struct drm_plane *plane, + drm_atomic_helper_plane_destroy_state(plane, state); + } + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + static void + dm_atomic_plane_attach_color_mgmt_properties(struct amdgpu_display_manager *dm, + struct drm_plane *plane) +@@ -1659,7 +1659,7 @@ static const struct drm_plane_funcs dm_plane_funcs = { + .atomic_duplicate_state = amdgpu_dm_plane_drm_plane_duplicate_state, + .atomic_destroy_state = amdgpu_dm_plane_drm_plane_destroy_state, + .format_mod_supported = amdgpu_dm_plane_format_mod_supported, +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + .atomic_set_property = dm_atomic_plane_set_property, + .atomic_get_property = dm_atomic_plane_get_property, + #endif +@@ -1742,7 +1742,7 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, + + drm_plane_helper_add(plane, &dm_plane_helper_funcs); + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + dm_atomic_plane_attach_color_mgmt_properties(dm, plane); + #endif + /* Create (reset) the plane state */ +diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c +index 39c5e1dfa275..ee97bb26a8ef 100644 +--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c ++++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c +@@ -3034,6 +3034,9 @@ static ssize_t amdgpu_hwmon_show_power_cap_min(struct device *dev, + struct device_attribute *attr, + char *buf) + { ++ if (amdgpu_ignore_min_pcap) ++ return sysfs_emit(buf, "%i\n", 0); ++ + return amdgpu_hwmon_show_power_cap_generic(dev, attr, buf, PP_PWR_LIMIT_MIN); + } + +diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +index 0ad947df777a..7b82e3ef7c91 100644 +--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c ++++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +@@ -2695,7 +2695,10 @@ int smu_get_power_limit(void *handle, + *limit = smu->max_power_limit; + break; + case SMU_PPT_LIMIT_MIN: +- *limit = smu->min_power_limit; ++ if (amdgpu_ignore_min_pcap) ++ *limit = 0; ++ else ++ *limit = smu->min_power_limit; + break; + default: + return -EINVAL; +@@ -2719,7 +2722,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit) + if (smu->ppt_funcs->set_power_limit) + return smu->ppt_funcs->set_power_limit(smu, limit_type, limit); + +- if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) { ++ if (amdgpu_ignore_min_pcap) { ++ if ((limit > smu->max_power_limit)) { ++ dev_err(smu->adev->dev, ++ "New power limit (%d) is over the max allowed %d\n", ++ limit, smu->max_power_limit); ++ return -EINVAL; ++ } ++ } else if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) { + dev_err(smu->adev->dev, + "New power limit (%d) is out of range [%d,%d]\n", + limit, smu->min_power_limit, smu->max_power_limit); diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 28eb48dd5b32..1cf4c700b108 100644 --- a/drivers/i2c/busses/Kconfig @@ -7573,7 +9046,7 @@ index 000000000000..e105e6f5cc91 +MODULE_AUTHOR("Daniel Drake "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index d797df6e5f3e..b53d515da054 100644 +index eff7f5df08e2..cfb099dbeb5f 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3732,6 +3732,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) @@ -8257,10 +9730,10 @@ index 000000000000..77a6677ec19e +MODULE_DESCRIPTION("Steam Deck ACPI platform driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mm.h b/include/linux/mm.h -index f5a97dec5169..c9fb00c56844 100644 +index f5a97dec5169..397ad6f1ac39 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h -@@ -191,7 +191,7 @@ static inline void __mm_zero_struct_page(struct page *page) +@@ -191,10 +191,18 @@ static inline void __mm_zero_struct_page(struct page *page) * that. */ #define MAPCOUNT_ELF_CORE_MARGIN (5) @@ -8269,6 +9742,17 @@ index f5a97dec5169..c9fb00c56844 100644 extern int sysctl_max_map_count; ++extern bool sysctl_workingset_protection; ++extern u8 sysctl_anon_min_ratio; ++extern u8 sysctl_clean_low_ratio; ++extern u8 sysctl_clean_min_ratio; ++int vm_workingset_protection_update_handler( ++ struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++ + extern unsigned long sysctl_user_reserve_kbytes; + extern unsigned long sysctl_admin_reserve_kbytes; + diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2df35e65557d..a52bd9f4b632 100644 --- a/include/linux/pagemap.h @@ -8437,7 +9921,7 @@ index 0d944e92a43f..5449c990a91a 100644 if (err) goto bad_unshare_out; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 533547e3c90a..fc0a9de42a9d 100644 +index e2b4e0396af8..97983b041e9d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; @@ -8487,7 +9971,7 @@ index 001fe047bd5d..ed5c758c7368 100644 #else #define SCHED_NR_MIGRATE_BREAK 32 diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 157f7ce2942d..c92d8a4b23fb 100644 +index 157f7ce2942d..aa55ebba2ec3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,6 +95,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); @@ -8516,6 +10000,47 @@ index 157f7ce2942d..c92d8a4b23fb 100644 #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", +@@ -2204,6 +2216,40 @@ static struct ctl_table vm_table[] = { + .extra1 = SYSCTL_ZERO, + }, + #endif ++ { ++ .procname = "workingset_protection", ++ .data = &sysctl_workingset_protection, ++ .maxlen = sizeof(bool), ++ .mode = 0644, ++ .proc_handler = &proc_dobool, ++ }, ++ { ++ .procname = "anon_min_ratio", ++ .data = &sysctl_anon_min_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, ++ { ++ .procname = "clean_low_ratio", ++ .data = &sysctl_clean_low_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, ++ { ++ .procname = "clean_min_ratio", ++ .data = &sysctl_clean_min_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ce4d99df5f0e..8272e2e359f1 100644 --- a/kernel/user_namespace.c @@ -8535,10 +10060,80 @@ index ce4d99df5f0e..8272e2e359f1 100644 static DEFINE_MUTEX(userns_state_mutex); diff --git a/mm/Kconfig b/mm/Kconfig -index ffc3a2ba3a8c..0e440573033c 100644 +index ffc3a2ba3a8c..002f48b655de 100644 --- a/mm/Kconfig +++ b/mm/Kconfig -@@ -630,7 +630,7 @@ config COMPACTION +@@ -486,6 +486,69 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP + config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP + bool + ++config ANON_MIN_RATIO ++ int "Default value for vm.anon_min_ratio" ++ depends on SYSCTL ++ range 0 100 ++ default 15 ++ help ++ This option sets the default value for vm.anon_min_ratio sysctl knob. ++ ++ The vm.anon_min_ratio sysctl knob provides *hard* protection of ++ anonymous pages. The anonymous pages on the current node won't be ++ reclaimed under any conditions when their amount is below ++ vm.anon_min_ratio. This knob may be used to prevent excessive swap ++ thrashing when anonymous memory is low (for example, when memory is ++ going to be overfilled by compressed data of zram module). ++ ++ Setting this value too high (close to MemTotal) can result in ++ inability to swap and can lead to early OOM under memory pressure. ++ ++config CLEAN_LOW_RATIO ++ int "Default value for vm.clean_low_ratio" ++ depends on SYSCTL ++ range 0 100 ++ default 0 ++ help ++ This option sets the default value for vm.clean_low_ratio sysctl knob. ++ ++ The vm.clean_low_ratio sysctl knob provides *best-effort* ++ protection of clean file pages. The file pages on the current node ++ won't be reclaimed under memory pressure when the amount of clean file ++ pages is below vm.clean_low_ratio *unless* we threaten to OOM. ++ Protection of clean file pages using this knob may be used when ++ swapping is still possible to ++ - prevent disk I/O thrashing under memory pressure; ++ - improve performance in disk cache-bound tasks under memory ++ pressure. ++ ++ Setting it to a high value may result in a early eviction of anonymous ++ pages into the swap space by attempting to hold the protected amount ++ of clean file pages in memory. ++ ++config CLEAN_MIN_RATIO ++ int "Default value for vm.clean_min_ratio" ++ depends on SYSCTL ++ range 0 100 ++ default 15 ++ help ++ This option sets the default value for vm.clean_min_ratio sysctl knob. ++ ++ The vm.clean_min_ratio sysctl knob provides *hard* protection of ++ clean file pages. The file pages on the current node won't be ++ reclaimed under memory pressure when the amount of clean file pages is ++ below vm.clean_min_ratio. Hard protection of clean file pages using ++ this knob may be used to ++ - prevent disk I/O thrashing under memory pressure even with no free ++ swap space; ++ - improve performance in disk cache-bound tasks under memory ++ pressure; ++ - avoid high latency and prevent livelock in near-OOM conditions. ++ ++ Setting it to a high value may result in a early out-of-memory condition ++ due to the inability to reclaim the protected amount of clean file pages ++ when other types of pages cannot be reclaimed. ++ + config HAVE_MEMBLOCK_PHYS_MAP + bool + +@@ -630,7 +693,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION @@ -8579,6 +10174,18 @@ index 94c958f7ebb5..2f9974f305ee 100644 (1<clean_below_min : sc->anon_below_min) ++ goto keep_locked; ++ + /* + * The number of dirty pages determines if a node is marked + * reclaim_congested. kswapd will stall and start writing +@@ -2353,6 +2378,23 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + goto out; + } + ++ /* ++ * Force-scan the other type if anon/clean pages is ++ * under vm.{anon,clean}_{low,min}_ratio, respectively. ++ */ ++ if (sc->clean_below_min) { ++ scan_balance = SCAN_ANON; ++ goto out; ++ } ++ if (sc->anon_below_min) { ++ scan_balance = SCAN_FILE; ++ goto out; ++ } ++ if (sc->clean_below_low) { ++ scan_balance = SCAN_ANON; ++ goto out; ++ } ++ + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally +@@ -2515,6 +2557,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + BUG(); + } + ++ /* ++ * Hard protection of the working set. ++ * Don't reclaim anon/file pages when the amount is ++ * below the watermark of the same type. ++ */ ++ if (file ? sc->clean_below_min : sc->anon_below_min) ++ scan = 0; ++ + nr[lru] = scan; + } + } +@@ -3922,7 +3972,28 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ @@ -8735,22 +10419,819 @@ index 4255619a1a31..5a3fbaf34158 100644 +#else static unsigned long lru_gen_min_ttl __read_mostly; +#endif ++ ++static void do_invoke_oom(struct scan_control *sc, bool try_memcg) { ++ struct oom_control oc = { ++ .gfp_mask = sc->gfp_mask, ++ .order = sc->order, ++ }; ++ ++ if (try_memcg && mem_cgroup_oom_synchronize(true)) ++ return; ++ ++ if (!mutex_trylock(&oom_lock)) ++ return; ++ out_of_memory(&oc); ++ mutex_unlock(&oom_lock); ++} ++#define invoke_oom(sc) do_invoke_oom(sc, true) ++#define invoke_oom_nomemcg(sc) do_invoke_oom(sc, false) static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { +@@ -3952,14 +4023,96 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + * younger than min_ttl. However, another possibility is all memcgs are + * either too small or below min. + */ +- if (mutex_trylock(&oom_lock)) { +- struct oom_control oc = { +- .gfp_mask = sc->gfp_mask, +- }; ++ invoke_oom_nomemcg(sc); ++} ++ ++int vm_workingset_protection_update_handler(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ workingset_protection_prev_totalram = 0; ++ ++ return 0; ++} ++ ++static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc) ++{ ++ unsigned long node_mem_total; ++ struct sysinfo i; ++ ++ if (!(sysctl_workingset_protection)) { ++ sc->anon_below_min = 0; ++ sc->clean_below_low = 0; ++ sc->clean_below_min = 0; ++ return; ++ } ++ ++ if (likely(sysctl_anon_min_ratio || ++ sysctl_clean_low_ratio || ++ sysctl_clean_min_ratio)) { ++#ifdef CONFIG_NUMA ++ si_meminfo_node(&i, pgdat->node_id); ++#else //CONFIG_NUMA ++ si_meminfo(&i); ++#endif //CONFIG_NUMA ++ node_mem_total = i.totalram; ++ ++ if (unlikely(workingset_protection_prev_totalram != node_mem_total)) { ++ sysctl_anon_min_ratio_kb = ++ node_mem_total * sysctl_anon_min_ratio / 100; ++ sysctl_clean_low_ratio_kb = ++ node_mem_total * sysctl_clean_low_ratio / 100; ++ sysctl_clean_min_ratio_kb = ++ node_mem_total * sysctl_clean_min_ratio / 100; ++ workingset_protection_prev_totalram = node_mem_total; ++ } ++ } + +- out_of_memory(&oc); ++ /* ++ * Check the number of anonymous pages to protect them from ++ * reclaiming if their amount is below the specified. ++ */ ++ if (sysctl_anon_min_ratio) { ++ unsigned long reclaimable_anon; ++ ++ reclaimable_anon = ++ node_page_state(pgdat, NR_ACTIVE_ANON) + ++ node_page_state(pgdat, NR_INACTIVE_ANON) + ++ node_page_state(pgdat, NR_ISOLATED_ANON); ++ ++ sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb; ++ } else ++ sc->anon_below_min = 0; ++ ++ /* ++ * Check the number of clean file pages to protect them from ++ * reclaiming if their amount is below the specified. ++ */ ++ if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) { ++ unsigned long reclaimable_file, dirty, clean; ++ ++ reclaimable_file = ++ node_page_state(pgdat, NR_ACTIVE_FILE) + ++ node_page_state(pgdat, NR_INACTIVE_FILE) + ++ node_page_state(pgdat, NR_ISOLATED_FILE); ++ dirty = node_page_state(pgdat, NR_FILE_DIRTY); ++ /* ++ * node_page_state() sum can go out of sync since ++ * all the values are not read at once. ++ */ ++ if (likely(reclaimable_file > dirty)) ++ clean = reclaimable_file - dirty; ++ else ++ clean = 0; + +- mutex_unlock(&oom_lock); ++ sc->clean_below_low = clean < sysctl_clean_low_ratio_kb; ++ sc->clean_below_min = clean < sysctl_clean_min_ratio_kb; ++ } else { ++ sc->clean_below_low = 0; ++ sc->clean_below_min = 0; + } + } + +@@ -4462,6 +4615,12 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw + */ + if (!swappiness) + type = LRU_GEN_FILE; ++ else if (sc->clean_below_min) ++ type = LRU_GEN_ANON; ++ else if (sc->anon_below_min) ++ type = LRU_GEN_FILE; ++ else if (sc->clean_below_low) ++ type = LRU_GEN_ANON; + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) + type = LRU_GEN_ANON; + else if (swappiness == 1) +@@ -4471,7 +4630,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw + else + type = get_type_to_scan(lruvec, swappiness, &tier); + +- for (i = !swappiness; i < ANON_AND_FILE; i++) { ++ for (i = 0; i < ANON_AND_FILE; i++) { + if (tier < 0) + tier = get_tier_idx(lruvec, type); + +@@ -4749,6 +4908,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + ++ prepare_workingset_protection(pgdat, sc); + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(NULL, memcg)) +@@ -5899,6 +6059,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + + prepare_scan_control(pgdat, sc); + ++ prepare_workingset_protection(pgdat, sc); ++ + shrink_node_memcgs(pgdat, sc); + + flush_reclaim_state(sc); +@@ -5987,6 +6149,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + */ + if (reclaimable) + pgdat->kswapd_failures = 0; ++ else if (sc->clean_below_min && !sc->priority) ++ invoke_oom(sc); + } + + /* -- 2.44.0 -From 516559b0e31629dafbe60212d041e63af1b12c1c Mon Sep 17 00:00:00 2001 +From 3719b448ce6ae6e6df7f49a99ef30eeb0bf2117d Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 26 Feb 2024 15:47:43 +0100 -Subject: [PATCH 5/7] fixes +Date: Wed, 3 Apr 2024 17:43:37 +0200 +Subject: [PATCH] Revert "le9uo" + +This reverts commit 9bb31a68ef456524c4370323e1c19b07fc0633df. +--- + Documentation/admin-guide/sysctl/vm.rst | 72 ---------- + include/linux/mm.h | 8 -- + kernel/sysctl.c | 34 ----- + mm/Kconfig | 63 --------- + mm/mm_init.c | 1 - + mm/vmscan.c | 170 +----------------------- + 6 files changed, 7 insertions(+), 341 deletions(-) + +diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst +index 468ae7dec1e1..c59889de122b 100644 +--- a/Documentation/admin-guide/sysctl/vm.rst ++++ b/Documentation/admin-guide/sysctl/vm.rst +@@ -25,9 +25,6 @@ files can be found in mm/swap.c. + Currently, these files are in /proc/sys/vm: + + - admin_reserve_kbytes +-- anon_min_ratio +-- clean_low_ratio +-- clean_min_ratio + - compact_memory + - compaction_proactiveness + - compact_unevictable_allowed +@@ -109,67 +106,6 @@ On x86_64 this is about 128MB. + Changing this takes effect whenever an application requests memory. + + +-anon_min_ratio +-============== +- +-This knob provides *hard* protection of anonymous pages. The anonymous pages +-on the current node won't be reclaimed under any conditions when their amount +-is below vm.anon_min_ratio. +- +-This knob may be used to prevent excessive swap thrashing when anonymous +-memory is low (for example, when memory is going to be overfilled by +-compressed data of zram module). +- +-Setting this value too high (close to 100) can result in inability to +-swap and can lead to early OOM under memory pressure. +- +-The unit of measurement is the percentage of the total memory of the node. +- +-The default value is 15. +- +- +-clean_low_ratio +-================ +- +-This knob provides *best-effort* protection of clean file pages. The file pages +-on the current node won't be reclaimed under memory pressure when the amount of +-clean file pages is below vm.clean_low_ratio *unless* we threaten to OOM. +- +-Protection of clean file pages using this knob may be used when swapping is +-still possible to +- - prevent disk I/O thrashing under memory pressure; +- - improve performance in disk cache-bound tasks under memory pressure. +- +-Setting it to a high value may result in a early eviction of anonymous pages +-into the swap space by attempting to hold the protected amount of clean file +-pages in memory. +- +-The unit of measurement is the percentage of the total memory of the node. +- +-The default value is 0. +- +- +-clean_min_ratio +-================ +- +-This knob provides *hard* protection of clean file pages. The file pages on the +-current node won't be reclaimed under memory pressure when the amount of clean +-file pages is below vm.clean_min_ratio. +- +-Hard protection of clean file pages using this knob may be used to +- - prevent disk I/O thrashing under memory pressure even with no free swap space; +- - improve performance in disk cache-bound tasks under memory pressure; +- - avoid high latency and prevent livelock in near-OOM conditions. +- +-Setting it to a high value may result in a early out-of-memory condition due to +-the inability to reclaim the protected amount of clean file pages when other +-types of pages cannot be reclaimed. +- +-The unit of measurement is the percentage of the total memory of the node. +- +-The default value is 15. +- +- + compact_memory + ============== + +@@ -974,14 +910,6 @@ be 133 (x + 2x = 200, 2x = 133.33). + At 0, the kernel will not initiate swap until the amount of free and + file-backed pages is less than the high watermark in a zone. + +-This knob has no effect if the amount of clean file pages on the current +-node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case, +-only anonymous pages can be reclaimed. +- +-If the number of anonymous pages on the current node is below +-vm.anon_min_ratio, then only file pages can be reclaimed with +-any vm.swappiness value. +- + + unprivileged_userfaultfd + ======================== +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 397ad6f1ac39..c9fb00c56844 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -195,14 +195,6 @@ static inline void __mm_zero_struct_page(struct page *page) + + extern int sysctl_max_map_count; + +-extern bool sysctl_workingset_protection; +-extern u8 sysctl_anon_min_ratio; +-extern u8 sysctl_clean_low_ratio; +-extern u8 sysctl_clean_min_ratio; +-int vm_workingset_protection_update_handler( +- struct ctl_table *table, int write, +- void __user *buffer, size_t *lenp, loff_t *ppos); +- + extern unsigned long sysctl_user_reserve_kbytes; + extern unsigned long sysctl_admin_reserve_kbytes; + +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index aa55ebba2ec3..c92d8a4b23fb 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -2216,40 +2216,6 @@ static struct ctl_table vm_table[] = { + .extra1 = SYSCTL_ZERO, + }, + #endif +- { +- .procname = "workingset_protection", +- .data = &sysctl_workingset_protection, +- .maxlen = sizeof(bool), +- .mode = 0644, +- .proc_handler = &proc_dobool, +- }, +- { +- .procname = "anon_min_ratio", +- .data = &sysctl_anon_min_ratio, +- .maxlen = sizeof(u8), +- .mode = 0644, +- .proc_handler = &vm_workingset_protection_update_handler, +- .extra1 = SYSCTL_ZERO, +- .extra2 = SYSCTL_ONE_HUNDRED, +- }, +- { +- .procname = "clean_low_ratio", +- .data = &sysctl_clean_low_ratio, +- .maxlen = sizeof(u8), +- .mode = 0644, +- .proc_handler = &vm_workingset_protection_update_handler, +- .extra1 = SYSCTL_ZERO, +- .extra2 = SYSCTL_ONE_HUNDRED, +- }, +- { +- .procname = "clean_min_ratio", +- .data = &sysctl_clean_min_ratio, +- .maxlen = sizeof(u8), +- .mode = 0644, +- .proc_handler = &vm_workingset_protection_update_handler, +- .extra1 = SYSCTL_ZERO, +- .extra2 = SYSCTL_ONE_HUNDRED, +- }, + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, +diff --git a/mm/Kconfig b/mm/Kconfig +index 002f48b655de..0e440573033c 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -486,69 +486,6 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP + config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP + bool + +-config ANON_MIN_RATIO +- int "Default value for vm.anon_min_ratio" +- depends on SYSCTL +- range 0 100 +- default 15 +- help +- This option sets the default value for vm.anon_min_ratio sysctl knob. +- +- The vm.anon_min_ratio sysctl knob provides *hard* protection of +- anonymous pages. The anonymous pages on the current node won't be +- reclaimed under any conditions when their amount is below +- vm.anon_min_ratio. This knob may be used to prevent excessive swap +- thrashing when anonymous memory is low (for example, when memory is +- going to be overfilled by compressed data of zram module). +- +- Setting this value too high (close to MemTotal) can result in +- inability to swap and can lead to early OOM under memory pressure. +- +-config CLEAN_LOW_RATIO +- int "Default value for vm.clean_low_ratio" +- depends on SYSCTL +- range 0 100 +- default 0 +- help +- This option sets the default value for vm.clean_low_ratio sysctl knob. +- +- The vm.clean_low_ratio sysctl knob provides *best-effort* +- protection of clean file pages. The file pages on the current node +- won't be reclaimed under memory pressure when the amount of clean file +- pages is below vm.clean_low_ratio *unless* we threaten to OOM. +- Protection of clean file pages using this knob may be used when +- swapping is still possible to +- - prevent disk I/O thrashing under memory pressure; +- - improve performance in disk cache-bound tasks under memory +- pressure. +- +- Setting it to a high value may result in a early eviction of anonymous +- pages into the swap space by attempting to hold the protected amount +- of clean file pages in memory. +- +-config CLEAN_MIN_RATIO +- int "Default value for vm.clean_min_ratio" +- depends on SYSCTL +- range 0 100 +- default 15 +- help +- This option sets the default value for vm.clean_min_ratio sysctl knob. +- +- The vm.clean_min_ratio sysctl knob provides *hard* protection of +- clean file pages. The file pages on the current node won't be +- reclaimed under memory pressure when the amount of clean file pages is +- below vm.clean_min_ratio. Hard protection of clean file pages using +- this knob may be used to +- - prevent disk I/O thrashing under memory pressure even with no free +- swap space; +- - improve performance in disk cache-bound tasks under memory +- pressure; +- - avoid high latency and prevent livelock in near-OOM conditions. +- +- Setting it to a high value may result in a early out-of-memory condition +- due to the inability to reclaim the protected amount of clean file pages +- when other types of pages cannot be reclaimed. +- + config HAVE_MEMBLOCK_PHYS_MAP + bool + +diff --git a/mm/mm_init.c b/mm/mm_init.c +index 419ba5ac7c52..2c19f5515e36 100644 +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -2749,7 +2749,6 @@ static void __init mem_init_print_info(void) + , K(totalhigh_pages()) + #endif + ); +- printk(KERN_INFO "le9 Unofficial (le9uo) working set protection 1.4 by Masahito Suzuki (forked from hakavlad's original le9 patch)"); + } + + /* +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 346810e1b69d..fd1d9b4194e3 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -133,15 +133,6 @@ struct scan_control { + /* The file folios on the current node are dangerously low */ + unsigned int file_is_tiny:1; + +- /* The anonymous pages on the current node are below vm.anon_min_ratio */ +- unsigned int anon_below_min:1; +- +- /* The clean file pages on the current node are below vm.clean_low_ratio */ +- unsigned int clean_below_low:1; +- +- /* The clean file pages on the current node are below vm.clean_min_ratio */ +- unsigned int clean_below_min:1; +- + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + +@@ -191,15 +182,6 @@ struct scan_control { + #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) + #endif + +-bool sysctl_workingset_protection __read_mostly = true; +-u8 sysctl_anon_min_ratio __read_mostly = CONFIG_ANON_MIN_RATIO; +-u8 sysctl_clean_low_ratio __read_mostly = CONFIG_CLEAN_LOW_RATIO; +-u8 sysctl_clean_min_ratio __read_mostly = CONFIG_CLEAN_MIN_RATIO; +-static u64 sysctl_anon_min_ratio_kb __read_mostly = 0; +-static u64 sysctl_clean_low_ratio_kb __read_mostly = 0; +-static u64 sysctl_clean_min_ratio_kb __read_mostly = 0; +-static u64 workingset_protection_prev_totalram __read_mostly = 0; +- + /* + * From 0 .. 200. Higher means more swappy. + */ +@@ -1074,9 +1056,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, + folio_mapped(folio) && folio_test_referenced(folio)) + goto keep_locked; + +- if (folio_is_file_lru(folio) ? sc->clean_below_min : sc->anon_below_min) +- goto keep_locked; +- + /* + * The number of dirty pages determines if a node is marked + * reclaim_congested. kswapd will stall and start writing +@@ -2378,23 +2357,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + goto out; + } + +- /* +- * Force-scan the other type if anon/clean pages is +- * under vm.{anon,clean}_{low,min}_ratio, respectively. +- */ +- if (sc->clean_below_min) { +- scan_balance = SCAN_ANON; +- goto out; +- } +- if (sc->anon_below_min) { +- scan_balance = SCAN_FILE; +- goto out; +- } +- if (sc->clean_below_low) { +- scan_balance = SCAN_ANON; +- goto out; +- } +- + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally +@@ -2557,14 +2519,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + BUG(); + } + +- /* +- * Hard protection of the working set. +- * Don't reclaim anon/file pages when the amount is +- * below the watermark of the same type. +- */ +- if (file ? sc->clean_below_min : sc->anon_below_min) +- scan = 0; +- + nr[lru] = scan; + } + } +@@ -3978,23 +3932,6 @@ static unsigned long lru_gen_min_ttl __read_mostly = 1000; + static unsigned long lru_gen_min_ttl __read_mostly; + #endif + +-static void do_invoke_oom(struct scan_control *sc, bool try_memcg) { +- struct oom_control oc = { +- .gfp_mask = sc->gfp_mask, +- .order = sc->order, +- }; +- +- if (try_memcg && mem_cgroup_oom_synchronize(true)) +- return; +- +- if (!mutex_trylock(&oom_lock)) +- return; +- out_of_memory(&oc); +- mutex_unlock(&oom_lock); +-} +-#define invoke_oom(sc) do_invoke_oom(sc, true) +-#define invoke_oom_nomemcg(sc) do_invoke_oom(sc, false) +- + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { + struct mem_cgroup *memcg; +@@ -4023,96 +3960,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + * younger than min_ttl. However, another possibility is all memcgs are + * either too small or below min. + */ +- invoke_oom_nomemcg(sc); +-} +- +-int vm_workingset_protection_update_handler(struct ctl_table *table, int write, +- void __user *buffer, size_t *lenp, loff_t *ppos) +-{ +- int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); +- if (ret || !write) +- return ret; +- +- workingset_protection_prev_totalram = 0; +- +- return 0; +-} +- +-static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc) +-{ +- unsigned long node_mem_total; +- struct sysinfo i; +- +- if (!(sysctl_workingset_protection)) { +- sc->anon_below_min = 0; +- sc->clean_below_low = 0; +- sc->clean_below_min = 0; +- return; +- } +- +- if (likely(sysctl_anon_min_ratio || +- sysctl_clean_low_ratio || +- sysctl_clean_min_ratio)) { +-#ifdef CONFIG_NUMA +- si_meminfo_node(&i, pgdat->node_id); +-#else //CONFIG_NUMA +- si_meminfo(&i); +-#endif //CONFIG_NUMA +- node_mem_total = i.totalram; +- +- if (unlikely(workingset_protection_prev_totalram != node_mem_total)) { +- sysctl_anon_min_ratio_kb = +- node_mem_total * sysctl_anon_min_ratio / 100; +- sysctl_clean_low_ratio_kb = +- node_mem_total * sysctl_clean_low_ratio / 100; +- sysctl_clean_min_ratio_kb = +- node_mem_total * sysctl_clean_min_ratio / 100; +- workingset_protection_prev_totalram = node_mem_total; +- } +- } +- +- /* +- * Check the number of anonymous pages to protect them from +- * reclaiming if their amount is below the specified. +- */ +- if (sysctl_anon_min_ratio) { +- unsigned long reclaimable_anon; +- +- reclaimable_anon = +- node_page_state(pgdat, NR_ACTIVE_ANON) + +- node_page_state(pgdat, NR_INACTIVE_ANON) + +- node_page_state(pgdat, NR_ISOLATED_ANON); ++ if (mutex_trylock(&oom_lock)) { ++ struct oom_control oc = { ++ .gfp_mask = sc->gfp_mask, ++ }; + +- sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb; +- } else +- sc->anon_below_min = 0; ++ out_of_memory(&oc); + +- /* +- * Check the number of clean file pages to protect them from +- * reclaiming if their amount is below the specified. +- */ +- if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) { +- unsigned long reclaimable_file, dirty, clean; +- +- reclaimable_file = +- node_page_state(pgdat, NR_ACTIVE_FILE) + +- node_page_state(pgdat, NR_INACTIVE_FILE) + +- node_page_state(pgdat, NR_ISOLATED_FILE); +- dirty = node_page_state(pgdat, NR_FILE_DIRTY); +- /* +- * node_page_state() sum can go out of sync since +- * all the values are not read at once. +- */ +- if (likely(reclaimable_file > dirty)) +- clean = reclaimable_file - dirty; +- else +- clean = 0; +- +- sc->clean_below_low = clean < sysctl_clean_low_ratio_kb; +- sc->clean_below_min = clean < sysctl_clean_min_ratio_kb; +- } else { +- sc->clean_below_low = 0; +- sc->clean_below_min = 0; ++ mutex_unlock(&oom_lock); + } + } + +@@ -4615,12 +4470,6 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw + */ + if (!swappiness) + type = LRU_GEN_FILE; +- else if (sc->clean_below_min) +- type = LRU_GEN_ANON; +- else if (sc->anon_below_min) +- type = LRU_GEN_FILE; +- else if (sc->clean_below_low) +- type = LRU_GEN_ANON; + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) + type = LRU_GEN_ANON; + else if (swappiness == 1) +@@ -4630,7 +4479,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw + else + type = get_type_to_scan(lruvec, swappiness, &tier); + +- for (i = 0; i < ANON_AND_FILE; i++) { ++ for (i = !swappiness; i < ANON_AND_FILE; i++) { + if (tier < 0) + tier = get_tier_idx(lruvec, type); + +@@ -4908,7 +4757,6 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + +- prepare_workingset_protection(pgdat, sc); + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(NULL, memcg)) +@@ -6059,8 +5907,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + + prepare_scan_control(pgdat, sc); + +- prepare_workingset_protection(pgdat, sc); +- + shrink_node_memcgs(pgdat, sc); + + flush_reclaim_state(sc); +@@ -6149,8 +5995,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + */ + if (reclaimable) + pgdat->kswapd_failures = 0; +- else if (sc->clean_below_min && !sc->priority) +- invoke_oom(sc); + } + + /* +-- +2.44.0 + +From 4833f48c9738d6bb475df2e4c16be2ea26a7d91d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Wed, 3 Apr 2024 17:07:02 +0200 +Subject: [PATCH 6/8] fixes Signed-off-by: Peter Jung --- - arch/Kconfig | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) + .../ABI/testing/sysfs-driver-hid-asus | 85 + + arch/Kconfig | 4 +- + drivers/hid/Makefile | 2 + + drivers/hid/{hid-asus.c => hid-asus-core.c} | 193 +-- + drivers/hid/hid-asus-rog.c | 1468 +++++++++++++++++ + drivers/hid/hid-asus-rog.h | 482 ++++++ + drivers/hid/hid-asus.h | 58 + + drivers/hid/hid-ids.h | 1 + + 8 files changed, 2174 insertions(+), 119 deletions(-) + create mode 100644 Documentation/ABI/testing/sysfs-driver-hid-asus + rename drivers/hid/{hid-asus.c => hid-asus-core.c} (89%) + create mode 100644 drivers/hid/hid-asus-rog.c + create mode 100644 drivers/hid/hid-asus-rog.h + create mode 100644 drivers/hid/hid-asus.h +diff --git a/Documentation/ABI/testing/sysfs-driver-hid-asus b/Documentation/ABI/testing/sysfs-driver-hid-asus +new file mode 100644 +index 000000000000..df5b0c5b0702 +--- /dev/null ++++ b/Documentation/ABI/testing/sysfs-driver-hid-asus +@@ -0,0 +1,85 @@ ++What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/gamepad_mode ++Date: December 2023 ++Contact: linux-input@vger.kernel.org ++Description: Set the mode the ROG Ally xpad operates in: ++ - 1 = Game mode ++ - 2 = WASD mode ++ - 3 = Mouse mode ++ This setting applies instantly and applies settings that were previously changed ++ under that mode which are: ++ - deadzones ++ - anti-deadzones ++ - button mapping ++ - button turbo settings ++ - response curves ++ ++What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/apply ++Date: December 2023 ++Contact: linux-input@vger.kernel.org ++Description: Apply the settings that have been stored in attributes so far. Because there are ++ many individual settings across a dozen packets this separation is required to ++ prevent spamming the MCU when userspace applications apply many changes at once. ++ ++What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/reset_btn_mapping ++Date: December 2023 ++Contact: linux-input@vger.kernel.org ++Description: Reset a gamepad mode to its default button mapping. ++ ++What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/deadzone ++Date: December 2023 ++Contact: linux-input@vger.kernel.org ++Description: Set the inner and outer deadzones of joysticks and triggers. These settings are not ++ written to the MCU until `apply` is set. ++ - range 0-64 (corresponds to 0-100%) ++ ++What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/deadzone_index ++Date: December 2023 ++Contact: linux-input@vger.kernel.org ++Description: Descriptive labels for joystick deadzone array. ++ ++What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/anti-deadzone ++Date: December 2023 ++Contact: linux-input@vger.kernel.org ++Description: Set the joystick anti-deadzone feature: ++ - range 0-32 (corresponds to 0-50%) ++ ++What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/calibration ++Date: December 2023 ++Contact: linux-input@vger.kernel.org ++Description: Calibration values for the joysticks and trigger analogues. There are no default ++ values as the calibration is determined in userspace. ++ ++What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/calibration_index ++Date: December 2023 ++Contact: linux-input@vger.kernel.org ++Description: Descriptive labels for joystick and triggers calibration array. ++ ++What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/rc_point ++Date: December 2023 ++Contact: linux-input@vger.kernel.org ++Description: Set the joystick response curve. There are 4 points available with 1 being the lowest ++ point and 4 being the highest point. ++ - range 0-64 (corresponds to 0-100%) ++ ++What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/rc_point_index ++Date: December 2023 ++Contact: linux-input@vger.kernel.org ++Description: Descriptive labels for joystick response curve points. ++ ++What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/btn_