From ccc77beb6e06c8679a8b7e7661b9675ab14e3f97 Mon Sep 17 00:00:00 2001 From: ferreo Date: Mon, 4 Nov 2024 21:28:16 +0100 Subject: [PATCH] Update patches/0001-cachyos-base-all.patch --- patches/0001-cachyos-base-all.patch | 17885 +++++++++++++++++++------- 1 file changed, 13172 insertions(+), 4713 deletions(-) diff --git a/patches/0001-cachyos-base-all.patch b/patches/0001-cachyos-base-all.patch index cb5cb43..9f2f6c8 100644 --- a/patches/0001-cachyos-base-all.patch +++ b/patches/0001-cachyos-base-all.patch @@ -1,121 +1,9 @@ -From 5984a6b2cf95450f2f92610cfb69378b844da2a6 Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:51:09 +0800 -Subject: [PATCH 01/13] address-masking +From 15ec398f577b2d406a028dc6310ed65a8d5de25e Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 14:46:43 +0100 +Subject: [PATCH 01/13] amd-cache-optimizer -Signed-off-by: Eric Naim ---- - arch/x86/include/asm/uaccess_64.h | 11 +++++++++++ - fs/select.c | 4 +++- - include/linux/uaccess.h | 7 +++++++ - lib/strncpy_from_user.c | 9 +++++++++ - lib/strnlen_user.c | 9 +++++++++ - 5 files changed, 39 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h -index 04789f45ab2b..afce8ee5d7b7 100644 ---- a/arch/x86/include/asm/uaccess_64.h -+++ b/arch/x86/include/asm/uaccess_64.h -@@ -53,6 +53,17 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, - */ - #define valid_user_address(x) ((__force long)(x) >= 0) - -+/* -+ * Masking the user address is an alternative to a conditional -+ * user_access_begin that can avoid the fencing. This only works -+ * for dense accesses starting at the address. -+ */ -+#define mask_user_address(x) ((typeof(x))((long)(x)|((long)(x)>>63))) -+#define masked_user_access_begin(x) ({ \ -+ __auto_type __masked_ptr = (x); \ -+ __masked_ptr = mask_user_address(__masked_ptr); \ -+ __uaccess_begin(); __masked_ptr; }) -+ - /* - * User pointers can have tag bits on x86-64. This scheme tolerates - * arbitrary values in those bits rather then masking them off. 
-diff --git a/fs/select.c b/fs/select.c -index 9515c3fa1a03..bc185d111436 100644 ---- a/fs/select.c -+++ b/fs/select.c -@@ -780,7 +780,9 @@ static inline int get_sigset_argpack(struct sigset_argpack *to, - { - // the path is hot enough for overhead of copy_from_user() to matter - if (from) { -- if (!user_read_access_begin(from, sizeof(*from))) -+ if (can_do_masked_user_access()) -+ from = masked_user_access_begin(from); -+ else if (!user_read_access_begin(from, sizeof(*from))) - return -EFAULT; - unsafe_get_user(to->p, &from->p, Efault); - unsafe_get_user(to->size, &from->size, Efault); -diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h -index d8e4105a2f21..39c7cf82b0c2 100644 ---- a/include/linux/uaccess.h -+++ b/include/linux/uaccess.h -@@ -33,6 +33,13 @@ - }) - #endif - -+#ifdef masked_user_access_begin -+ #define can_do_masked_user_access() 1 -+#else -+ #define can_do_masked_user_access() 0 -+ #define masked_user_access_begin(src) NULL -+#endif -+ - /* - * Architectures should provide two primitives (raw_copy_{to,from}_user()) - * and get rid of their private instances of copy_{to,from}_user() and -diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c -index 6432b8c3e431..989a12a67872 100644 ---- a/lib/strncpy_from_user.c -+++ b/lib/strncpy_from_user.c -@@ -120,6 +120,15 @@ long strncpy_from_user(char *dst, const char __user *src, long count) - if (unlikely(count <= 0)) - return 0; - -+ if (can_do_masked_user_access()) { -+ long retval; -+ -+ src = masked_user_access_begin(src); -+ retval = do_strncpy_from_user(dst, src, count, count); -+ user_read_access_end(); -+ return retval; -+ } -+ - max_addr = TASK_SIZE_MAX; - src_addr = (unsigned long)untagged_addr(src); - if (likely(src_addr < max_addr)) { -diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c -index feeb935a2299..6e489f9e90f1 100644 ---- a/lib/strnlen_user.c -+++ b/lib/strnlen_user.c -@@ -96,6 +96,15 @@ long strnlen_user(const char __user *str, long count) - if (unlikely(count <= 0)) - return 0; - -+ if (can_do_masked_user_access()) { -+ long retval; -+ -+ str = masked_user_access_begin(str); -+ retval = do_strnlen_user(str, count, count); -+ user_read_access_end(); -+ return retval; -+ } -+ - max_addr = TASK_SIZE_MAX; - src_addr = (unsigned long)untagged_addr(str); - if (likely(src_addr < max_addr)) { --- -2.47.0.rc0 - -From bd40ee69b53e1cb291f96d3ad1120698aea8e96b Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:51:22 +0800 -Subject: [PATCH 02/13] amd-cache-optimizer - -Signed-off-by: Eric Naim +Signed-off-by: Peter Jung --- .../sysfs-bus-platform-drivers-amd_x3d_vcache | 14 ++ MAINTAINERS | 8 + @@ -147,10 +35,10 @@ index 000000000000..1aa6ed0c10d9 + + Format: %s. 
diff --git a/MAINTAINERS b/MAINTAINERS -index cc40a9d9b8cd..2ba00c0cd701 100644 +index bdae0faf000c..0ebce432a134 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -948,6 +948,14 @@ Q: https://patchwork.kernel.org/project/linux-rdma/list/ +@@ -965,6 +965,14 @@ Q: https://patchwork.kernel.org/project/linux-rdma/list/ F: drivers/infiniband/hw/efa/ F: include/uapi/rdma/efa-abi.h @@ -401,68 +289,33 @@ index 000000000000..679613d02b9a +MODULE_DESCRIPTION("AMD 3D V-Cache Performance Optimizer Driver"); +MODULE_LICENSE("GPL"); -- -2.47.0.rc0 +2.47.0 -From f6f25468febfb80b968b50dabfb3f5656c4524e8 Mon Sep 17 00:00:00 2001 +From 2ec608e43d8d9b73183402468f1c84375534b191 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 25 Oct 2024 18:38:55 +0200 -Subject: [PATCH 03/13] amd-pstate +Date: Mon, 4 Nov 2024 14:46:55 +0100 +Subject: [PATCH 02/13] amd-pstate Signed-off-by: Peter Jung --- - Documentation/admin-guide/pm/amd-pstate.rst | 15 +- - arch/x86/include/asm/cpufeatures.h | 3 +- - arch/x86/include/asm/intel-family.h | 6 + - arch/x86/include/asm/processor.h | 21 +- - arch/x86/include/asm/topology.h | 9 + - arch/x86/kernel/acpi/cppc.c | 195 +++++++++++++- - arch/x86/kernel/cpu/amd.c | 16 -- - arch/x86/kernel/cpu/debugfs.c | 1 + - arch/x86/kernel/cpu/scattered.c | 3 +- - arch/x86/kernel/cpu/topology_amd.c | 3 + - arch/x86/kernel/cpu/topology_common.c | 34 +++ - arch/x86/kernel/smpboot.c | 5 +- - drivers/cpufreq/acpi-cpufreq.c | 12 +- - drivers/cpufreq/amd-pstate.c | 265 +++++++------------- - include/acpi/cppc_acpi.h | 41 ++- - tools/arch/x86/include/asm/cpufeatures.h | 2 +- - 16 files changed, 401 insertions(+), 230 deletions(-) + arch/x86/include/asm/cpufeatures.h | 3 +- + arch/x86/include/asm/intel-family.h | 6 + + arch/x86/include/asm/processor.h | 18 ++ + arch/x86/include/asm/topology.h | 9 + + arch/x86/kernel/acpi/cppc.c | 23 +++ + arch/x86/kernel/cpu/debugfs.c | 1 + + arch/x86/kernel/cpu/scattered.c | 3 +- + arch/x86/kernel/cpu/topology_amd.c | 3 + + arch/x86/kernel/cpu/topology_common.c | 34 ++++ + arch/x86/kernel/smpboot.c | 5 +- + arch/x86/mm/init.c | 23 ++- + drivers/cpufreq/amd-pstate-ut.c | 6 +- + drivers/cpufreq/amd-pstate.c | 231 ++++++++++------------- + tools/arch/x86/include/asm/cpufeatures.h | 2 +- + 14 files changed, 214 insertions(+), 153 deletions(-) -diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst -index d0324d44f548..210a808b74ec 100644 ---- a/Documentation/admin-guide/pm/amd-pstate.rst -+++ b/Documentation/admin-guide/pm/amd-pstate.rst -@@ -251,7 +251,9 @@ performance supported in `AMD CPPC Performance Capability `_). - In some ASICs, the highest CPPC performance is not the one in the ``_CPC`` - table, so we need to expose it to sysfs. If boost is not active, but - still supported, this maximum frequency will be larger than the one in --``cpuinfo``. -+``cpuinfo``. On systems that support preferred core, the driver will have -+different values for some cores than others and this will reflect the values -+advertised by the platform at bootup. - This attribute is read-only. - - ``amd_pstate_lowest_nonlinear_freq`` -@@ -262,6 +264,17 @@ lowest non-linear performance in `AMD CPPC Performance Capability - `_.) - This attribute is read-only. - -+``amd_pstate_hw_prefcore`` -+ -+Whether the platform supports the preferred core feature and it has been -+enabled. This attribute is read-only. -+ -+``amd_pstate_prefcore_ranking`` -+ -+The performance ranking of the core. 
This number doesn't have any unit, but -+larger numbers are preferred at the time of reading. This can change at -+runtime based on platform conditions. This attribute is read-only. -+ - ``energy_performance_available_preferences`` - - A list of all the supported EPP preferences that could be used for diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index 913fd3a7bac6..51b38bc66796 100644 +index 913fd3a7bac6..a7c93191b7c6 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -473,7 +473,8 @@ @@ -471,17 +324,17 @@ index 913fd3a7bac6..51b38bc66796 100644 #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ -#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ +#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* Fast CPPC */ -+#define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */ ++#define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */ /* * BUG word(s) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h -index f81a851c46dc..a44a3e026c4f 100644 +index 1a42f829667a..736764472048 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h -@@ -257,4 +257,10 @@ - #define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */ - #define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ +@@ -183,4 +183,10 @@ + /* Family 19 */ + #define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ +/* CPU core types */ +enum intel_cpu_type { @@ -491,7 +344,7 @@ index f81a851c46dc..a44a3e026c4f 100644 + #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index a75a07f4931f..e17f4d733e44 100644 +index 4a686f0e5dbf..c0975815980c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -105,6 +105,24 @@ struct cpuinfo_topology { @@ -519,25 +372,8 @@ index a75a07f4931f..e17f4d733e44 100644 }; struct cpuinfo_x86 { -@@ -691,8 +709,6 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu) - } - - #ifdef CONFIG_CPU_SUP_AMD --extern u32 amd_get_highest_perf(void); -- - /* - * Issue a DIV 0/1 insn to clear any division data from previous DIV - * operations. 
-@@ -705,7 +721,6 @@ static __always_inline void amd_clear_divider(void) - - extern void amd_check_microcode(void); - #else --static inline u32 amd_get_highest_perf(void) { return 0; } - static inline void amd_clear_divider(void) { } - static inline void amd_check_microcode(void) { } - #endif diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h -index abe3a8f22cbd..94d9832a5bc8 100644 +index aef70336d624..9f9376db64e3 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -114,6 +114,12 @@ enum x86_topology_domains { @@ -564,210 +400,24 @@ index abe3a8f22cbd..94d9832a5bc8 100644 { return __max_logical_packages; diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c -index ff8f25faca3d..59edf64ad9ed 100644 +index 956984054bf3..59edf64ad9ed 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c -@@ -9,6 +9,17 @@ - #include - #include - -+#define CPPC_HIGHEST_PERF_PERFORMANCE 196 -+#define CPPC_HIGHEST_PERF_PREFCORE 166 -+ -+enum amd_pref_core { -+ AMD_PREF_CORE_UNKNOWN = 0, -+ AMD_PREF_CORE_SUPPORTED, -+ AMD_PREF_CORE_UNSUPPORTED, -+}; -+static enum amd_pref_core amd_pref_core_detected; -+static u64 boost_numerator; -+ - /* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ - - bool cpc_supported_by_cpu(void) -@@ -69,31 +80,30 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) - static void amd_set_max_freq_ratio(void) +@@ -234,8 +234,10 @@ EXPORT_SYMBOL_GPL(amd_detect_prefcore); + */ + int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) { - struct cppc_perf_caps perf_caps; -- u64 highest_perf, nominal_perf; -+ u64 numerator, nominal_perf; - u64 perf_ratio; - int rc; - - rc = cppc_get_perf_caps(0, &perf_caps); - if (rc) { -- pr_debug("Could not retrieve perf counters (%d)\n", rc); -+ pr_warn("Could not retrieve perf counters (%d)\n", rc); - return; - } - -- highest_perf = amd_get_highest_perf(); -+ rc = amd_get_boost_ratio_numerator(0, &numerator); -+ if (rc) { -+ pr_warn("Could not retrieve highest performance (%d)\n", rc); -+ return; -+ } - nominal_perf = perf_caps.nominal_perf; - -- if (!highest_perf || !nominal_perf) { -- pr_debug("Could not retrieve highest or nominal performance\n"); -+ if (!nominal_perf) { -+ pr_warn("Could not retrieve nominal performance\n"); - return; - } - -- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); - /* midpoint between max_boost and max_P */ -- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; -- if (!perf_ratio) { -- pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); -- return; -- } -+ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1; - - freq_invariance_set_perf_ratio(perf_ratio, false); - } -@@ -116,3 +126,166 @@ void init_freq_invariance_cppc(void) - init_done = true; - mutex_unlock(&freq_invariance_lock); - } -+ -+/* -+ * Get the highest performance register value. -+ * @cpu: CPU from which to get highest performance. -+ * @highest_perf: Return address for highest performance value. -+ * -+ * Return: 0 for success, negative error code otherwise. 
-+ */ -+int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) -+{ -+ u64 val; -+ int ret; -+ -+ if (cpu_feature_enabled(X86_FEATURE_CPPC)) { -+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val); -+ if (ret) -+ goto out; -+ -+ val = AMD_CPPC_HIGHEST_PERF(val); -+ } else { -+ ret = cppc_get_highest_perf(cpu, &val); -+ if (ret) -+ goto out; -+ } -+ -+ WRITE_ONCE(*highest_perf, (u32)val); -+out: -+ return ret; -+} -+EXPORT_SYMBOL_GPL(amd_get_highest_perf); -+ -+/** -+ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores -+ * @detected: Output variable for the result of the detection. -+ * -+ * Determine whether CPUs in the system support preferred cores. On systems -+ * that support preferred cores, different highest perf values will be found -+ * on different cores. On other systems, the highest perf value will be the -+ * same on all cores. -+ * -+ * The result of the detection will be stored in the 'detected' parameter. -+ * -+ * Return: 0 for success, negative error code otherwise -+ */ -+int amd_detect_prefcore(bool *detected) -+{ -+ int cpu, count = 0; -+ u64 highest_perf[2] = {0}; -+ -+ if (WARN_ON(!detected)) -+ return -EINVAL; -+ -+ switch (amd_pref_core_detected) { -+ case AMD_PREF_CORE_SUPPORTED: -+ *detected = true; -+ return 0; -+ case AMD_PREF_CORE_UNSUPPORTED: -+ *detected = false; -+ return 0; -+ default: -+ break; -+ } -+ -+ for_each_present_cpu(cpu) { -+ u32 tmp; -+ int ret; -+ -+ ret = amd_get_highest_perf(cpu, &tmp); -+ if (ret) -+ return ret; -+ -+ if (!count || (count == 1 && tmp != highest_perf[0])) -+ highest_perf[count++] = tmp; -+ -+ if (count == 2) -+ break; -+ } -+ -+ *detected = (count == 2); -+ boost_numerator = highest_perf[0]; -+ -+ amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED : -+ AMD_PREF_CORE_UNSUPPORTED; -+ -+ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n", -+ *detected ? "" : "un", highest_perf[0]); -+ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(amd_detect_prefcore); -+ -+/** -+ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation -+ * @cpu: CPU to get numerator for. -+ * @numerator: Output variable for numerator. -+ * -+ * Determine the numerator to use for calculating the boost ratio on -+ * a CPU. On systems that support preferred cores, this will be a hardcoded -+ * value. On other systems this will the highest performance register value. -+ * -+ * If booting the system with amd-pstate enabled but preferred cores disabled then -+ * the correct boost numerator will be returned to match hardware capabilities -+ * even if the preferred cores scheduling hints are not enabled. -+ * -+ * Return: 0 for success, negative error code otherwise. -+ */ -+int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) -+{ + enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu)); -+ bool prefcore; -+ int ret; + bool prefcore; + int ret; + u32 tmp; -+ -+ ret = amd_detect_prefcore(&prefcore); -+ if (ret) -+ return ret; -+ -+ /* without preferred cores, return the highest perf register value */ -+ if (!prefcore) { -+ *numerator = boost_numerator; -+ return 0; -+ } -+ -+ /* -+ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f, -+ * the highest performance level is set to 196. -+ * https://bugzilla.kernel.org/show_bug.cgi?id=218759 -+ */ -+ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) { -+ switch (boot_cpu_data.x86_model) { -+ case 0x70 ... 
0x7f: -+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; -+ return 0; -+ default: -+ break; -+ } -+ } + + ret = amd_detect_prefcore(&prefcore); + if (ret) +@@ -261,6 +263,27 @@ int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) + break; + } + } + + /* detect if running on heterogeneous design */ + if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) { @@ -789,38 +439,9 @@ index ff8f25faca3d..59edf64ad9ed 100644 + } + } + -+ *numerator = CPPC_HIGHEST_PERF_PREFCORE; -+ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator); -diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c -index f01b72052f79..fab5caec0b72 100644 ---- a/arch/x86/kernel/cpu/amd.c -+++ b/arch/x86/kernel/cpu/amd.c -@@ -1190,22 +1190,6 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr) - } - EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); + *numerator = CPPC_HIGHEST_PERF_PREFCORE; --u32 amd_get_highest_perf(void) --{ -- struct cpuinfo_x86 *c = &boot_cpu_data; -- -- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || -- (c->x86_model >= 0x70 && c->x86_model < 0x80))) -- return 166; -- -- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || -- (c->x86_model >= 0x40 && c->x86_model < 0x70))) -- return 166; -- -- return 255; --} --EXPORT_SYMBOL_GPL(amd_get_highest_perf); -- - static void zenbleed_check_cpu(void *unused) - { - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + return 0; diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index 3baf3e435834..10719aba6276 100644 --- a/arch/x86/kernel/cpu/debugfs.c @@ -934,7 +555,7 @@ index 9a6069e7133c..8277c64f88db 100644 case X86_VENDOR_HYGON: if (IS_ENABLED(CONFIG_CPU_SUP_HYGON)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index 390e4fe7433e..9ee84f58f3b4 100644 +index 766f092dab80..b5a8f0891135 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -497,8 +497,9 @@ static int x86_cluster_flags(void) @@ -949,44 +570,76 @@ index 390e4fe7433e..9ee84f58f3b4 100644 return 0; } -diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c -index a8ca625a98b8..0f04feb6cafa 100644 ---- a/drivers/cpufreq/acpi-cpufreq.c -+++ b/drivers/cpufreq/acpi-cpufreq.c -@@ -642,10 +642,16 @@ static u64 get_max_boost_ratio(unsigned int cpu) - return 0; - } +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index eb503f53c319..101725c149c4 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -263,28 +263,33 @@ static void __init probe_page_size_mask(void) + } -- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) -- highest_perf = amd_get_highest_perf(); -- else -+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { -+ ret = amd_get_boost_ratio_numerator(cpu, &highest_perf); -+ if (ret) { -+ pr_debug("CPU%d: Unable to get boost ratio numerator (%d)\n", -+ cpu, ret); -+ return 0; -+ } -+ } else { - highest_perf = perf_caps.highest_perf; -+ } + /* +- * INVLPG may not properly flush Global entries +- * on these CPUs when PCIDs are enabled. ++ * INVLPG may not properly flush Global entries on ++ * these CPUs. New microcode fixes the issue. 
+ */ + static const struct x86_cpu_id invlpg_miss_ids[] = { +- X86_MATCH_VFM(INTEL_ALDERLAKE, 0), +- X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0), +- X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0), +- X86_MATCH_VFM(INTEL_RAPTORLAKE, 0), +- X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0), +- X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0), ++ X86_MATCH_VFM(INTEL_ALDERLAKE, 0x2e), ++ X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0x42c), ++ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0x11), ++ X86_MATCH_VFM(INTEL_RAPTORLAKE, 0x118), ++ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0x4117), ++ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0x2e), + {} + }; - nominal_perf = perf_caps.nominal_perf; + static void setup_pcid(void) + { ++ const struct x86_cpu_id *invlpg_miss_match; ++ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (!boot_cpu_has(X86_FEATURE_PCID)) + return; + +- if (x86_match_cpu(invlpg_miss_ids)) { ++ invlpg_miss_match = x86_match_cpu(invlpg_miss_ids); ++ ++ if (invlpg_miss_match && ++ boot_cpu_data.microcode < invlpg_miss_match->driver_data) { + pr_info("Incomplete global flushes, disabling PCID"); + setup_clear_cpu_cap(X86_FEATURE_PCID); + return; +diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c +index f66701514d90..a261d7300951 100644 +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -227,10 +227,10 @@ static void amd_pstate_ut_check_freq(u32 index) + goto skip_test; + } + +- if (cpudata->min_freq != policy->min) { ++ if (cpudata->lowest_nonlinear_freq != policy->min) { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; +- pr_err("%s cpu%d cpudata_min_freq=%d policy_min=%d, they should be equal!\n", +- __func__, cpu, cpudata->min_freq, policy->min); ++ pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n", ++ __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min); + goto skip_test; + } diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 589fde37ccd7..fb0a72ccff79 100644 +index b63863f77c67..d7630bab2516 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c -@@ -52,8 +52,6 @@ - #define AMD_PSTATE_TRANSITION_LATENCY 20000 - #define AMD_PSTATE_TRANSITION_DELAY 1000 - #define AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY 600 --#define CPPC_HIGHEST_PERF_PERFORMANCE 196 --#define CPPC_HIGHEST_PERF_DEFAULT 166 - - #define AMD_CPPC_EPP_PERFORMANCE 0x00 - #define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 -@@ -239,7 +237,7 @@ static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) +@@ -233,7 +233,7 @@ static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) return index; } @@ -995,7 +648,7 @@ index 589fde37ccd7..fb0a72ccff79 100644 u32 des_perf, u32 max_perf, bool fast_switch) { if (fast_switch) -@@ -249,7 +247,7 @@ static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, +@@ -243,7 +243,7 @@ static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, READ_ONCE(cpudata->cppc_req_cached)); } @@ -1004,80 +657,54 @@ index 589fde37ccd7..fb0a72ccff79 100644 static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, u32 des_perf, -@@ -312,7 +310,7 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, +@@ -306,11 +306,17 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, return ret; } -static inline int pstate_enable(bool enable) -+static inline int msr_enable(bool enable) ++static inline int msr_cppc_enable(bool enable) { int ret, cpu; unsigned long logical_proc_id_mask = 0; -@@ 
-338,7 +336,7 @@ static inline int pstate_enable(bool enable) + ++ /* ++ * MSR_AMD_CPPC_ENABLE is write-once, once set it cannot be cleared. ++ */ ++ if (!enable) ++ return 0; ++ + if (enable == cppc_enabled) + return 0; + +@@ -332,7 +338,7 @@ static inline int pstate_enable(bool enable) return 0; } -static int cppc_enable(bool enable) -+static int shmem_enable(bool enable) ++static int shmem_cppc_enable(bool enable) { int cpu, ret = 0; struct cppc_perf_ctrls perf_ctrls; -@@ -365,50 +363,24 @@ static int cppc_enable(bool enable) +@@ -359,14 +365,14 @@ static int cppc_enable(bool enable) return ret; } -DEFINE_STATIC_CALL(amd_pstate_enable, pstate_enable); -+DEFINE_STATIC_CALL(amd_pstate_enable, msr_enable); ++DEFINE_STATIC_CALL(amd_pstate_cppc_enable, msr_cppc_enable); - static inline int amd_pstate_enable(bool enable) +-static inline int amd_pstate_enable(bool enable) ++static inline int amd_pstate_cppc_enable(bool enable) { - return static_call(amd_pstate_enable)(enable); +- return static_call(amd_pstate_enable)(enable); ++ return static_call(amd_pstate_cppc_enable)(enable); } --static u32 amd_pstate_highest_perf_set(struct amd_cpudata *cpudata) --{ -- struct cpuinfo_x86 *c = &cpu_data(0); -- -- /* -- * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f, -- * the highest performance level is set to 196. -- * https://bugzilla.kernel.org/show_bug.cgi?id=218759 -- */ -- if (c->x86 == 0x19 && (c->x86_model >= 0x70 && c->x86_model <= 0x7f)) -- return CPPC_HIGHEST_PERF_PERFORMANCE; -- -- return CPPC_HIGHEST_PERF_DEFAULT; --} -- -static int pstate_init_perf(struct amd_cpudata *cpudata) +static int msr_init_perf(struct amd_cpudata *cpudata) { u64 cap1; -- u32 highest_perf; - int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, - &cap1); - if (ret) - return ret; - -- /* For platforms that do not support the preferred core feature, the -- * highest_pef may be configured with 166 or 255, to avoid max frequency -- * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as -- * the default max perf. 
-- */ -- if (cpudata->hw_prefcore) -- highest_perf = amd_pstate_highest_perf_set(cpudata); -- else -- highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); -- -- WRITE_ONCE(cpudata->highest_perf, highest_perf); -- WRITE_ONCE(cpudata->max_limit_perf, highest_perf); -+ WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); -+ WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1)); - WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); - WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); - WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); -@@ -417,22 +389,16 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) +@@ -385,7 +391,7 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) return 0; } @@ -1085,25 +712,8 @@ index 589fde37ccd7..fb0a72ccff79 100644 +static int shmem_init_perf(struct amd_cpudata *cpudata) { struct cppc_perf_caps cppc_perf; -- u32 highest_perf; - int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; - -- if (cpudata->hw_prefcore) -- highest_perf = amd_pstate_highest_perf_set(cpudata); -- else -- highest_perf = cppc_perf.highest_perf; -- -- WRITE_ONCE(cpudata->highest_perf, highest_perf); -- WRITE_ONCE(cpudata->max_limit_perf, highest_perf); -+ WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); -+ WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf); - WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); - WRITE_ONCE(cpudata->lowest_nonlinear_perf, - cppc_perf.lowest_nonlinear_perf); -@@ -458,14 +424,14 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) +@@ -420,14 +426,14 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) return ret; } @@ -1120,7 +730,7 @@ index 589fde37ccd7..fb0a72ccff79 100644 u32 min_perf, u32 des_perf, u32 max_perf, bool fast_switch) { -@@ -565,20 +531,44 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, +@@ -527,9 +533,28 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, cpufreq_cpu_put(policy); } @@ -1151,40 +761,7 @@ index 589fde37ccd7..fb0a72ccff79 100644 return 0; } - - static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) - { -- u32 max_limit_perf, min_limit_perf, lowest_perf; -+ u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf; - struct amd_cpudata *cpudata = policy->driver_data; - -- max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); -- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); -+ if (cpudata->boost_supported && !policy->boost_enabled) -+ max_perf = READ_ONCE(cpudata->nominal_perf); -+ else -+ max_perf = READ_ONCE(cpudata->highest_perf); -+ -+ max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); -+ min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); - - lowest_perf = READ_ONCE(cpudata->lowest_perf); - if (min_limit_perf < lowest_perf) -@@ -659,12 +649,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, - unsigned long max_perf, min_perf, des_perf, - cap_perf, lowest_nonlinear_perf; - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); -- struct amd_cpudata *cpudata; -- -- if (!policy) -- return; -- -- cpudata = policy->driver_data; -+ struct amd_cpudata *cpudata = policy->driver_data; - - if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) - amd_pstate_update_min_max_limit(policy); -@@ -698,34 +683,12 @@ static void amd_pstate_adjust_perf(unsigned int cpu, +@@ -665,34 +690,12 @@ 
static void amd_pstate_adjust_perf(unsigned int cpu, static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) { struct amd_cpudata *cpudata = policy->driver_data; @@ -1220,110 +797,7 @@ index 589fde37ccd7..fb0a72ccff79 100644 if (on) policy->cpuinfo.max_freq = max_freq; else if (policy->cpuinfo.max_freq > nominal_freq * 1000) -@@ -811,66 +774,22 @@ static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) - } - static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); - --/* -- * Get the highest performance register value. -- * @cpu: CPU from which to get highest performance. -- * @highest_perf: Return address. -- * -- * Return: 0 for success, -EIO otherwise. -- */ --static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf) --{ -- int ret; -- -- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { -- u64 cap1; -- -- ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); -- if (ret) -- return ret; -- WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); -- } else { -- u64 cppc_highest_perf; -- -- ret = cppc_get_highest_perf(cpu, &cppc_highest_perf); -- if (ret) -- return ret; -- WRITE_ONCE(*highest_perf, cppc_highest_perf); -- } -- -- return (ret); --} -- - #define CPPC_MAX_PERF U8_MAX - - static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) - { -- int ret, prio; -- u32 highest_perf; -- -- ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf); -- if (ret) -+ /* user disabled or not detected */ -+ if (!amd_pstate_prefcore) - return; - - cpudata->hw_prefcore = true; -- /* check if CPPC preferred core feature is enabled*/ -- if (highest_perf < CPPC_MAX_PERF) -- prio = (int)highest_perf; -- else { -- pr_debug("AMD CPPC preferred core is unsupported!\n"); -- cpudata->hw_prefcore = false; -- return; -- } -- -- if (!amd_pstate_prefcore) -- return; - - /* - * The priorities can be set regardless of whether or not - * sched_set_itmt_support(true) has been called and it is valid to - * update them at any time after it has been called. 
- */ -- sched_set_itmt_core_prio(prio, cpudata->cpu); -+ sched_set_itmt_core_prio((int)READ_ONCE(cpudata->highest_perf), cpudata->cpu); - - schedule_work(&sched_prefcore_work); - } -@@ -878,27 +797,22 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) - static void amd_pstate_update_limits(unsigned int cpu) - { - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); -- struct amd_cpudata *cpudata; -+ struct amd_cpudata *cpudata = policy->driver_data; - u32 prev_high = 0, cur_high = 0; - int ret; - bool highest_perf_changed = false; - -- if (!policy) -+ if (!amd_pstate_prefcore) - return; - -- cpudata = policy->driver_data; -- - mutex_lock(&amd_pstate_driver_lock); -- if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore)) -- goto free_cpufreq_put; -- -- ret = amd_pstate_get_highest_perf(cpu, &cur_high); -+ ret = amd_get_highest_perf(cpu, &cur_high); - if (ret) - goto free_cpufreq_put; - - prev_high = READ_ONCE(cpudata->prefcore_ranking); -- if (prev_high != cur_high) { -- highest_perf_changed = true; -+ highest_perf_changed = (prev_high != cur_high); -+ if (highest_perf_changed) { - WRITE_ONCE(cpudata->prefcore_ranking, cur_high); - - if (cur_high < CPPC_MAX_PERF) -@@ -924,7 +838,7 @@ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) +@@ -847,7 +850,7 @@ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) transition_delay_ns = cppc_get_transition_latency(cpu); if (transition_delay_ns == CPUFREQ_ETERNAL) { @@ -1332,46 +806,7 @@ index 589fde37ccd7..fb0a72ccff79 100644 return AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY; else return AMD_PSTATE_TRANSITION_DELAY; -@@ -962,8 +876,8 @@ static u32 amd_pstate_get_transition_latency(unsigned int cpu) - static int amd_pstate_init_freq(struct amd_cpudata *cpudata) - { - int ret; -- u32 min_freq; -- u32 highest_perf, max_freq; -+ u32 min_freq, max_freq; -+ u64 numerator; - u32 nominal_perf, nominal_freq; - u32 lowest_nonlinear_perf, lowest_nonlinear_freq; - u32 boost_ratio, lowest_nonlinear_ratio; -@@ -985,8 +899,10 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) - - nominal_perf = READ_ONCE(cpudata->nominal_perf); - -- highest_perf = READ_ONCE(cpudata->highest_perf); -- boost_ratio = div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); -+ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); -+ if (ret) -+ return ret; -+ boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf); - max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; - - lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); -@@ -1041,12 +957,12 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) - - cpudata->cpu = policy->cpu; - -- amd_pstate_init_prefcore(cpudata); -- - ret = amd_pstate_init_perf(cpudata); - if (ret) - goto free_cpudata1; - -+ amd_pstate_init_prefcore(cpudata); -+ - ret = amd_pstate_init_freq(cpudata); - if (ret) - goto free_cpudata1; -@@ -1076,7 +992,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) +@@ -1001,7 +1004,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->fast_switch_possible = true; ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], @@ -1380,45 +815,85 @@ index 589fde37ccd7..fb0a72ccff79 100644 if (ret < 0) { dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); goto free_cpudata1; -@@ -1281,11 +1197,21 @@ static int amd_pstate_register_driver(int mode) - return -EINVAL; +@@ -1045,7 +1048,7 @@ static int amd_pstate_cpu_resume(struct cpufreq_policy *policy) + { + 
int ret; - cppc_state = mode; -+ -+ ret = amd_pstate_enable(true); -+ if (ret) { -+ pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n", -+ ret); -+ amd_pstate_driver_cleanup(); -+ return ret; -+ } -+ - ret = cpufreq_register_driver(current_pstate_driver); - if (ret) { - amd_pstate_driver_cleanup(); - return ret; - } -+ - return 0; +- ret = amd_pstate_enable(true); ++ ret = amd_pstate_cppc_enable(true); + if (ret) + pr_err("failed to enable amd-pstate during resume, return %d\n", ret); + +@@ -1056,7 +1059,7 @@ static int amd_pstate_cpu_suspend(struct cpufreq_policy *policy) + { + int ret; + +- ret = amd_pstate_enable(false); ++ ret = amd_pstate_cppc_enable(false); + if (ret) + pr_err("failed to disable amd-pstate during suspend, return %d\n", ret); + +@@ -1189,25 +1192,41 @@ static ssize_t show_energy_performance_preference( + + static void amd_pstate_driver_cleanup(void) + { +- amd_pstate_enable(false); ++ amd_pstate_cppc_enable(false); + cppc_state = AMD_PSTATE_DISABLE; + current_pstate_driver = NULL; } -@@ -1496,12 +1422,12 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) - cpudata->cpu = policy->cpu; - cpudata->epp_policy = 0; - -- amd_pstate_init_prefcore(cpudata); -- - ret = amd_pstate_init_perf(cpudata); - if (ret) - goto free_cpudata1; - -+ amd_pstate_init_prefcore(cpudata); ++static int amd_pstate_set_driver(int mode_idx) ++{ ++ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { ++ cppc_state = mode_idx; ++ if (cppc_state == AMD_PSTATE_DISABLE) ++ pr_info("driver is explicitly disabled\n"); + - ret = amd_pstate_init_freq(cpudata); - if (ret) - goto free_cpudata1; -@@ -1571,23 +1497,13 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) - static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy) ++ if (cppc_state == AMD_PSTATE_ACTIVE) ++ current_pstate_driver = &amd_pstate_epp_driver; ++ ++ if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) ++ current_pstate_driver = &amd_pstate_driver; ++ ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ + static int amd_pstate_register_driver(int mode) + { + int ret; + +- if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED) +- current_pstate_driver = &amd_pstate_driver; +- else if (mode == AMD_PSTATE_ACTIVE) +- current_pstate_driver = &amd_pstate_epp_driver; +- else +- return -EINVAL; ++ ret = amd_pstate_set_driver(mode); ++ if (ret) ++ return ret; + + cppc_state = mode; + +- ret = amd_pstate_enable(true); ++ ret = amd_pstate_cppc_enable(true); + if (ret) { + pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n", + ret); +@@ -1485,6 +1504,8 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + WRITE_ONCE(cpudata->cppc_cap1_cached, value); + } + ++ current_pstate_driver->adjust_perf = NULL; ++ + return 0; + + free_cpudata1: +@@ -1507,26 +1528,13 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) + static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; - u32 max_perf, min_perf, min_limit_perf, max_limit_perf; @@ -1426,10 +901,14 @@ index 589fde37ccd7..fb0a72ccff79 100644 u64 value; s16 epp; - max_perf = READ_ONCE(cpudata->highest_perf); +- if (cpudata->boost_supported && !policy->boost_enabled) +- max_perf = READ_ONCE(cpudata->nominal_perf); +- else +- max_perf = READ_ONCE(cpudata->highest_perf); ++ max_perf = READ_ONCE(cpudata->highest_perf); min_perf = READ_ONCE(cpudata->lowest_perf); -- 
max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); -- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); +- max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); +- min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); - - if (min_limit_perf < min_perf) - min_limit_perf = min_perf; @@ -1443,7 +922,16 @@ index 589fde37ccd7..fb0a72ccff79 100644 max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf, cpudata->max_limit_perf); -@@ -1624,12 +1540,6 @@ static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy) +@@ -1535,7 +1543,7 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + value = READ_ONCE(cpudata->cppc_req_cached); + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) +- min_perf = max_perf; ++ min_perf = min(cpudata->nominal_perf, max_perf); + + /* Initial min/max values for CPPC Performance Controls Register */ + value &= ~AMD_CPPC_MIN_PERF(~0L); +@@ -1563,12 +1571,6 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) epp = 0; @@ -1454,9 +942,42 @@ index 589fde37ccd7..fb0a72ccff79 100644 - } - WRITE_ONCE(cpudata->cppc_req_cached, value); - amd_pstate_set_epp(cpudata, epp); + return amd_pstate_set_epp(cpudata, epp); } -@@ -1737,13 +1647,6 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) +@@ -1605,7 +1607,7 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) + u64 value, max_perf; + int ret; + +- ret = amd_pstate_enable(true); ++ ret = amd_pstate_cppc_enable(true); + if (ret) + pr_err("failed to enable amd pstate during resume, return %d\n", ret); + +@@ -1616,8 +1618,9 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.max_perf = max_perf; +- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); + cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); ++ cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + } + } + +@@ -1657,9 +1660,11 @@ static void amd_pstate_epp_offline(struct cpufreq_policy *policy) + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.desired_perf = 0; ++ perf_ctrls.min_perf = min_perf; + perf_ctrls.max_perf = min_perf; +- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); + cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); ++ cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + } + mutex_unlock(&amd_pstate_limits_lock); + } +@@ -1679,13 +1684,6 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) return 0; } @@ -1470,7 +991,16 @@ index 589fde37ccd7..fb0a72ccff79 100644 static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; -@@ -1799,7 +1702,7 @@ static struct cpufreq_driver amd_pstate_driver = { +@@ -1699,7 +1697,7 @@ static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) + cpudata->suspended = true; + + /* disable CPPC in lowlevel firmware */ +- ret = amd_pstate_enable(false); ++ ret = amd_pstate_cppc_enable(false); + if (ret) + pr_err("failed to suspend, return %d\n", ret); + +@@ -1741,7 +1739,7 @@ static struct cpufreq_driver amd_pstate_driver = { static struct cpufreq_driver amd_pstate_epp_driver = { .flags = 
CPUFREQ_CONST_LOOPS, @@ -1479,129 +1009,118 @@ index 589fde37ccd7..fb0a72ccff79 100644 .setpolicy = amd_pstate_epp_set_policy, .init = amd_pstate_epp_cpu_init, .exit = amd_pstate_epp_cpu_exit, -@@ -1832,7 +1735,7 @@ static int __init amd_pstate_set_driver(int mode_idx) - return -EINVAL; - } +@@ -1755,26 +1753,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { + .attr = amd_pstate_epp_attr, + }; +-static int __init amd_pstate_set_driver(int mode_idx) +-{ +- if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { +- cppc_state = mode_idx; +- if (cppc_state == AMD_PSTATE_DISABLE) +- pr_info("driver is explicitly disabled\n"); +- +- if (cppc_state == AMD_PSTATE_ACTIVE) +- current_pstate_driver = &amd_pstate_epp_driver; +- +- if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) +- current_pstate_driver = &amd_pstate_driver; +- +- return 0; +- } +- +- return -EINVAL; +-} +- -/** +/* * CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F. * show the debug message that helps to check if the CPU has CPPC support for loading issue. */ -@@ -1955,9 +1858,15 @@ static int __init amd_pstate_init(void) - current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; +@@ -1864,10 +1843,10 @@ static int __init amd_pstate_init(void) + if (cppc_state == AMD_PSTATE_UNDEFINED) { + /* Disable on the following configs by default: + * 1. Undefined platforms +- * 2. Server platforms ++ * 2. Server platforms with CPUs older than Family 0x1A. + */ + if (amd_pstate_acpi_pm_profile_undefined() || +- amd_pstate_acpi_pm_profile_server()) { ++ (amd_pstate_acpi_pm_profile_server() && boot_cpu_data.x86 < 0x1A)) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; + } +@@ -1875,50 +1854,31 @@ static int __init amd_pstate_init(void) + cppc_state = CONFIG_X86_AMD_PSTATE_DEFAULT_MODE; + } + +- switch (cppc_state) { +- case AMD_PSTATE_DISABLE: ++ if (cppc_state == AMD_PSTATE_DISABLE) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; +- case AMD_PSTATE_PASSIVE: +- case AMD_PSTATE_ACTIVE: +- case AMD_PSTATE_GUIDED: +- ret = amd_pstate_set_driver(cppc_state); +- if (ret) +- return ret; +- break; +- default: +- return -EINVAL; + } + + /* capability check */ + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); +- if (cppc_state != AMD_PSTATE_ACTIVE) +- current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; } else { pr_debug("AMD CPPC shared memory based functionality is supported\n"); - static_call_update(amd_pstate_enable, cppc_enable); - static_call_update(amd_pstate_init_perf, cppc_init_perf); - static_call_update(amd_pstate_update_perf, cppc_update_perf); -+ static_call_update(amd_pstate_enable, shmem_enable); ++ static_call_update(amd_pstate_cppc_enable, shmem_cppc_enable); + static_call_update(amd_pstate_init_perf, shmem_init_perf); + static_call_update(amd_pstate_update_perf, shmem_update_perf); -+ } -+ + } + +- if (amd_pstate_prefcore) { +- ret = amd_detect_prefcore(&amd_pstate_prefcore); +- if (ret) +- return ret; +- } +- +- /* enable amd pstate feature */ +- ret = amd_pstate_enable(true); ++ ret = amd_pstate_register_driver(cppc_state); + if (ret) { +- pr_err("failed to enable driver mode(%d)\n", cppc_state); ++ pr_err("failed to register with return %d\n", ret); + return ret; + } + +- ret = cpufreq_register_driver(current_pstate_driver); +- if (ret) { +- pr_err("failed to register with return 
%d\n", ret); +- goto disable_driver; + if (amd_pstate_prefcore) { + ret = amd_detect_prefcore(&amd_pstate_prefcore); + if (ret) + return ret; } - /* enable amd pstate feature */ -diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h -index e1720d930666..76e44e102780 100644 ---- a/include/acpi/cppc_acpi.h -+++ b/include/acpi/cppc_acpi.h -@@ -161,34 +161,37 @@ extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); - extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); - extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps); - extern int cppc_set_auto_sel(int cpu, bool enable); -+extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf); -+extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator); -+extern int amd_detect_prefcore(bool *detected); - #else /* !CONFIG_ACPI_CPPC_LIB */ - static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline int cppc_set_enable(int cpu, bool enable) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline bool cppc_perf_ctrs_in_pcc(void) - { -@@ -212,27 +215,39 @@ static inline bool cpc_ffh_supported(void) - } - static inline int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline int cppc_set_auto_sel(int cpu, bool enable) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; - } - static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps) - { -- return -ENOTSUPP; -+ return -EOPNOTSUPP; -+} -+static inline int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) -+{ -+ return -ENODEV; -+} -+static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) -+{ -+ return -EOPNOTSUPP; -+} -+static inline int amd_detect_prefcore(bool *detected) -+{ -+ return -ENODEV; - } - #endif /* !CONFIG_ACPI_CPPC_LIB */ + dev_root = bus_get_dev_root(&cpu_subsys); +@@ -1935,8 +1895,7 @@ static int __init amd_pstate_init(void) + global_attr_free: + cpufreq_unregister_driver(current_pstate_driver); +-disable_driver: +- amd_pstate_enable(false); ++ amd_pstate_cppc_enable(false); + return ret; + } + device_initcall(amd_pstate_init); diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index dd4682857c12..23698d0f4bb4 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h @@ -1616,14 +1135,760 @@ index 
dd4682857c12..23698d0f4bb4 100644 /* * BUG word(s) -- -2.47.0.rc0 +2.47.0 -From f3e882f80066e4cfda6767e245e95ca280db8bc0 Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:52:00 +0800 +From 76091118b35aeec1bf99e555aa14a8814ed9fbca Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 14:53:16 +0100 +Subject: [PATCH 03/13] autofdo + +Signed-off-by: Peter Jung +--- + Documentation/dev-tools/autofdo.rst | 168 ++++++++++++++++++++++++++ + Documentation/dev-tools/index.rst | 2 + + Documentation/dev-tools/propeller.rst | 162 +++++++++++++++++++++++++ + MAINTAINERS | 14 +++ + Makefile | 2 + + arch/Kconfig | 39 ++++++ + arch/sparc/kernel/vmlinux.lds.S | 5 + + arch/x86/Kconfig | 2 + + arch/x86/kernel/vmlinux.lds.S | 4 + + include/asm-generic/vmlinux.lds.h | 49 ++++++-- + scripts/Makefile.autofdo | 24 ++++ + scripts/Makefile.lib | 20 +++ + scripts/Makefile.propeller | 28 +++++ + tools/objtool/check.c | 2 + + tools/objtool/elf.c | 15 ++- + 15 files changed, 520 insertions(+), 16 deletions(-) + create mode 100644 Documentation/dev-tools/autofdo.rst + create mode 100644 Documentation/dev-tools/propeller.rst + create mode 100644 scripts/Makefile.autofdo + create mode 100644 scripts/Makefile.propeller + +diff --git a/Documentation/dev-tools/autofdo.rst b/Documentation/dev-tools/autofdo.rst +new file mode 100644 +index 000000000000..1f0a451e9ccd +--- /dev/null ++++ b/Documentation/dev-tools/autofdo.rst +@@ -0,0 +1,168 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++=================================== ++Using AutoFDO with the Linux kernel ++=================================== ++ ++This enables AutoFDO build support for the kernel when using ++the Clang compiler. AutoFDO (Auto-Feedback-Directed Optimization) ++is a type of profile-guided optimization (PGO) used to enhance the ++performance of binary executables. It gathers information about the ++frequency of execution of various code paths within a binary using ++hardware sampling. This data is then used to guide the compiler's ++optimization decisions, resulting in a more efficient binary. AutoFDO ++is a powerful optimization technique, and data indicates that it can ++significantly improve kernel performance. It's especially beneficial ++for workloads affected by front-end stalls. ++ ++For AutoFDO builds, unlike non-FDO builds, the user must supply a ++profile. Acquiring an AutoFDO profile can be done in several ways. ++AutoFDO profiles are created by converting hardware sampling using ++the "perf" tool. It is crucial that the workload used to create these ++perf files is representative; they must exhibit runtime ++characteristics similar to the workloads that are intended to be ++optimized. Failure to do so will result in the compiler optimizing ++for the wrong objective. ++ ++The AutoFDO profile often encapsulates the program's behavior. If the ++performance-critical codes are architecture-independent, the profile ++can be applied across platforms to achieve performance gains. For ++instance, using the profile generated on Intel architecture to build ++a kernel for AMD architecture can also yield performance improvements. ++ ++There are two methods for acquiring a representative profile: ++(1) Sample real workloads using a production environment. ++(2) Generate the profile using a representative load test. ++When enabling the AutoFDO build configuration without providing an ++AutoFDO profile, the compiler only modifies the dwarf information in ++the kernel without impacting runtime performance. 
It's advisable to ++use a kernel binary built with the same AutoFDO configuration to ++collect the perf profile. While it's possible to use a kernel built ++with different options, it may result in inferior performance. ++ ++One can collect profiles using AutoFDO build for the previous kernel. ++AutoFDO employs relative line numbers to match the profiles, offering ++some tolerance for source changes. This mode is commonly used in a ++production environment for profile collection. ++ ++In a profile collection based on a load test, the AutoFDO collection ++process consists of the following steps: ++ ++#. Initial build: The kernel is built with AutoFDO options ++ without a profile. ++ ++#. Profiling: The above kernel is then run with a representative ++ workload to gather execution frequency data. This data is ++ collected using hardware sampling, via perf. AutoFDO is most ++ effective on platforms supporting advanced PMU features like ++ LBR on Intel machines. ++ ++#. AutoFDO profile generation: Perf output file is converted to ++ the AutoFDO profile via offline tools. ++ ++The support requires a Clang compiler LLVM 17 or later. ++ ++Preparation ++=========== ++ ++Configure the kernel with:: ++ ++ CONFIG_AUTOFDO_CLANG=y ++ ++Customization ++============= ++ ++The default CONFIG_AUTOFDO_CLANG setting covers kernel space objects for ++AutoFDO builds. One can, however, enable or disable AutoFDO build for ++individual files and directories by adding a line similar to the following ++to the respective kernel Makefile: ++ ++- For enabling a single file (e.g. foo.o) :: ++ ++ AUTOFDO_PROFILE_foo.o := y ++ ++- For enabling all files in one directory :: ++ ++ AUTOFDO_PROFILE := y ++ ++- For disabling one file :: ++ ++ AUTOFDO_PROFILE_foo.o := n ++ ++- For disabling all files in one directory :: ++ ++ AUTOFDO_PROFILE := n ++ ++Workflow ++======== ++ ++Here is an example workflow for AutoFDO kernel: ++ ++1) Build the kernel on the host machine with LLVM enabled, ++ for example, :: ++ ++ $ make menuconfig LLVM=1 ++ ++ Turn on AutoFDO build config:: ++ ++ CONFIG_AUTOFDO_CLANG=y ++ ++ With a configuration that with LLVM enabled, use the following command:: ++ ++ $ scripts/config -e AUTOFDO_CLANG ++ ++ After getting the config, build with :: ++ ++ $ make LLVM=1 ++ ++2) Install the kernel on the test machine. ++ ++3) Run the load tests. The '-c' option in perf specifies the sample ++ event period. We suggest using a suitable prime number, like 500009, ++ for this purpose. ++ ++ - For Intel platforms:: ++ ++ $ perf record -e BR_INST_RETIRED.NEAR_TAKEN:k -a -N -b -c -o -- ++ ++ - For AMD platforms: ++ ++ The supported systems are: Zen3 with BRS, or Zen4 with amd_lbr_v2. To check, ++ ++ For Zen3:: ++ ++ $ cat proc/cpuinfo | grep " brs" ++ ++ For Zen4:: ++ ++ $ cat proc/cpuinfo | grep amd_lbr_v2 ++ ++ The following command generated the perf data file:: ++ ++ $ perf record --pfm-events RETIRED_TAKEN_BRANCH_INSTRUCTIONS:k -a -N -b -c -o -- ++ ++4) (Optional) Download the raw perf file to the host machine. ++ ++5) To generate an AutoFDO profile, two offline tools are available: ++ create_llvm_prof and llvm_profgen. The create_llvm_prof tool is part ++ of the AutoFDO project and can be found on GitHub ++ (https://github.com/google/autofdo), version v0.30.1 or later. ++ The llvm_profgen tool is included in the LLVM compiler itself. It's ++ important to note that the version of llvm_profgen doesn't need to match ++ the version of Clang. 
It needs to be the LLVM 19 release of Clang ++ or later, or just from the LLVM trunk. :: ++ ++ $ llvm-profgen --kernel --binary= --perfdata= -o ++ ++ or :: ++ ++ $ create_llvm_prof --binary= --profile= --format=extbinary --out= ++ ++ Note that multiple AutoFDO profile files can be merged into one via:: ++ ++ $ llvm-profdata merge -o ... ++ ++6) Rebuild the kernel using the AutoFDO profile file with the same config as step 1, ++ (Note CONFIG_AUTOFDO_CLANG needs to be enabled):: ++ ++ $ make LLVM=1 CLANG_AUTOFDO_PROFILE= +diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst +index 53d4d124f9c5..3c0ac08b2709 100644 +--- a/Documentation/dev-tools/index.rst ++++ b/Documentation/dev-tools/index.rst +@@ -34,6 +34,8 @@ Documentation/dev-tools/testing-overview.rst + ktap + checkuapi + gpio-sloppy-logic-analyzer ++ autofdo ++ propeller + + + .. only:: subproject and html +diff --git a/Documentation/dev-tools/propeller.rst b/Documentation/dev-tools/propeller.rst +new file mode 100644 +index 000000000000..92195958e3db +--- /dev/null ++++ b/Documentation/dev-tools/propeller.rst +@@ -0,0 +1,162 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++===================================== ++Using Propeller with the Linux kernel ++===================================== ++ ++This enables Propeller build support for the kernel when using Clang ++compiler. Propeller is a profile-guided optimization (PGO) method used ++to optimize binary executables. Like AutoFDO, it utilizes hardware ++sampling to gather information about the frequency of execution of ++different code paths within a binary. Unlike AutoFDO, this information ++is then used right before linking phase to optimize (among others) ++block layout within and across functions. ++ ++A few important notes about adopting Propeller optimization: ++ ++#. Although it can be used as a standalone optimization step, it is ++ strongly recommended to apply Propeller on top of AutoFDO, ++ AutoFDO+ThinLTO or Instrument FDO. The rest of this document ++ assumes this paradigm. ++ ++#. Propeller uses another round of profiling on top of ++ AutoFDO/AutoFDO+ThinLTO/iFDO. The whole build process involves ++ "build-afdo - train-afdo - build-propeller - train-propeller - ++ build-optimized". ++ ++#. Propeller requires LLVM 19 release or later for Clang/Clang++ ++ and the linker(ld.lld). ++ ++#. In addition to LLVM toolchain, Propeller requires a profiling ++ conversion tool: https://github.com/google/autofdo with a release ++ after v0.30.1: https://github.com/google/autofdo/releases/tag/v0.30.1. ++ ++The Propeller optimization process involves the following steps: ++ ++#. Initial building: Build the AutoFDO or AutoFDO+ThinLTO binary as ++ you would normally do, but with a set of compile-time / link-time ++ flags, so that a special metadata section is created within the ++ kernel binary. The special section is only intend to be used by the ++ profiling tool, it is not part of the runtime image, nor does it ++ change kernel run time text sections. ++ ++#. Profiling: The above kernel is then run with a representative ++ workload to gather execution frequency data. This data is collected ++ using hardware sampling, via perf. Propeller is most effective on ++ platforms supporting advanced PMU features like LBR on Intel ++ machines. This step is the same as profiling the kernel for AutoFDO ++ (the exact perf parameters can be different). ++ ++#. Propeller profile generation: Perf output file is converted to a ++ pair of Propeller profiles via an offline tool. 
++
++#. Optimized build: Build the AutoFDO or AutoFDO+ThinLTO optimized
++   binary as you would normally do, but with a compile-time /
++   link-time flag to pick up the Propeller compile time and link time
++   profiles. This build step uses 3 profiles - the AutoFDO profile,
++   the Propeller compile-time profile and the Propeller link-time
++   profile.
++
++#. Deployment: The optimized kernel binary is deployed and used
++   in production environments, providing improved performance
++   and reduced latency.
++
++Preparation
++===========
++
++Configure the kernel with::
++
++   CONFIG_AUTOFDO_CLANG=y
++   CONFIG_PROPELLER_CLANG=y
++
++Customization
++=============
++
++The default CONFIG_PROPELLER_CLANG setting covers kernel space objects
++for Propeller builds. One can, however, enable or disable Propeller build
++for individual files and directories by adding a line similar to the
++following to the respective kernel Makefile:
++
++- For enabling a single file (e.g. foo.o)::
++
++   PROPELLER_PROFILE_foo.o := y
++
++- For enabling all files in one directory::
++
++   PROPELLER_PROFILE := y
++
++- For disabling one file::
++
++   PROPELLER_PROFILE_foo.o := n
++
++- For disabling all files in one directory::
++
++   PROPELLER_PROFILE := n
++
++
++Workflow
++========
++
++Here is an example workflow for building an AutoFDO+Propeller kernel:
++
++1) Assuming an AutoFDO profile is already collected following
++   instructions in the AutoFDO document, build the kernel on the host
++   machine, with AutoFDO and Propeller build configs ::
++
++      CONFIG_AUTOFDO_CLANG=y
++      CONFIG_PROPELLER_CLANG=y
++
++   and ::
++
++      $ make LLVM=1 CLANG_AUTOFDO_PROFILE=<autofdo_profile>
++
++2) Install the kernel on the test machine.
++
++3) Run the load tests. The '-c' option in perf specifies the sample
++   event period. We suggest using a suitable prime number, like 500009,
++   for this purpose.
++
++   - For Intel platforms::
++
++      $ perf record -e BR_INST_RETIRED.NEAR_TAKEN:k -a -N -b -c <count> -o <perf_file> -- <loadtest>
++
++   - For AMD platforms::
++
++      $ perf record --pfm-event RETIRED_TAKEN_BRANCH_INSTRUCTIONS:k -a -N -b -c <count> -o <perf_file> -- <loadtest>
++
++   Note you can repeat the above steps to collect multiple <perf_file>s.
++
++4) (Optional) Download the raw perf file(s) to the host machine.
++
++5) Use the create_llvm_prof tool (https://github.com/google/autofdo) to
++   generate Propeller profile. ::
++
++      $ create_llvm_prof --binary=<vmlinux> --profile=<perf_file>
++                         --format=propeller --propeller_output_module_name
++                         --out=<propeller_profile_prefix>_cc_profile.txt
++                         --propeller_symorder=<propeller_profile_prefix>_ld_profile.txt
++
++   "<propeller_profile_prefix>" can be something like "/home/user/dir/any_string".
++
++   This command generates a pair of Propeller profiles:
++   "<propeller_profile_prefix>_cc_profile.txt" and
++   "<propeller_profile_prefix>_ld_profile.txt".
++
++   If there are more than 1 perf_file collected in the previous step,
++   you can create a temp list file "<perf_file_list>" with each line
++   containing one perf file name and run::
++
++      $ create_llvm_prof --binary=<vmlinux> --profile=@<perf_file_list>
++                         --format=propeller --propeller_output_module_name
++                         --out=<propeller_profile_prefix>_cc_profile.txt
++                         --propeller_symorder=<propeller_profile_prefix>_ld_profile.txt
++
++6) Rebuild the kernel using the AutoFDO and Propeller
++   profiles. ::
++
++      CONFIG_AUTOFDO_CLANG=y
++      CONFIG_PROPELLER_CLANG=y
++
++   and ::
++
++      $ make LLVM=1 CLANG_AUTOFDO_PROFILE=<autofdo_profile> CLANG_PROPELLER_PROFILE_PREFIX=<propeller_profile_prefix>
+diff --git a/MAINTAINERS b/MAINTAINERS
+index 0ebce432a134..919f01186c11 100644
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -3673,6 +3673,13 @@ F:	kernel/audit*
+ F:	lib/*audit.c
+ K:	\baudit_[a-z_0-9]\+\b
+
++AUTOFDO BUILD
++M:	Rong Xu
++M:	Han Shen
++S:	Supported
++F:	Documentation/dev-tools/autofdo.rst
++F:	scripts/Makefile.autofdo
++
+ AUXILIARY BUS DRIVER
+ M:	Greg Kroah-Hartman
+ R:	Dave Ertman
+@@ -18504,6 +18511,13 @@ S:	Maintained
+ F:	include/linux/psi*
+ F:	kernel/sched/psi.c
+
++PROPELLER BUILD
++M:	Rong Xu
++M:	Han Shen
++S:	Supported
++F:	Documentation/dev-tools/propeller.rst
++F:	scripts/Makefile.propeller
++
+ PRINTK
+ M:	Petr Mladek
+ R:	Steven Rostedt
+diff --git a/Makefile b/Makefile
+index b8efbfe9da94..2c3f65c68bec 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1018,6 +1018,8 @@ include-$(CONFIG_KMSAN) += scripts/Makefile.kmsan
+ include-$(CONFIG_UBSAN) += scripts/Makefile.ubsan
+ include-$(CONFIG_KCOV) += scripts/Makefile.kcov
+ include-$(CONFIG_RANDSTRUCT) += scripts/Makefile.randstruct
++include-$(CONFIG_AUTOFDO_CLANG) += scripts/Makefile.autofdo
++include-$(CONFIG_PROPELLER_CLANG) += scripts/Makefile.propeller
+ include-$(CONFIG_GCC_PLUGINS) += scripts/Makefile.gcc-plugins
+
+ include $(addprefix $(srctree)/, $(include-y))
+diff --git a/arch/Kconfig b/arch/Kconfig
+index bd9f095d69fa..00551f340dbe 100644
+--- a/arch/Kconfig
++++ b/arch/Kconfig
+@@ -811,6 +811,45 @@ config LTO_CLANG_THIN
+ 	  If unsure, say Y.
+ endchoice
+
++config ARCH_SUPPORTS_AUTOFDO_CLANG
++	bool
++
++config AUTOFDO_CLANG
++	bool "Enable Clang's AutoFDO build (EXPERIMENTAL)"
++	depends on ARCH_SUPPORTS_AUTOFDO_CLANG
++	depends on CC_IS_CLANG && CLANG_VERSION >= 170000
++	help
++	  This option enables Clang's AutoFDO build. When
++	  an AutoFDO profile is specified in variable
++	  CLANG_AUTOFDO_PROFILE during the build process,
++	  Clang uses the profile to optimize the kernel.
++
++	  If no profile is specified, AutoFDO options are
++	  still passed to Clang to facilitate the collection
++	  of perf data for creating an AutoFDO profile in
++	  subsequent builds.
++
++	  If unsure, say N.
++
++config ARCH_SUPPORTS_PROPELLER_CLANG
++	bool
++
++config PROPELLER_CLANG
++	bool "Enable Clang's Propeller build"
++	depends on ARCH_SUPPORTS_PROPELLER_CLANG
++	depends on CC_IS_CLANG && CLANG_VERSION >= 190000
++	help
++	  This option enables Clang's Propeller build. When the Propeller
++	  profiles are specified in variable CLANG_PROPELLER_PROFILE_PREFIX
++	  during the build process, Clang uses the profiles to optimize
++	  the kernel.
++
++	  If no profile is specified, Propeller options are still passed
++	  to Clang to facilitate the collection of perf data for creating
++	  the Propeller profiles in subsequent builds.
++
++	  If unsure, say N.
++ + config ARCH_SUPPORTS_CFI_CLANG + bool + help +diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S +index d317a843f7ea..f1b86eb30340 100644 +--- a/arch/sparc/kernel/vmlinux.lds.S ++++ b/arch/sparc/kernel/vmlinux.lds.S +@@ -48,6 +48,11 @@ SECTIONS + { + _text = .; + HEAD_TEXT ++ ALIGN_FUNCTION(); ++#ifdef CONFIG_SPARC64 ++ /* Match text section symbols in head_64.S first */ ++ *head_64.o(.text) ++#endif + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 16354dfa6d96..89b8fc452a7c 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -126,6 +126,8 @@ config X86 + select ARCH_SUPPORTS_LTO_CLANG + select ARCH_SUPPORTS_LTO_CLANG_THIN + select ARCH_SUPPORTS_RT ++ select ARCH_SUPPORTS_AUTOFDO_CLANG ++ select ARCH_SUPPORTS_PROPELLER_CLANG if X86_64 + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 + select ARCH_USE_MEMTEST +diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S +index b8c5741d2fb4..cf22081601ed 100644 +--- a/arch/x86/kernel/vmlinux.lds.S ++++ b/arch/x86/kernel/vmlinux.lds.S +@@ -443,6 +443,10 @@ SECTIONS + + STABS_DEBUG + DWARF_DEBUG ++#ifdef CONFIG_PROPELLER_CLANG ++ .llvm_bb_addr_map : { *(.llvm_bb_addr_map) } ++#endif ++ + ELF_DETAILS + + DISCARDS +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index eeadbaeccf88..c995474e4c64 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -95,18 +95,25 @@ + * With LTO_CLANG, the linker also splits sections by default, so we need + * these macros to combine the sections during the final link. + * ++ * With AUTOFDO_CLANG and PROPELLER_CLANG, by default, the linker splits ++ * text sections and regroups functions into subsections. ++ * + * RODATA_MAIN is not used because existing code already defines .rodata.x + * sections to be brought in with rodata. + */ +-#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG) ++#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG) || \ ++defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) + #define TEXT_MAIN .text .text.[0-9a-zA-Z_]* ++#else ++#define TEXT_MAIN .text ++#endif ++#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG) + #define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data..L* .data..compoundliteral* .data.$__unnamed_* .data.$L* + #define SDATA_MAIN .sdata .sdata.[0-9a-zA-Z_]* + #define RODATA_MAIN .rodata .rodata.[0-9a-zA-Z_]* .rodata..L* + #define BSS_MAIN .bss .bss.[0-9a-zA-Z_]* .bss..L* .bss..compoundliteral* + #define SBSS_MAIN .sbss .sbss.[0-9a-zA-Z_]* + #else +-#define TEXT_MAIN .text + #define DATA_MAIN .data + #define SDATA_MAIN .sdata + #define RODATA_MAIN .rodata +@@ -549,24 +556,44 @@ + __cpuidle_text_end = .; \ + __noinstr_text_end = .; + ++#define TEXT_SPLIT \ ++ __split_text_start = .; \ ++ *(.text.split .text.split.[0-9a-zA-Z_]*) \ ++ __split_text_end = .; ++ ++#define TEXT_UNLIKELY \ ++ __unlikely_text_start = .; \ ++ *(.text.unlikely .text.unlikely.*) \ ++ __unlikely_text_end = .; ++ ++#define TEXT_HOT \ ++ __hot_text_start = .; \ ++ *(.text.hot .text.hot.*) \ ++ __hot_text_end = .; ++ + /* + * .text section. 
Map to function alignment to avoid address changes + * during second ld run in second ld pass when generating System.map + * +- * TEXT_MAIN here will match .text.fixup and .text.unlikely if dead +- * code elimination is enabled, so these sections should be converted +- * to use ".." first. ++ * TEXT_MAIN here will match symbols with a fixed pattern (for example, ++ * .text.hot or .text.unlikely) if dead code elimination or ++ * function-section is enabled. Match these symbols first before ++ * TEXT_MAIN to ensure they are grouped together. ++ * ++ * Also placing .text.hot section at the beginning of a page, this ++ * would help the TLB performance. + */ + #define TEXT_TEXT \ + ALIGN_FUNCTION(); \ +- *(.text.hot .text.hot.*) \ +- *(TEXT_MAIN .text.fixup) \ +- *(.text.unlikely .text.unlikely.*) \ ++ *(.text.asan.* .text.tsan.*) \ + *(.text.unknown .text.unknown.*) \ ++ TEXT_SPLIT \ ++ TEXT_UNLIKELY \ ++ . = ALIGN(PAGE_SIZE); \ ++ TEXT_HOT \ ++ *(TEXT_MAIN .text.fixup) \ + NOINSTR_TEXT \ +- *(.ref.text) \ +- *(.text.asan.* .text.tsan.*) +- ++ *(.ref.text) + + /* sched.text is aling to function alignment to secure we have same + * address even at second ld pass when generating System.map */ +diff --git a/scripts/Makefile.autofdo b/scripts/Makefile.autofdo +new file mode 100644 +index 000000000000..1caf2457e585 +--- /dev/null ++++ b/scripts/Makefile.autofdo +@@ -0,0 +1,24 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++# Enable available and selected Clang AutoFDO features. ++ ++CFLAGS_AUTOFDO_CLANG := -fdebug-info-for-profiling -mllvm -enable-fs-discriminator=true -mllvm -improved-fs-discriminator=true ++ ++ifndef CONFIG_DEBUG_INFO ++ CFLAGS_AUTOFDO_CLANG += -gmlt ++endif ++ ++ifdef CLANG_AUTOFDO_PROFILE ++ CFLAGS_AUTOFDO_CLANG += -fprofile-sample-use=$(CLANG_AUTOFDO_PROFILE) -ffunction-sections ++ CFLAGS_AUTOFDO_CLANG += -fsplit-machine-functions ++endif ++ ++ifdef CONFIG_LTO_CLANG_THIN ++ ifdef CLANG_AUTOFDO_PROFILE ++ KBUILD_LDFLAGS += --lto-sample-profile=$(CLANG_AUTOFDO_PROFILE) ++ endif ++ KBUILD_LDFLAGS += --mllvm=-enable-fs-discriminator=true --mllvm=-improved-fs-discriminator=true -plugin-opt=thinlto ++ KBUILD_LDFLAGS += -plugin-opt=-split-machine-functions ++endif ++ ++export CFLAGS_AUTOFDO_CLANG +diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib +index 01a9f567d5af..e7859ad90224 100644 +--- a/scripts/Makefile.lib ++++ b/scripts/Makefile.lib +@@ -191,6 +191,26 @@ _c_flags += $(if $(patsubst n%,, \ + -D__KCSAN_INSTRUMENT_BARRIERS__) + endif + ++# ++# Enable AutoFDO build flags except some files or directories we don't want to ++# enable (depends on variables AUTOFDO_PROFILE_obj.o and AUTOFDO_PROFILE). ++# ++ifeq ($(CONFIG_AUTOFDO_CLANG),y) ++_c_flags += $(if $(patsubst n%,, \ ++ $(AUTOFDO_PROFILE_$(target-stem).o)$(AUTOFDO_PROFILE)$(is-kernel-object)), \ ++ $(CFLAGS_AUTOFDO_CLANG)) ++endif ++ ++# ++# Enable Propeller build flags except some files or directories we don't want to ++# enable (depends on variables AUTOFDO_PROPELLER_obj.o and PROPELLER_PROFILE). 
++# ++ifdef CONFIG_PROPELLER_CLANG ++_c_flags += $(if $(patsubst n%,, \ ++ $(AUTOFDO_PROFILE_$(target-stem).o)$(AUTOFDO_PROFILE)$(PROPELLER_PROFILE))$(is-kernel-object), \ ++ $(CFLAGS_PROPELLER_CLANG)) ++endif ++ + # $(src) for including checkin headers from generated source files + # $(obj) for including generated headers from checkin source files + ifeq ($(KBUILD_EXTMOD),) +diff --git a/scripts/Makefile.propeller b/scripts/Makefile.propeller +new file mode 100644 +index 000000000000..344190717e47 +--- /dev/null ++++ b/scripts/Makefile.propeller +@@ -0,0 +1,28 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++# Enable available and selected Clang Propeller features. ++ifdef CLANG_PROPELLER_PROFILE_PREFIX ++ CFLAGS_PROPELLER_CLANG := -fbasic-block-sections=list=$(CLANG_PROPELLER_PROFILE_PREFIX)_cc_profile.txt -ffunction-sections ++ KBUILD_LDFLAGS += --symbol-ordering-file=$(CLANG_PROPELLER_PROFILE_PREFIX)_ld_profile.txt --no-warn-symbol-ordering ++else ++ CFLAGS_PROPELLER_CLANG := -fbasic-block-sections=labels ++endif ++ ++# Propeller requires debug information to embed module names in the profiles. ++# If CONFIG_DEBUG_INFO is not enabled, set -gmlt option. Skip this for AutoFDO, ++# as the option should already be set. ++ifndef CONFIG_DEBUG_INFO ++ ifndef CONFIG_AUTOFDO_CLANG ++ CFLAGS_PROPELLER_CLANG += -gmlt ++ endif ++endif ++ ++ifdef CONFIG_LTO_CLANG_THIN ++ ifdef CLANG_PROPELLER_PROFILE_PREFIX ++ KBUILD_LDFLAGS += --lto-basic-block-sections=$(CLANG_PROPELLER_PROFILE_PREFIX)_cc_profile.txt ++ else ++ KBUILD_LDFLAGS += --lto-basic-block-sections=labels ++ endif ++endif ++ ++export CFLAGS_PROPELLER_CLANG +diff --git a/tools/objtool/check.c b/tools/objtool/check.c +index 6604f5d038aa..05a0fb4a3d1a 100644 +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -4557,6 +4557,8 @@ static int validate_ibt(struct objtool_file *file) + !strcmp(sec->name, "__jump_table") || + !strcmp(sec->name, "__mcount_loc") || + !strcmp(sec->name, ".kcfi_traps") || ++ !strcmp(sec->name, ".llvm.call-graph-profile") || ++ !strcmp(sec->name, ".llvm_bb_addr_map") || + strstr(sec->name, "__patchable_function_entries")) + continue; + +diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c +index 3d27983dc908..6f64d611faea 100644 +--- a/tools/objtool/elf.c ++++ b/tools/objtool/elf.c +@@ -224,12 +224,17 @@ int find_symbol_hole_containing(const struct section *sec, unsigned long offset) + if (n) + return 0; /* not a hole */ + +- /* didn't find a symbol for which @offset is after it */ +- if (!hole.sym) +- return 0; /* not a hole */ ++ /* ++ * @offset >= sym->offset + sym->len, find symbol after it. ++ * When hole.sym is empty, use the first node to compute the hole. ++ * If there is no symbol in the section, the first node will be NULL, ++ * in which case, -1 is returned to skip the whole section. 
++ */ ++ if (hole.sym) ++ n = rb_next(&hole.sym->node); ++ else ++ n = rb_first_cached(&sec->symbol_tree); + +- /* @offset >= sym->offset + sym->len, find symbol after it */ +- n = rb_next(&hole.sym->node); + if (!n) + return -1; /* until end of address space */ + +-- +2.47.0 + +From cb27311981625ec965a2795725ecabad07150096 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 14:47:06 +0100 Subject: [PATCH 04/13] bbr3 -Signed-off-by: Eric Naim +Signed-off-by: Peter Jung --- include/linux/tcp.h | 4 +- include/net/inet_connection_sock.h | 4 +- @@ -1674,7 +1939,7 @@ index c0deaafebfdc..d53f042d936e 100644 #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/net/tcp.h b/include/net/tcp.h -index 196c148fce8a..f37256b8abfd 100644 +index d1948d357dad..7d99f0bec5f2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -375,6 +375,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) @@ -1731,7 +1996,7 @@ index 196c148fce8a..f37256b8abfd 100644 } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; -@@ -1087,6 +1108,7 @@ enum tcp_ca_event { +@@ -1088,6 +1109,7 @@ enum tcp_ca_event { CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ @@ -1739,7 +2004,7 @@ index 196c148fce8a..f37256b8abfd 100644 }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ -@@ -1109,7 +1131,11 @@ enum tcp_ca_ack_event_flags { +@@ -1110,7 +1132,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 @@ -1752,7 +2017,7 @@ index 196c148fce8a..f37256b8abfd 100644 union tcp_cc_info; -@@ -1129,10 +1155,13 @@ struct ack_sample { +@@ -1130,10 +1156,13 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ @@ -1767,7 +2032,7 @@ index 196c148fce8a..f37256b8abfd 100644 long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ -@@ -1143,7 +1172,9 @@ struct rate_sample { +@@ -1144,7 +1173,9 @@ struct rate_sample { u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ @@ -1777,7 +2042,7 @@ index 196c148fce8a..f37256b8abfd 100644 }; struct tcp_congestion_ops { -@@ -1167,8 +1198,11 @@ struct tcp_congestion_ops { +@@ -1168,8 +1199,11 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); @@ -1791,7 +2056,7 @@ index 196c148fce8a..f37256b8abfd 100644 /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. 
(optional) -@@ -1234,6 +1268,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) +@@ -1235,6 +1269,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif @@ -1806,7 +2071,7 @@ index 196c148fce8a..f37256b8abfd 100644 static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); -@@ -1253,6 +1295,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) +@@ -1254,6 +1296,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ @@ -1814,7 +2079,7 @@ index 196c148fce8a..f37256b8abfd 100644 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); -@@ -1265,6 +1308,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) +@@ -1266,6 +1309,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } @@ -1836,7 +2101,7 @@ index 196c148fce8a..f37256b8abfd 100644 /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. -@@ -2416,7 +2474,7 @@ struct tcp_plb_state { +@@ -2417,7 +2475,7 @@ struct tcp_plb_state { u8 consec_cong_rounds:5, /* consecutive congested rounds */ unused:3; u32 pause_until; /* jiffies32 when PLB can resume rerouting */ @@ -1846,7 +2111,7 @@ index 196c148fce8a..f37256b8abfd 100644 static inline void tcp_plb_init(const struct sock *sk, struct tcp_plb_state *plb) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h -index 50655de04c9b..82f8bd8f0d16 100644 +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -229,6 +229,29 @@ struct tcp_bbr_info { @@ -1912,10 +2177,10 @@ index dbf896f3146c..4702cd2f1ffc 100644 /* * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 8e94ed7c56a0..50dc9970cad2 100644 +index 6d2c97f8e9ef..ddc116ef22cb 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig -@@ -668,15 +668,18 @@ config TCP_CONG_BBR +@@ -669,15 +669,18 @@ config TCP_CONG_BBR default n help @@ -1944,10 +2209,10 @@ index 8e94ed7c56a0..50dc9970cad2 100644 choice prompt "Default TCP congestion control" diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c -index 3f88d0961e5b..4273cac333f6 100644 +index 554804774628..2279e6e7bc9c 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c -@@ -305,11 +305,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp +@@ -280,11 +280,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp { } @@ -1964,7 +2229,7 @@ index 3f88d0961e5b..4273cac333f6 100644 static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { -@@ -340,7 +344,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { +@@ -315,7 +319,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { .cwnd_event = bpf_tcp_ca_cwnd_event, .in_ack_event = bpf_tcp_ca_in_ack_event, .pkts_acked = bpf_tcp_ca_pkts_acked, @@ -1975,10 +2240,10 @@ index 3f88d0961e5b..4273cac333f6 100644 .undo_cwnd = bpf_tcp_ca_undo_cwnd, .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, diff --git 
a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 831a18dc7aa6..d9faa8fef55e 100644 +index 4f77bd862e95..fd3a5551eda7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c -@@ -3123,6 +3123,7 @@ int tcp_disconnect(struct sock *sk, int flags) +@@ -3384,6 +3384,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; @@ -1986,7 +2251,7 @@ index 831a18dc7aa6..d9faa8fef55e 100644 /* Clean up fastopen related fields */ -@@ -3849,6 +3850,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) +@@ -4110,6 +4111,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; @@ -4652,7 +4917,7 @@ index 0306d257fa64..28f581c0dab7 100644 icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 889db23bfc05..b924a852f108 100644 +index 2d844e1f867f..efb92e47a632 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) @@ -4768,7 +5033,7 @@ index 889db23bfc05..b924a852f108 100644 return 1; old_ack: -@@ -5745,13 +5770,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +@@ -5752,13 +5777,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -4786,10 +5051,10 @@ index 889db23bfc05..b924a852f108 100644 tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c -index a19a9dbd3409..e0ef8406a326 100644 +index bb1fe1ba867a..050a80769de6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c -@@ -459,6 +459,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) +@@ -462,6 +462,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; @@ -4799,7 +5064,7 @@ index a19a9dbd3409..e0ef8406a326 100644 const struct tcp_congestion_ops *ca; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 8f67eea34779..f497c6c4a609 100644 +index 68804fd01daf..afdb62febe42 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) @@ -4990,10 +5255,10 @@ index a8f6d9d06f2e..8737f2134648 100644 rs->interval_us = max(snd_us, ack_us); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index 4d40615dc8fc..f27941201ef2 100644 +index 79064580c8c0..697270ce1ea6 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c -@@ -689,6 +689,7 @@ void tcp_write_timer_handler(struct sock *sk) +@@ -690,6 +690,7 @@ void tcp_write_timer_handler(struct sock *sk) return; } @@ -5002,28 +5267,25 @@ index 4d40615dc8fc..f27941201ef2 100644 event = icsk->icsk_pending; -- -2.47.0.rc0 +2.47.0 -From 2a2f186f1c8c99bdd1183fd28527bf95f781166c Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:52:15 +0800 +From 7b911d200df5cd539857ad49ab9e726bad8affb8 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 14:47:19 +0100 Subject: [PATCH 05/13] cachy -Signed-off-by: Eric Naim +Signed-off-by: Peter Jung --- .../admin-guide/kernel-parameters.txt | 12 + Makefile | 8 + arch/x86/Kconfig.cpu | 359 +- - arch/x86/Makefile | 91 +- + arch/x86/Makefile | 
87 +- arch/x86/include/asm/pci.h | 6 + arch/x86/include/asm/vermagic.h | 70 + arch/x86/pci/common.c | 7 +- - block/bfq-iosched.c | 6 + - block/elevator.c | 10 + drivers/Makefile | 13 +- drivers/ata/ahci.c | 23 +- drivers/cpufreq/Kconfig.x86 | 2 - - drivers/cpufreq/cpufreq.c | 27 +- drivers/cpufreq/intel_pstate.c | 2 + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 + @@ -5034,10 +5296,6 @@ Signed-off-by: Eric Naim .../amd/display/amdgpu_dm/amdgpu_dm_plane.c | 6 +- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 3 + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 14 +- - drivers/i2c/busses/Kconfig | 9 + - drivers/i2c/busses/Makefile | 1 + - drivers/i2c/busses/i2c-nct6775.c | 648 ++++ - drivers/i2c/busses/i2c-piix4.c | 4 +- drivers/input/evdev.c | 19 +- drivers/md/dm-crypt.c | 5 + drivers/media/v4l2-core/Kconfig | 5 + @@ -5048,7 +5306,6 @@ Signed-off-by: Eric Naim drivers/pci/controller/Makefile | 6 + drivers/pci/controller/intel-nvme-remap.c | 462 +++ drivers/pci/quirks.c | 101 + - include/linux/cpufreq.h | 6 - include/linux/pagemap.h | 2 +- include/linux/user_namespace.h | 4 + include/linux/wait.h | 2 + @@ -5063,27 +5320,23 @@ Signed-off-by: Eric Naim kernel/user_namespace.c | 7 + mm/Kconfig | 2 +- mm/compaction.c | 4 + - mm/huge_memory.c | 4 + mm/page-writeback.c | 8 + mm/page_alloc.c | 4 + mm/swap.c | 5 + mm/vmpressure.c | 4 + mm/vmscan.c | 8 + net/ipv4/inet_connection_sock.c | 2 +- - scripts/Makefile.package | 3 +- - scripts/package/PKGBUILD | 52 +- - 61 files changed, 5798 insertions(+), 113 deletions(-) - create mode 100644 drivers/i2c/busses/i2c-nct6775.c + 50 files changed, 5073 insertions(+), 64 deletions(-) create mode 100644 drivers/media/v4l2-core/v4l2loopback.c create mode 100644 drivers/media/v4l2-core/v4l2loopback.h create mode 100644 drivers/media/v4l2-core/v4l2loopback_formats.h create mode 100644 drivers/pci/controller/intel-nvme-remap.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index be010fec7654..900113802ffc 100644 +index 1518343bbe22..a1773a699743 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -2231,6 +2231,9 @@ +@@ -2248,6 +2248,9 @@ disable Do not enable intel_pstate as the default scaling driver for the supported processors @@ -5093,7 +5346,7 @@ index be010fec7654..900113802ffc 100644 active Use intel_pstate driver to bypass the scaling governors layer of cpufreq and provides it own -@@ -4412,6 +4415,15 @@ +@@ -4473,6 +4476,15 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. @@ -5110,10 +5363,10 @@ index be010fec7654..900113802ffc 100644 Safety option to keep boot IRQs enabled. This should never be necessary. 
diff --git a/Makefile b/Makefile -index 687ce7aee67a..7c3cbfb2f6b5 100644 +index 2c3f65c68bec..969f0cfe7fbd 100644 --- a/Makefile +++ b/Makefile -@@ -803,11 +803,19 @@ KBUILD_CFLAGS += -fno-delete-null-pointer-checks +@@ -801,11 +801,19 @@ KBUILD_CFLAGS += -fno-delete-null-pointer-checks ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 KBUILD_RUSTFLAGS += -Copt-level=2 @@ -5572,22 +5825,10 @@ index 2a7279d80460..f5849153b385 100644 # # P6_NOPs are a relatively minor optimization that require a family >= diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index 801fd85c3ef6..85d962aa68fe 100644 +index cd75e78a06c1..396d1db12bca 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile -@@ -70,9 +70,9 @@ export BITS - # - # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 - # --KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -mno-avx512f -fno-tree-vectorize - KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json --KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 -+KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-avx512f - - # - # CFLAGS for compiling floating point code inside the kernel. -@@ -177,15 +177,96 @@ else +@@ -181,15 +181,96 @@ else cflags-$(CONFIG_MK8) += -march=k8 cflags-$(CONFIG_MPSC) += -march=nocona cflags-$(CONFIG_MCORE2) += -march=core2 @@ -5821,59 +6062,11 @@ index ddb798603201..7c20387d8202 100644 return dev; } -#endif -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 1cc40a857fb8..c446fa6a6ad1 100644 ---- a/block/bfq-iosched.c -+++ b/block/bfq-iosched.c -@@ -7656,6 +7656,7 @@ MODULE_ALIAS("bfq-iosched"); - static int __init bfq_init(void) - { - int ret; -+ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.11"; - - #ifdef CONFIG_BFQ_GROUP_IOSCHED - ret = blkcg_policy_register(&blkcg_policy_bfq); -@@ -7687,6 +7688,11 @@ static int __init bfq_init(void) - if (ret) - goto slab_kill; - -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ - return 0; - - slab_kill: -diff --git a/block/elevator.c b/block/elevator.c -index 4122026b11f1..cd630e991eae 100644 ---- a/block/elevator.c -+++ b/block/elevator.c -@@ -567,9 +567,19 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) - - if (q->nr_hw_queues != 1 && - !blk_mq_is_shared_tags(q->tag_set->flags)) -+#if defined(CONFIG_CACHY) && defined(CONFIG_MQ_IOSCHED_KYBER) -+ return elevator_find_get(q, "kyber"); -+#elif defined(CONFIG_CACHY) -+ return elevator_find_get(q, "mq-deadline"); -+#else - return NULL; -+#endif - -+#if defined(CONFIG_CACHY) && defined(CONFIG_IOSCHED_BFQ) -+ return elevator_find_get(q, "bfq"); -+#else - return elevator_find_get(q, "mq-deadline"); -+#endif - } - - /* diff --git a/drivers/Makefile b/drivers/Makefile -index fe9ceb0d2288..b58955caf19b 100644 +index 45d1c3e630f7..4f5ab2429a7f 100644 --- a/drivers/Makefile +++ b/drivers/Makefile -@@ -61,14 +61,8 @@ obj-y += char/ +@@ -64,14 +64,8 @@ obj-y += char/ # iommu/ comes before gpu as gpu are using iommu controllers obj-y += iommu/ @@ -5888,7 +6081,7 @@ index fe9ceb0d2288..b58955caf19b 100644 obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ -@@ -80,6 +74,13 @@ obj-y += macintosh/ +@@ -83,6 +77,13 @@ obj-y += macintosh/ obj-y += scsi/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ @@ -5903,7 +6096,7 @@ index 
fe9ceb0d2288..b58955caf19b 100644 obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c -index a05c17249448..be4b54ff7e89 100644 +index 45f63b09828a..d8bcb8b7544f 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1618,7 +1618,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) @@ -5978,50 +6171,11 @@ index 97c2d4f15d76..5a3af44d785a 100644 help This driver adds a CPUFreq driver which utilizes a fine grain processor performance frequency control range instead of legacy -diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c -index 04fc786dd2c0..f98c9438760c 100644 ---- a/drivers/cpufreq/cpufreq.c -+++ b/drivers/cpufreq/cpufreq.c -@@ -575,30 +575,11 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy) - return policy->transition_delay_us; - - latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC; -- if (latency) { -- unsigned int max_delay_us = 2 * MSEC_PER_SEC; -+ if (latency) -+ /* Give a 50% breathing room between updates */ -+ return latency + (latency >> 1); - -- /* -- * If the platform already has high transition_latency, use it -- * as-is. -- */ -- if (latency > max_delay_us) -- return latency; -- -- /* -- * For platforms that can change the frequency very fast (< 2 -- * us), the above formula gives a decent transition delay. But -- * for platforms where transition_latency is in milliseconds, it -- * ends up giving unrealistic values. -- * -- * Cap the default transition delay to 2 ms, which seems to be -- * a reasonable amount of time after which we should reevaluate -- * the frequency. -- */ -- return min(latency * LATENCY_MULTIPLIER, max_delay_us); -- } -- -- return LATENCY_MULTIPLIER; -+ return USEC_PER_MSEC; - } - EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us); - diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c -index 949ead440da9..348a330678bd 100644 +index b0018f371ea3..23f51eb073e4 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c -@@ -3568,6 +3568,8 @@ static int __init intel_pstate_setup(char *str) +@@ -3800,6 +3800,8 @@ static int __init intel_pstate_setup(char *str) if (!strcmp(str, "disable")) no_load = 1; @@ -6031,10 +6185,10 @@ index 949ead440da9..348a330678bd 100644 default_driver = &intel_pstate; else if (!strcmp(str, "passive")) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h -index 137a88b8de45..233c17537492 100644 +index 9b1e0ede05a4..7617963901fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h -@@ -162,6 +162,7 @@ struct amdgpu_watchdog_timer { +@@ -164,6 +164,7 @@ struct amdgpu_watchdog_timer { */ extern int amdgpu_modeset; extern unsigned int amdgpu_vram_limit; @@ -6043,10 +6197,10 @@ index 137a88b8de45..233c17537492 100644 extern int amdgpu_gart_size; extern int amdgpu_gtt_size; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -index e2382566af44..9c3b7b027485 100644 +index 81d9877c8735..852e6f315576 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -@@ -135,6 +135,7 @@ enum AMDGPU_DEBUG_MASK { +@@ -136,6 +136,7 @@ enum AMDGPU_DEBUG_MASK { }; unsigned int amdgpu_vram_limit = UINT_MAX; @@ -6054,7 +6208,7 @@ index e2382566af44..9c3b7b027485 100644 int amdgpu_vis_vram_limit; int amdgpu_gart_size = -1; /* auto */ int amdgpu_gtt_size = -1; /* auto */ -@@ -248,6 +249,15 @@ struct 
amdgpu_watchdog_timer amdgpu_watchdog_timer = { +@@ -259,6 +260,15 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = { .period = 0x0, /* default to 0x0 (timeout disable) */ }; @@ -6086,10 +6240,10 @@ index df17e79c45c7..e454488c1a31 100644 + endmenu diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -index 4f19e9736a67..575fdcfb138c 100644 +index 13421a58210d..fc46b3e1d140 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -@@ -4445,7 +4445,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) +@@ -4473,7 +4473,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) return r; } @@ -6112,10 +6266,10 @@ index ebabfe3a512f..4d3ebcaacca1 100644 * * AMD driver supports pre-defined mathematical functions for transferring diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c -index 99014339aaa3..222f72b4c44f 100644 +index a2cf2c066a76..285f5a045ca5 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c -@@ -426,7 +426,7 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc) +@@ -474,7 +474,7 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc) } #endif @@ -6124,7 +6278,7 @@ index 99014339aaa3..222f72b4c44f 100644 /** * dm_crtc_additional_color_mgmt - enable additional color properties * @crtc: DRM CRTC -@@ -508,7 +508,7 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { +@@ -556,7 +556,7 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { #if defined(CONFIG_DEBUG_FS) .late_register = amdgpu_dm_crtc_late_register, #endif @@ -6133,7 +6287,7 @@ index 99014339aaa3..222f72b4c44f 100644 .atomic_set_property = amdgpu_dm_atomic_crtc_set_property, .atomic_get_property = amdgpu_dm_atomic_crtc_get_property, #endif -@@ -687,7 +687,7 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, +@@ -735,7 +735,7 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES); @@ -6143,10 +6297,10 @@ index 99014339aaa3..222f72b4c44f 100644 #endif return 0; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c -index a573a6639898..52e0e42e26a5 100644 +index 495e3cd70426..704a48209657 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c -@@ -1569,7 +1569,7 @@ static void amdgpu_dm_plane_drm_plane_destroy_state(struct drm_plane *plane, +@@ -1573,7 +1573,7 @@ static void amdgpu_dm_plane_drm_plane_destroy_state(struct drm_plane *plane, drm_atomic_helper_plane_destroy_state(plane, state); } @@ -6155,7 +6309,7 @@ index a573a6639898..52e0e42e26a5 100644 static void dm_atomic_plane_attach_color_mgmt_properties(struct amdgpu_display_manager *dm, struct drm_plane *plane) -@@ -1760,7 +1760,7 @@ static const struct drm_plane_funcs dm_plane_funcs = { +@@ -1764,7 +1764,7 @@ static const struct drm_plane_funcs dm_plane_funcs = { .atomic_duplicate_state = amdgpu_dm_plane_drm_plane_duplicate_state, .atomic_destroy_state = amdgpu_dm_plane_drm_plane_destroy_state, .format_mod_supported = amdgpu_dm_plane_format_mod_supported, @@ -6164,7 +6318,7 @@ index a573a6639898..52e0e42e26a5 100644 .atomic_set_property = dm_atomic_plane_set_property, 
.atomic_get_property = dm_atomic_plane_get_property, #endif -@@ -1853,7 +1853,7 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, +@@ -1857,7 +1857,7 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, drm_plane_helper_add(plane, &dm_plane_helper_funcs); @@ -6188,10 +6342,10 @@ index d5d6ab484e5a..dccba7bcdf97 100644 } diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -index 87672ca714de..21442469791c 100644 +index 80e60ea2d11e..51dea35848f6 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -@@ -2762,7 +2762,10 @@ int smu_get_power_limit(void *handle, +@@ -2775,7 +2775,10 @@ int smu_get_power_limit(void *handle, *limit = smu->max_power_limit; break; case SMU_PPT_LIMIT_MIN: @@ -6203,7 +6357,7 @@ index 87672ca714de..21442469791c 100644 break; default: return -EINVAL; -@@ -2786,7 +2789,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit) +@@ -2799,7 +2802,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit) if (smu->ppt_funcs->set_power_limit) return smu->ppt_funcs->set_power_limit(smu, limit_type, limit); @@ -6219,712 +6373,8 @@ index 87672ca714de..21442469791c 100644 dev_err(smu->adev->dev, "New power limit (%d) is out of range [%d,%d]\n", limit, smu->min_power_limit, smu->max_power_limit); -diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig -index a22f9125322a..44d863e0175e 100644 ---- a/drivers/i2c/busses/Kconfig -+++ b/drivers/i2c/busses/Kconfig -@@ -240,6 +240,15 @@ config I2C_CHT_WC - combined with a FUSB302 Type-C port-controller as such it is advised - to also select CONFIG_TYPEC_FUSB302=m. - -+config I2C_NCT6775 -+ tristate "Nuvoton NCT6775 and compatible SMBus controller" -+ help -+ If you say yes to this option, support will be included for the -+ Nuvoton NCT6775 and compatible SMBus controllers. -+ -+ This driver can also be built as a module. If so, the module -+ will be called i2c-nct6775. -+ - config I2C_NFORCE2 - tristate "Nvidia nForce2, nForce3 and nForce4" - depends on PCI && HAS_IOPORT -diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile -index 78d0561339e5..9ea3a294f9f0 100644 ---- a/drivers/i2c/busses/Makefile -+++ b/drivers/i2c/busses/Makefile -@@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o - obj-$(CONFIG_I2C_I801) += i2c-i801.o - obj-$(CONFIG_I2C_ISCH) += i2c-isch.o - obj-$(CONFIG_I2C_ISMT) += i2c-ismt.o -+obj-$(CONFIG_I2C_NCT6775) += i2c-nct6775.o - obj-$(CONFIG_I2C_NFORCE2) += i2c-nforce2.o - obj-$(CONFIG_I2C_NFORCE2_S4985) += i2c-nforce2-s4985.o - obj-$(CONFIG_I2C_NVIDIA_GPU) += i2c-nvidia-gpu.o -diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c -new file mode 100644 -index 000000000000..fdbd9a1c8d7a ---- /dev/null -+++ b/drivers/i2c/busses/i2c-nct6775.c -@@ -0,0 +1,648 @@ -+/* -+ * i2c-nct6775 - Driver for the SMBus master functionality of -+ * Nuvoton NCT677x Super-I/O chips -+ * -+ * Copyright (C) 2019 Adam Honse -+ * -+ * Derived from nct6775 hwmon driver -+ * Copyright (C) 2012 Guenter Roeck -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. 
-+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -+ * -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define DRVNAME "i2c-nct6775" -+ -+/* Nuvoton SMBus address offsets */ -+#define SMBHSTDAT (0 + nuvoton_nct6793d_smba) -+#define SMBBLKSZ (1 + nuvoton_nct6793d_smba) -+#define SMBHSTCMD (2 + nuvoton_nct6793d_smba) -+#define SMBHSTIDX (3 + nuvoton_nct6793d_smba) //Index field is the Command field on other controllers -+#define SMBHSTCTL (4 + nuvoton_nct6793d_smba) -+#define SMBHSTADD (5 + nuvoton_nct6793d_smba) -+#define SMBHSTERR (9 + nuvoton_nct6793d_smba) -+#define SMBHSTSTS (0xE + nuvoton_nct6793d_smba) -+ -+/* Command register */ -+#define NCT6793D_READ_BYTE 0 -+#define NCT6793D_READ_WORD 1 -+#define NCT6793D_READ_BLOCK 2 -+#define NCT6793D_BLOCK_WRITE_READ_PROC_CALL 3 -+#define NCT6793D_PROC_CALL 4 -+#define NCT6793D_WRITE_BYTE 8 -+#define NCT6793D_WRITE_WORD 9 -+#define NCT6793D_WRITE_BLOCK 10 -+ -+/* Control register */ -+#define NCT6793D_MANUAL_START 128 -+#define NCT6793D_SOFT_RESET 64 -+ -+/* Error register */ -+#define NCT6793D_NO_ACK 32 -+ -+/* Status register */ -+#define NCT6793D_FIFO_EMPTY 1 -+#define NCT6793D_FIFO_FULL 2 -+#define NCT6793D_MANUAL_ACTIVE 4 -+ -+#define NCT6775_LD_SMBUS 0x0B -+ -+/* Other settings */ -+#define MAX_RETRIES 400 -+ -+enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793, -+ nct6795, nct6796, nct6798 }; -+ -+struct nct6775_sio_data { -+ int sioreg; -+ enum kinds kind; -+}; -+ -+/* used to set data->name = nct6775_device_names[data->sio_kind] */ -+static const char * const nct6775_device_names[] = { -+ "nct6106", -+ "nct6775", -+ "nct6776", -+ "nct6779", -+ "nct6791", -+ "nct6792", -+ "nct6793", -+ "nct6795", -+ "nct6796", -+ "nct6798", -+}; -+ -+static const char * const nct6775_sio_names[] __initconst = { -+ "NCT6106D", -+ "NCT6775F", -+ "NCT6776D/F", -+ "NCT6779D", -+ "NCT6791D", -+ "NCT6792D", -+ "NCT6793D", -+ "NCT6795D", -+ "NCT6796D", -+ "NCT6798D", -+}; -+ -+#define SIO_REG_LDSEL 0x07 /* Logical device select */ -+#define SIO_REG_DEVID 0x20 /* Device ID (2 bytes) */ -+#define SIO_REG_SMBA 0x62 /* SMBus base address register */ -+ -+#define SIO_NCT6106_ID 0xc450 -+#define SIO_NCT6775_ID 0xb470 -+#define SIO_NCT6776_ID 0xc330 -+#define SIO_NCT6779_ID 0xc560 -+#define SIO_NCT6791_ID 0xc800 -+#define SIO_NCT6792_ID 0xc910 -+#define SIO_NCT6793_ID 0xd120 -+#define SIO_NCT6795_ID 0xd350 -+#define SIO_NCT6796_ID 0xd420 -+#define SIO_NCT6798_ID 0xd428 -+#define SIO_ID_MASK 0xFFF0 -+ -+static inline void -+superio_outb(int ioreg, int reg, int val) -+{ -+ outb(reg, ioreg); -+ outb(val, ioreg + 1); -+} -+ -+static inline int -+superio_inb(int ioreg, int reg) -+{ -+ outb(reg, ioreg); -+ return inb(ioreg + 1); -+} -+ -+static inline void -+superio_select(int ioreg, int ld) -+{ -+ outb(SIO_REG_LDSEL, ioreg); -+ outb(ld, ioreg + 1); -+} -+ -+static inline int -+superio_enter(int ioreg) -+{ -+ /* -+ * Try to reserve and for exclusive access. 
-+ */ -+ if (!request_muxed_region(ioreg, 2, DRVNAME)) -+ return -EBUSY; -+ -+ outb(0x87, ioreg); -+ outb(0x87, ioreg); -+ -+ return 0; -+} -+ -+static inline void -+superio_exit(int ioreg) -+{ -+ outb(0xaa, ioreg); -+ outb(0x02, ioreg); -+ outb(0x02, ioreg + 1); -+ release_region(ioreg, 2); -+} -+ -+/* -+ * ISA constants -+ */ -+ -+#define IOREGION_ALIGNMENT (~7) -+#define IOREGION_LENGTH 2 -+#define ADDR_REG_OFFSET 0 -+#define DATA_REG_OFFSET 1 -+ -+#define NCT6775_REG_BANK 0x4E -+#define NCT6775_REG_CONFIG 0x40 -+ -+static struct i2c_adapter *nct6775_adapter; -+ -+struct i2c_nct6775_adapdata { -+ unsigned short smba; -+}; -+ -+/* Return negative errno on error. */ -+static s32 nct6775_access(struct i2c_adapter * adap, u16 addr, -+ unsigned short flags, char read_write, -+ u8 command, int size, union i2c_smbus_data * data) -+{ -+ struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); -+ unsigned short nuvoton_nct6793d_smba = adapdata->smba; -+ int i, len, cnt; -+ union i2c_smbus_data tmp_data; -+ int timeout = 0; -+ -+ tmp_data.word = 0; -+ cnt = 0; -+ len = 0; -+ -+ outb_p(NCT6793D_SOFT_RESET, SMBHSTCTL); -+ -+ switch (size) { -+ case I2C_SMBUS_QUICK: -+ outb_p((addr << 1) | read_write, -+ SMBHSTADD); -+ break; -+ case I2C_SMBUS_BYTE_DATA: -+ tmp_data.byte = data->byte; -+ fallthrough; -+ case I2C_SMBUS_BYTE: -+ outb_p((addr << 1) | read_write, -+ SMBHSTADD); -+ outb_p(command, SMBHSTIDX); -+ if (read_write == I2C_SMBUS_WRITE) { -+ outb_p(tmp_data.byte, SMBHSTDAT); -+ outb_p(NCT6793D_WRITE_BYTE, SMBHSTCMD); -+ } -+ else { -+ outb_p(NCT6793D_READ_BYTE, SMBHSTCMD); -+ } -+ break; -+ case I2C_SMBUS_WORD_DATA: -+ outb_p((addr << 1) | read_write, -+ SMBHSTADD); -+ outb_p(command, SMBHSTIDX); -+ if (read_write == I2C_SMBUS_WRITE) { -+ outb_p(data->word & 0xff, SMBHSTDAT); -+ outb_p((data->word & 0xff00) >> 8, SMBHSTDAT); -+ outb_p(NCT6793D_WRITE_WORD, SMBHSTCMD); -+ } -+ else { -+ outb_p(NCT6793D_READ_WORD, SMBHSTCMD); -+ } -+ break; -+ case I2C_SMBUS_BLOCK_DATA: -+ outb_p((addr << 1) | read_write, -+ SMBHSTADD); -+ outb_p(command, SMBHSTIDX); -+ if (read_write == I2C_SMBUS_WRITE) { -+ len = data->block[0]; -+ if (len == 0 || len > I2C_SMBUS_BLOCK_MAX) -+ return -EINVAL; -+ outb_p(len, SMBBLKSZ); -+ -+ cnt = 1; -+ if (len >= 4) { -+ for (i = cnt; i <= 4; i++) { -+ outb_p(data->block[i], SMBHSTDAT); -+ } -+ -+ len -= 4; -+ cnt += 4; -+ } -+ else { -+ for (i = cnt; i <= len; i++ ) { -+ outb_p(data->block[i], SMBHSTDAT); -+ } -+ -+ len = 0; -+ } -+ -+ outb_p(NCT6793D_WRITE_BLOCK, SMBHSTCMD); -+ } -+ else { -+ return -ENOTSUPP; -+ } -+ break; -+ default: -+ dev_warn(&adap->dev, "Unsupported transaction %d\n", size); -+ return -EOPNOTSUPP; -+ } -+ -+ outb_p(NCT6793D_MANUAL_START, SMBHSTCTL); -+ -+ while ((size == I2C_SMBUS_BLOCK_DATA) && (len > 0)) { -+ if (read_write == I2C_SMBUS_WRITE) { -+ timeout = 0; -+ while ((inb_p(SMBHSTSTS) & NCT6793D_FIFO_EMPTY) == 0) -+ { -+ if(timeout > MAX_RETRIES) -+ { -+ return -ETIMEDOUT; -+ } -+ usleep_range(250, 500); -+ timeout++; -+ } -+ -+ //Load more bytes into FIFO -+ if (len >= 4) { -+ for (i = cnt; i <= (cnt + 4); i++) { -+ outb_p(data->block[i], SMBHSTDAT); -+ } -+ -+ len -= 4; -+ cnt += 4; -+ } -+ else { -+ for (i = cnt; i <= (cnt + len); i++) { -+ outb_p(data->block[i], SMBHSTDAT); -+ } -+ -+ len = 0; -+ } -+ } -+ else { -+ return -ENOTSUPP; -+ } -+ -+ } -+ -+ //wait for manual mode to complete -+ timeout = 0; -+ while ((inb_p(SMBHSTSTS) & NCT6793D_MANUAL_ACTIVE) != 0) -+ { -+ if(timeout > MAX_RETRIES) -+ { -+ return -ETIMEDOUT; -+ } -+ 
usleep_range(250, 500); -+ timeout++; -+ } -+ -+ if ((inb_p(SMBHSTERR) & NCT6793D_NO_ACK) != 0) { -+ return -ENXIO; -+ } -+ else if ((read_write == I2C_SMBUS_WRITE) || (size == I2C_SMBUS_QUICK)) { -+ return 0; -+ } -+ -+ switch (size) { -+ case I2C_SMBUS_QUICK: -+ case I2C_SMBUS_BYTE_DATA: -+ data->byte = inb_p(SMBHSTDAT); -+ break; -+ case I2C_SMBUS_WORD_DATA: -+ data->word = inb_p(SMBHSTDAT) + (inb_p(SMBHSTDAT) << 8); -+ break; -+ } -+ return 0; -+} -+ -+static u32 nct6775_func(struct i2c_adapter *adapter) -+{ -+ return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE | -+ I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | -+ I2C_FUNC_SMBUS_BLOCK_DATA; -+} -+ -+static const struct i2c_algorithm smbus_algorithm = { -+ .smbus_xfer = nct6775_access, -+ .functionality = nct6775_func, -+}; -+ -+static int nct6775_add_adapter(unsigned short smba, const char *name, struct i2c_adapter **padap) -+{ -+ struct i2c_adapter *adap; -+ struct i2c_nct6775_adapdata *adapdata; -+ int retval; -+ -+ adap = kzalloc(sizeof(*adap), GFP_KERNEL); -+ if (adap == NULL) { -+ return -ENOMEM; -+ } -+ -+ adap->owner = THIS_MODULE; -+ adap->class = I2C_CLASS_HWMON; -+ adap->algo = &smbus_algorithm; -+ -+ adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL); -+ if (adapdata == NULL) { -+ kfree(adap); -+ return -ENOMEM; -+ } -+ -+ adapdata->smba = smba; -+ -+ snprintf(adap->name, sizeof(adap->name), -+ "SMBus NCT67xx adapter%s at %04x", name, smba); -+ -+ i2c_set_adapdata(adap, adapdata); -+ -+ retval = i2c_add_adapter(adap); -+ if (retval) { -+ kfree(adapdata); -+ kfree(adap); -+ return retval; -+ } -+ -+ *padap = adap; -+ return 0; -+} -+ -+static void nct6775_remove_adapter(struct i2c_adapter *adap) -+{ -+ struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); -+ -+ if (adapdata->smba) { -+ i2c_del_adapter(adap); -+ kfree(adapdata); -+ kfree(adap); -+ } -+} -+ -+//static SIMPLE_DEV_PM_OPS(nct6775_dev_pm_ops, nct6775_suspend, nct6775_resume); -+ -+/* -+ * when Super-I/O functions move to a separate file, the Super-I/O -+ * bus will manage the lifetime of the device and this module will only keep -+ * track of the nct6775 driver. 
But since we use platform_device_alloc(), we -+ * must keep track of the device -+ */ -+static struct platform_device *pdev[2]; -+ -+static int nct6775_probe(struct platform_device *pdev) -+{ -+ struct device *dev = &pdev->dev; -+ struct nct6775_sio_data *sio_data = dev_get_platdata(dev); -+ struct resource *res; -+ -+ res = platform_get_resource(pdev, IORESOURCE_IO, 0); -+ if (!devm_request_region(&pdev->dev, res->start, IOREGION_LENGTH, -+ DRVNAME)) -+ return -EBUSY; -+ -+ switch (sio_data->kind) { -+ case nct6791: -+ case nct6792: -+ case nct6793: -+ case nct6795: -+ case nct6796: -+ case nct6798: -+ nct6775_add_adapter(res->start, "", &nct6775_adapter); -+ break; -+ default: -+ return -ENODEV; -+ } -+ -+ return 0; -+} -+/* -+static void nct6791_enable_io_mapping(int sioaddr) -+{ -+ int val; -+ -+ val = superio_inb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE); -+ if (val & 0x10) { -+ pr_info("Enabling hardware monitor logical device mappings.\n"); -+ superio_outb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE, -+ val & ~0x10); -+ } -+}*/ -+ -+static struct platform_driver i2c_nct6775_driver = { -+ .driver = { -+ .name = DRVNAME, -+// .pm = &nct6775_dev_pm_ops, -+ }, -+ .probe = nct6775_probe, -+}; -+ -+static void __exit i2c_nct6775_exit(void) -+{ -+ int i; -+ -+ if(nct6775_adapter) -+ nct6775_remove_adapter(nct6775_adapter); -+ -+ for (i = 0; i < ARRAY_SIZE(pdev); i++) { -+ if (pdev[i]) -+ platform_device_unregister(pdev[i]); -+ } -+ platform_driver_unregister(&i2c_nct6775_driver); -+} -+ -+/* nct6775_find() looks for a '627 in the Super-I/O config space */ -+static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data) -+{ -+ u16 val; -+ int err; -+ int addr; -+ -+ err = superio_enter(sioaddr); -+ if (err) -+ return err; -+ -+ val = (superio_inb(sioaddr, SIO_REG_DEVID) << 8) | -+ superio_inb(sioaddr, SIO_REG_DEVID + 1); -+ -+ switch (val & SIO_ID_MASK) { -+ case SIO_NCT6106_ID: -+ sio_data->kind = nct6106; -+ break; -+ case SIO_NCT6775_ID: -+ sio_data->kind = nct6775; -+ break; -+ case SIO_NCT6776_ID: -+ sio_data->kind = nct6776; -+ break; -+ case SIO_NCT6779_ID: -+ sio_data->kind = nct6779; -+ break; -+ case SIO_NCT6791_ID: -+ sio_data->kind = nct6791; -+ break; -+ case SIO_NCT6792_ID: -+ sio_data->kind = nct6792; -+ break; -+ case SIO_NCT6793_ID: -+ sio_data->kind = nct6793; -+ break; -+ case SIO_NCT6795_ID: -+ sio_data->kind = nct6795; -+ break; -+ case SIO_NCT6796_ID: -+ sio_data->kind = nct6796; -+ break; -+ case SIO_NCT6798_ID: -+ sio_data->kind = nct6798; -+ break; -+ default: -+ if (val != 0xffff) -+ pr_debug("unsupported chip ID: 0x%04x\n", val); -+ superio_exit(sioaddr); -+ return -ENODEV; -+ } -+ -+ /* We have a known chip, find the SMBus I/O address */ -+ superio_select(sioaddr, NCT6775_LD_SMBUS); -+ val = (superio_inb(sioaddr, SIO_REG_SMBA) << 8) -+ | superio_inb(sioaddr, SIO_REG_SMBA + 1); -+ addr = val & IOREGION_ALIGNMENT; -+ if (addr == 0) { -+ pr_err("Refusing to enable a Super-I/O device with a base I/O port 0\n"); -+ superio_exit(sioaddr); -+ return -ENODEV; -+ } -+ -+ //if (sio_data->kind == nct6791 || sio_data->kind == nct6792 || -+ // sio_data->kind == nct6793 || sio_data->kind == nct6795 || -+ // sio_data->kind == nct6796) -+ // nct6791_enable_io_mapping(sioaddr); -+ -+ superio_exit(sioaddr); -+ pr_info("Found %s or compatible chip at %#x:%#x\n", -+ nct6775_sio_names[sio_data->kind], sioaddr, addr); -+ sio_data->sioreg = sioaddr; -+ -+ return addr; -+} -+ -+static int __init i2c_nct6775_init(void) -+{ -+ int i, err; -+ bool found = false; -+ int 
address; -+ struct resource res; -+ struct nct6775_sio_data sio_data; -+ int sioaddr[2] = { 0x2e, 0x4e }; -+ -+ err = platform_driver_register(&i2c_nct6775_driver); -+ if (err) -+ return err; -+ -+ /* -+ * initialize sio_data->kind and sio_data->sioreg. -+ * -+ * when Super-I/O functions move to a separate file, the Super-I/O -+ * driver will probe 0x2e and 0x4e and auto-detect the presence of a -+ * nct6775 hardware monitor, and call probe() -+ */ -+ for (i = 0; i < ARRAY_SIZE(pdev); i++) { -+ address = nct6775_find(sioaddr[i], &sio_data); -+ if (address <= 0) -+ continue; -+ -+ found = true; -+ -+ pdev[i] = platform_device_alloc(DRVNAME, address); -+ if (!pdev[i]) { -+ err = -ENOMEM; -+ goto exit_device_unregister; -+ } -+ -+ err = platform_device_add_data(pdev[i], &sio_data, -+ sizeof(struct nct6775_sio_data)); -+ if (err) -+ goto exit_device_put; -+ -+ memset(&res, 0, sizeof(res)); -+ res.name = DRVNAME; -+ res.start = address; -+ res.end = address + IOREGION_LENGTH - 1; -+ res.flags = IORESOURCE_IO; -+ -+ err = acpi_check_resource_conflict(&res); -+ if (err) { -+ platform_device_put(pdev[i]); -+ pdev[i] = NULL; -+ continue; -+ } -+ -+ err = platform_device_add_resources(pdev[i], &res, 1); -+ if (err) -+ goto exit_device_put; -+ -+ /* platform_device_add calls probe() */ -+ err = platform_device_add(pdev[i]); -+ if (err) -+ goto exit_device_put; -+ } -+ if (!found) { -+ err = -ENODEV; -+ goto exit_unregister; -+ } -+ -+ return 0; -+ -+exit_device_put: -+ platform_device_put(pdev[i]); -+exit_device_unregister: -+ while (--i >= 0) { -+ if (pdev[i]) -+ platform_device_unregister(pdev[i]); -+ } -+exit_unregister: -+ platform_driver_unregister(&i2c_nct6775_driver); -+ return err; -+} -+ -+MODULE_AUTHOR("Adam Honse "); -+MODULE_DESCRIPTION("SMBus driver for NCT6775F and compatible chips"); -+MODULE_LICENSE("GPL"); -+ -+module_init(i2c_nct6775_init); -+module_exit(i2c_nct6775_exit); -diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c -index 4e32d57ae0bf..a2deb7379904 100644 ---- a/drivers/i2c/busses/i2c-piix4.c -+++ b/drivers/i2c/busses/i2c-piix4.c -@@ -569,11 +569,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter) - if (srvrworks_csb5_delay) /* Extra delay for SERVERWORKS_CSB5 */ - usleep_range(2000, 2100); - else -- usleep_range(250, 500); -+ usleep_range(25, 50); - - while ((++timeout < MAX_TIMEOUT) && - ((temp = inb_p(SMBHSTSTS)) & 0x01)) -- usleep_range(250, 500); -+ usleep_range(25, 50); - - /* If the SMBus is still busy, we give up */ - if (timeout == MAX_TIMEOUT) { diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c -index a8ce3d140722..49729cf8c12f 100644 +index b5cbb57ee5f6..a0f7fa1518c6 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -46,6 +46,7 @@ struct evdev_client { @@ -6988,10 +6438,10 @@ index a8ce3d140722..49729cf8c12f 100644 } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c -index 348b4b26c272..708405b16687 100644 +index 1ae2c71bb383..784829ada178 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c -@@ -3310,6 +3310,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +@@ -3315,6 +3315,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } @@ -11378,28 +10828,11 @@ index dccb60c1d9cc..d9a8af789de8 100644 { 0 } }; -diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h -index aabec598f79a..7fe0981a7e46 100644 ---- a/include/linux/cpufreq.h -+++ b/include/linux/cpufreq.h -@@ -577,12 +577,6 @@ static inline 
unsigned long cpufreq_scale(unsigned long old, u_int div, - #define CPUFREQ_POLICY_POWERSAVE (1) - #define CPUFREQ_POLICY_PERFORMANCE (2) - --/* -- * The polling frequency depends on the capability of the processor. Default -- * polling frequency is 1000 times the transition latency of the processor. -- */ --#define LATENCY_MULTIPLIER (1000) -- - struct cpufreq_governor { - char name[CPUFREQ_NAME_LEN]; - int (*init)(struct cpufreq_policy *policy); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index d9c7edb6422b..b57c72793580 100644 +index 68a5f1ff3301..291873a34079 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h -@@ -1264,7 +1264,7 @@ struct readahead_control { +@@ -1362,7 +1362,7 @@ struct readahead_control { ._index = i, \ } @@ -11409,10 +10842,10 @@ index d9c7edb6422b..b57c72793580 100644 void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h -index 6030a8235617..60b7fe5fa74a 100644 +index 3625096d5f85..a07f895b8eba 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h -@@ -156,6 +156,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, +@@ -158,6 +158,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS @@ -11421,7 +10854,7 @@ index 6030a8235617..60b7fe5fa74a 100644 static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) -@@ -189,6 +191,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns); +@@ -191,6 +193,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else @@ -11451,10 +10884,10 @@ index 8aa3372f21a0..924778a426ce 100644 void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); diff --git a/init/Kconfig b/init/Kconfig -index 5783a0b87517..08a0d51afaae 100644 +index c521e1421ad4..38dbd16da6a9 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -134,6 +134,10 @@ config THREAD_INFO_IN_TASK +@@ -145,6 +145,10 @@ config THREAD_INFO_IN_TASK menu "General setup" @@ -11465,7 +10898,7 @@ index 5783a0b87517..08a0d51afaae 100644 config BROKEN bool -@@ -1265,6 +1269,22 @@ config USER_NS +@@ -1300,6 +1304,22 @@ config USER_NS If unsure, say N. @@ -11488,7 +10921,7 @@ index 5783a0b87517..08a0d51afaae 100644 config PID_NS bool "PID Namespaces" default y -@@ -1407,6 +1427,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE +@@ -1442,6 +1462,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE with the "-O2" compiler flag for best performance and most helpful compile-time warnings. 
@@ -11544,12 +10977,12 @@ index 38ef6d06888e..0f78364efd4f 100644 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c -index 6b97fb2ac4af..003de4829c15 100644 +index 22f43721d031..8287afdd01d2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c -@@ -105,6 +105,10 @@ - #include +@@ -107,6 +107,10 @@ #include + #include +#ifdef CONFIG_USER_NS +#include @@ -11558,7 +10991,7 @@ index 6b97fb2ac4af..003de4829c15 100644 #include #include #include -@@ -2135,6 +2139,10 @@ __latent_entropy struct task_struct *copy_process( +@@ -2138,6 +2142,10 @@ __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -11569,7 +11002,7 @@ index 6b97fb2ac4af..003de4829c15 100644 /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. -@@ -3283,6 +3291,12 @@ int ksys_unshare(unsigned long unshare_flags) +@@ -3291,6 +3299,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -11583,10 +11016,10 @@ index 6b97fb2ac4af..003de4829c15 100644 if (err) goto bad_unshare_out; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c -index 33cac79e3994..3277df47ab3c 100644 +index 2bbb6eca5144..125cdf85741c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c -@@ -749,6 +749,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) +@@ -747,6 +747,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) struct task_struct *new, *owner; unsigned long flags, new_flags; enum owner_state state; @@ -11594,7 +11027,7 @@ index 33cac79e3994..3277df47ab3c 100644 lockdep_assert_preemption_disabled(); -@@ -785,7 +786,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) +@@ -783,7 +784,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) break; } @@ -11605,7 +11038,7 @@ index 33cac79e3994..3277df47ab3c 100644 return state; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 1d2cbdb162a6..91b242e47db7 100644 +index 2d16c8545c71..54e7c4c3e2c5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; @@ -11642,10 +11075,10 @@ index 1d2cbdb162a6..91b242e47db7 100644 #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 4c36cc680361..432b43aa091c 100644 +index 6c54a57275cc..f610df2e0811 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -2591,7 +2591,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); +@@ -2815,7 +2815,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); @@ -11727,7 +11160,7 @@ index 79e6cb1d5c48..676e89dc38c3 100644 { .procname = "tainted", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 0b0b95418b16..c4b835b91fc0 100644 +index aa0b2e47f2f2..d74d857b1696 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,6 +22,13 @@ @@ -11745,10 +11178,10 @@ index 0b0b95418b16..c4b835b91fc0 100644 static DEFINE_MUTEX(userns_state_mutex); diff --git a/mm/Kconfig b/mm/Kconfig -index 03395624bc70..676ff8d1266b 100644 +index 33fa51d608dc..6bfea371341e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig -@@ -649,7 +649,7 @@ config COMPACTION +@@ -648,7 +648,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION @@ -11758,10 +11191,10 @@ index 03395624bc70..676ff8d1266b 100644 # diff --git a/mm/compaction.c b/mm/compaction.c -index eb95e9b435d0..ae03cdc3e76e 100644 +index a2b16b08cbbf..48d611e58ad3 100644 --- a/mm/compaction.c +++ b/mm/compaction.c -@@ -1950,7 +1950,11 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE +@@ -1920,7 +1920,11 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE * aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. */ @@ -11773,24 +11206,8 @@ index eb95e9b435d0..ae03cdc3e76e 100644 static int sysctl_extfrag_threshold = 500; static int __read_mostly sysctl_compact_memory; -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 99b146d16a18..4d2839fcf688 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -64,7 +64,11 @@ unsigned long transparent_hugepage_flags __read_mostly = - #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE - (1<> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ -@@ -1137,4 +1141,5 @@ void __init swap_setup(void) +@@ -1105,4 +1109,5 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ @@ -11871,10 +11288,10 @@ index bd5183dfd879..3a410f53a07c 100644 /* diff --git a/mm/vmscan.c b/mm/vmscan.c -index 128f307da6ee..35b67785907b 100644 +index ddaaff67642e..6a8a68b2f7f2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -199,7 +199,11 @@ struct scan_control { +@@ -200,7 +200,11 @@ struct scan_control { /* * From 0 .. MAX_SWAPPINESS. Higher means more swappy. 
*/ @@ -11886,7 +11303,7 @@ index 128f307da6ee..35b67785907b 100644 #ifdef CONFIG_MEMCG -@@ -3968,7 +3972,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc +@@ -3992,7 +3996,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ @@ -11899,7 +11316,7 @@ index 128f307da6ee..35b67785907b 100644 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c -index 64d07b842e73..a0ac138e7bf8 100644 +index 2b698f8419fe..fd039c41d1c8 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -634,7 +634,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) @@ -11911,173 +11328,1640 @@ index 64d07b842e73..a0ac138e7bf8 100644 TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) -diff --git a/scripts/Makefile.package b/scripts/Makefile.package -index 4a80584ec771..11d53f240a2b 100644 ---- a/scripts/Makefile.package -+++ b/scripts/Makefile.package -@@ -147,8 +147,7 @@ snap-pkg: - PHONY += pacman-pkg - pacman-pkg: - @ln -srf $(srctree)/scripts/package/PKGBUILD $(objtree)/PKGBUILD -- +objtree="$(realpath $(objtree))" \ -- BUILDDIR="$(realpath $(objtree))/pacman" \ -+ +BUILDDIR="$(realpath $(objtree))/pacman" \ - CARCH="$(UTS_MACHINE)" \ - KBUILD_MAKEFLAGS="$(MAKEFLAGS)" \ - KBUILD_REVISION="$(shell $(srctree)/scripts/build-version)" \ -diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD -index 663ce300dd06..f83493838cf9 100644 ---- a/scripts/package/PKGBUILD -+++ b/scripts/package/PKGBUILD -@@ -3,10 +3,13 @@ - # Contributor: Jan Alexander Steffens (heftig) - - pkgbase=${PACMAN_PKGBASE:-linux-upstream} --pkgname=("${pkgbase}" "${pkgbase}-api-headers") --if grep -q CONFIG_MODULES=y include/config/auto.conf; then -- pkgname+=("${pkgbase}-headers") --fi -+pkgname=("${pkgbase}") -+ -+_extrapackages=${PACMAN_EXTRAPACKAGES-headers api-headers debug} -+for pkg in $_extrapackages; do -+ pkgname+=("${pkgbase}-${pkg}") -+done -+ - pkgver="${KERNELRELEASE//-/_}" - # The PKGBUILD is evaluated multiple times. - # Running scripts/build-version from here would introduce inconsistencies. -@@ -33,11 +36,17 @@ makedepends=( - ) - options=(!debug !strip !buildflags !makeflags) - --build() { -+_prologue() { - # MAKEFLAGS from makepkg.conf override the ones inherited from kbuild. - # Bypass this override with a custom variable. - export MAKEFLAGS="${KBUILD_MAKEFLAGS}" -- cd "${objtree}" -+ -+ # Kbuild works in the output directory, where this PKGBUILD is located. -+ cd "$(dirname "${BASH_SOURCE[0]}")" -+} -+ -+build() { -+ _prologue - - ${MAKE} KERNELRELEASE="${KERNELRELEASE}" KBUILD_BUILD_VERSION="${pkgrel}" - } -@@ -45,10 +54,10 @@ build() { - _package() { - pkgdesc="The ${pkgdesc} kernel and modules" - -- export MAKEFLAGS="${KBUILD_MAKEFLAGS}" -- cd "${objtree}" - local modulesdir="${pkgdir}/usr/${MODLIB}" - -+ _prologue -+ - echo "Installing boot image..." - # systemd expects to find the kernel here to allow hibernation - # https://github.com/systemd/systemd/commit/edda44605f06a41fb86b7ab8128dcf99161d2344 -@@ -73,14 +82,17 @@ _package() { - _package-headers() { - pkgdesc="Headers and scripts for building modules for the ${pkgdesc} kernel" - -- export MAKEFLAGS="${KBUILD_MAKEFLAGS}" -- cd "${objtree}" - local builddir="${pkgdir}/usr/${MODLIB}/build" - -- echo "Installing build files..." 
-- "${srctree}/scripts/package/install-extmod-build" "${builddir}" -+ _prologue -+ -+ if grep -q CONFIG_MODULES=y include/config/auto.conf; then -+ echo "Installing build files..." -+ "${srctree}/scripts/package/install-extmod-build" "${builddir}" -+ fi - - echo "Installing System.map and config..." -+ mkdir -p "${builddir}" - cp System.map "${builddir}/System.map" - cp .config "${builddir}/.config" - -@@ -94,12 +106,24 @@ _package-api-headers() { - provides=(linux-api-headers) - conflicts=(linux-api-headers) - -- export MAKEFLAGS="${KBUILD_MAKEFLAGS}" -- cd "${objtree}" -+ _prologue - - ${MAKE} headers_install INSTALL_HDR_PATH="${pkgdir}/usr" - } - -+_package-debug(){ -+ pkgdesc="Non-stripped vmlinux file for the ${pkgdesc} kernel" -+ -+ local debugdir="${pkgdir}/usr/src/debug/${pkgbase}" -+ local builddir="${pkgdir}/usr/${MODLIB}/build" -+ -+ _prologue -+ -+ install -Dt "${debugdir}" -m644 vmlinux -+ mkdir -p "${builddir}" -+ ln -sr "${debugdir}/vmlinux" "${builddir}/vmlinux" -+} -+ - for _p in "${pkgname[@]}"; do - eval "package_$_p() { - $(declare -f "_package${_p#$pkgbase}") -- -2.47.0.rc0 +2.47.0 -From 6c98e17d041435fde0a3c49fce29a562935c8cb6 Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:52:25 +0800 -Subject: [PATCH 06/13] fixes +From 26c1af5f52faff4b98337408c9e8fea43d530f54 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 14:47:50 +0100 +Subject: [PATCH 06/13] crypto -Signed-off-by: Eric Naim +Signed-off-by: Peter Jung --- - arch/Kconfig | 4 +- - drivers/bluetooth/btusb.c | 4 ++ - drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ++ - drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 +++- - drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 3 +- - drivers/gpu/drm/amd/pm/swsmu/inc/smu_v11_0.h | 2 +- - drivers/gpu/drm/amd/pm/swsmu/inc/smu_v12_0.h | 2 +- - drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h | 2 +- - drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h | 2 +- - .../gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 2 +- - .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 2 +- - .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 25 +++++++--- - .../gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c | 19 ++++---- - .../gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c | 14 +++--- - .../gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c | 2 +- - .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 15 +++--- - .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 34 +++++++++---- - .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 3 +- - .../drm/amd/pm/swsmu/smu13/smu_v13_0_5_ppt.c | 22 +++++---- - .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 15 +++--- - .../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 3 +- - .../drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c | 36 ++++++++------ - .../gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c | 33 +++++++++---- - .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 3 +- - drivers/gpu/drm/drm_edid.c | 47 ++++++++++++++++-- - drivers/net/wireless/realtek/rtw89/pci.c | 48 ++++++++++++++++--- - drivers/platform/x86/dell/dell-wmi-base.c | 9 ++++ - mm/mmap.c | 4 -- - mm/shrinker.c | 4 +- - net/netfilter/xt_NFLOG.c | 2 +- - net/netfilter/xt_TRACE.c | 1 + - net/netfilter/xt_mark.c | 2 +- - 32 files changed, 269 insertions(+), 110 deletions(-) + arch/x86/crypto/Kconfig | 4 +- + arch/x86/crypto/aegis128-aesni-asm.S | 532 ++++++++-------------- + arch/x86/crypto/aegis128-aesni-glue.c | 145 +++--- + arch/x86/crypto/crc32c-intel_glue.c | 2 +- + arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 354 +++++--------- + 5 files changed, 387 insertions(+), 650 deletions(-) + +diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig 
+index 7b1bebed879d..3d2e38ba5240 100644 +--- a/arch/x86/crypto/Kconfig ++++ b/arch/x86/crypto/Kconfig +@@ -363,7 +363,7 @@ config CRYPTO_CHACHA20_X86_64 + - AVX-512VL (Advanced Vector Extensions-512VL) + + config CRYPTO_AEGIS128_AESNI_SSE2 +- tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)" ++ tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)" + depends on X86 && 64BIT + select CRYPTO_AEAD + select CRYPTO_SIMD +@@ -372,7 +372,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2 + + Architecture: x86_64 using: + - AES-NI (AES New Instructions) +- - SSE2 (Streaming SIMD Extensions 2) ++ - SSE4.1 (Streaming SIMD Extensions 4.1) + + config CRYPTO_NHPOLY1305_SSE2 + tristate "Hash functions: NHPoly1305 (SSE2)" +diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S +index ad7f4c891625..7294dc0ee7ba 100644 +--- a/arch/x86/crypto/aegis128-aesni-asm.S ++++ b/arch/x86/crypto/aegis128-aesni-asm.S +@@ -1,14 +1,13 @@ + /* SPDX-License-Identifier: GPL-2.0-only */ + /* +- * AES-NI + SSE2 implementation of AEGIS-128 ++ * AES-NI + SSE4.1 implementation of AEGIS-128 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. ++ * Copyright 2024 Google LLC + */ + + #include +-#include +-#include + + #define STATE0 %xmm0 + #define STATE1 %xmm1 +@@ -20,11 +19,6 @@ + #define T0 %xmm6 + #define T1 %xmm7 + +-#define STATEP %rdi +-#define LEN %rsi +-#define SRC %rdx +-#define DST %rcx +- + .section .rodata.cst16.aegis128_const, "aM", @progbits, 32 + .align 16 + .Laegis128_const_0: +@@ -34,11 +28,11 @@ + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +-.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16 +-.align 16 +-.Laegis128_counter: +- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 +- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f ++.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32 ++.align 32 ++.Lzeropad_mask: ++ .octa 0xffffffffffffffffffffffffffffffff ++ .octa 0 + + .text + +@@ -61,140 +55,102 @@ + .endm + + /* +- * __load_partial: internal ABI +- * input: +- * LEN - bytes +- * SRC - src +- * output: +- * MSG - message block +- * changed: +- * T0 +- * %r8 +- * %r9 ++ * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register ++ * MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8. 
+ */ +-SYM_FUNC_START_LOCAL(__load_partial) +- xor %r9d, %r9d +- pxor MSG, MSG +- +- mov LEN, %r8 +- and $0x1, %r8 +- jz .Lld_partial_1 +- +- mov LEN, %r8 +- and $0x1E, %r8 +- add SRC, %r8 +- mov (%r8), %r9b +- +-.Lld_partial_1: +- mov LEN, %r8 +- and $0x2, %r8 +- jz .Lld_partial_2 +- +- mov LEN, %r8 +- and $0x1C, %r8 +- add SRC, %r8 +- shl $0x10, %r9 +- mov (%r8), %r9w +- +-.Lld_partial_2: +- mov LEN, %r8 +- and $0x4, %r8 +- jz .Lld_partial_4 +- +- mov LEN, %r8 +- and $0x18, %r8 +- add SRC, %r8 +- shl $32, %r9 +- mov (%r8), %r8d +- xor %r8, %r9 +- +-.Lld_partial_4: +- movq %r9, MSG +- +- mov LEN, %r8 +- and $0x8, %r8 +- jz .Lld_partial_8 +- +- mov LEN, %r8 +- and $0x10, %r8 +- add SRC, %r8 +- pslldq $8, MSG +- movq (%r8), T0 +- pxor T0, MSG +- +-.Lld_partial_8: +- RET +-SYM_FUNC_END(__load_partial) ++.macro load_partial ++ sub $8, %ecx /* LEN - 8 */ ++ jle .Lle8\@ ++ ++ /* Load 9 <= LEN <= 15 bytes: */ ++ movq (SRC), MSG /* Load first 8 bytes */ ++ mov (SRC, %rcx), %rax /* Load last 8 bytes */ ++ neg %ecx ++ shl $3, %ecx ++ shr %cl, %rax /* Discard overlapping bytes */ ++ pinsrq $1, %rax, MSG ++ jmp .Ldone\@ ++ ++.Lle8\@: ++ add $4, %ecx /* LEN - 4 */ ++ jl .Llt4\@ ++ ++ /* Load 4 <= LEN <= 8 bytes: */ ++ mov (SRC), %eax /* Load first 4 bytes */ ++ mov (SRC, %rcx), %r8d /* Load last 4 bytes */ ++ jmp .Lcombine\@ ++ ++.Llt4\@: ++ /* Load 1 <= LEN <= 3 bytes: */ ++ add $2, %ecx /* LEN - 2 */ ++ movzbl (SRC), %eax /* Load first byte */ ++ jl .Lmovq\@ ++ movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */ ++.Lcombine\@: ++ shl $3, %ecx ++ shl %cl, %r8 ++ or %r8, %rax /* Combine the two parts */ ++.Lmovq\@: ++ movq %rax, MSG ++.Ldone\@: ++.endm + + /* +- * __store_partial: internal ABI +- * input: +- * LEN - bytes +- * DST - dst +- * output: +- * T0 - message block +- * changed: +- * %r8 +- * %r9 +- * %r10 ++ * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer ++ * DST. Clobbers %rax, %rcx, and %r8. + */ +-SYM_FUNC_START_LOCAL(__store_partial) +- mov LEN, %r8 +- mov DST, %r9 +- +- movq T0, %r10 +- +- cmp $8, %r8 +- jl .Lst_partial_8 +- +- mov %r10, (%r9) +- psrldq $8, T0 +- movq T0, %r10 +- +- sub $8, %r8 +- add $8, %r9 +- +-.Lst_partial_8: +- cmp $4, %r8 +- jl .Lst_partial_4 +- +- mov %r10d, (%r9) +- shr $32, %r10 +- +- sub $4, %r8 +- add $4, %r9 +- +-.Lst_partial_4: +- cmp $2, %r8 +- jl .Lst_partial_2 +- +- mov %r10w, (%r9) +- shr $0x10, %r10 +- +- sub $2, %r8 +- add $2, %r9 +- +-.Lst_partial_2: +- cmp $1, %r8 +- jl .Lst_partial_1 +- +- mov %r10b, (%r9) +- +-.Lst_partial_1: +- RET +-SYM_FUNC_END(__store_partial) ++.macro store_partial msg ++ sub $8, %ecx /* LEN - 8 */ ++ jl .Llt8\@ ++ ++ /* Store 8 <= LEN <= 15 bytes: */ ++ pextrq $1, \msg, %rax ++ mov %ecx, %r8d ++ shl $3, %ecx ++ ror %cl, %rax ++ mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */ ++ movq \msg, (DST) /* Store first 8 bytes */ ++ jmp .Ldone\@ ++ ++.Llt8\@: ++ add $4, %ecx /* LEN - 4 */ ++ jl .Llt4\@ ++ ++ /* Store 4 <= LEN <= 7 bytes: */ ++ pextrd $1, \msg, %eax ++ mov %ecx, %r8d ++ shl $3, %ecx ++ ror %cl, %eax ++ mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */ ++ movd \msg, (DST) /* Store first 4 bytes */ ++ jmp .Ldone\@ ++ ++.Llt4\@: ++ /* Store 1 <= LEN <= 3 bytes: */ ++ pextrb $0, \msg, 0(DST) ++ cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? 
*/ ++ jl .Ldone\@ ++ pextrb $1, \msg, 1(DST) ++ je .Ldone\@ ++ pextrb $2, \msg, 2(DST) ++.Ldone\@: ++.endm + + /* +- * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); ++ * void aegis128_aesni_init(struct aegis_state *state, ++ * const struct aegis_block *key, ++ * const u8 iv[AEGIS128_NONCE_SIZE]); + */ +-SYM_FUNC_START(crypto_aegis128_aesni_init) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_init) ++ .set STATEP, %rdi ++ .set KEYP, %rsi ++ .set IVP, %rdx + + /* load IV: */ +- movdqu (%rdx), T1 ++ movdqu (IVP), T1 + + /* load key: */ +- movdqa (%rsi), KEY ++ movdqa (KEYP), KEY + pxor KEY, T1 + movdqa T1, STATE0 + movdqa KEY, STATE3 +@@ -224,20 +180,22 @@ SYM_FUNC_START(crypto_aegis128_aesni_init) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) +- +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_init) ++SYM_FUNC_END(aegis128_aesni_init) + + /* +- * void crypto_aegis128_aesni_ad(void *state, unsigned int length, +- * const void *data); ++ * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, ++ * unsigned int len); ++ * ++ * len must be a multiple of 16. + */ +-SYM_FUNC_START(crypto_aegis128_aesni_ad) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_ad) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set LEN, %edx + +- cmp $0x10, LEN +- jb .Lad_out ++ test LEN, LEN ++ jz .Lad_out + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -246,89 +204,40 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + +- mov SRC, %r8 +- and $0xF, %r8 +- jnz .Lad_u_loop +- +-.align 8 +-.Lad_a_loop: +- movdqa 0x00(SRC), MSG +- aegis128_update +- pxor MSG, STATE4 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_1 +- +- movdqa 0x10(SRC), MSG +- aegis128_update +- pxor MSG, STATE3 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_2 +- +- movdqa 0x20(SRC), MSG +- aegis128_update +- pxor MSG, STATE2 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_3 +- +- movdqa 0x30(SRC), MSG +- aegis128_update +- pxor MSG, STATE1 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_4 +- +- movdqa 0x40(SRC), MSG +- aegis128_update +- pxor MSG, STATE0 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_0 +- +- add $0x50, SRC +- jmp .Lad_a_loop +- + .align 8 +-.Lad_u_loop: ++.Lad_loop: + movdqu 0x00(SRC), MSG + aegis128_update + pxor MSG, STATE4 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_1 ++ jz .Lad_out_1 + + movdqu 0x10(SRC), MSG + aegis128_update + pxor MSG, STATE3 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_2 ++ jz .Lad_out_2 + + movdqu 0x20(SRC), MSG + aegis128_update + pxor MSG, STATE2 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_3 ++ jz .Lad_out_3 + + movdqu 0x30(SRC), MSG + aegis128_update + pxor MSG, STATE1 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_4 ++ jz .Lad_out_4 + + movdqu 0x40(SRC), MSG + aegis128_update + pxor MSG, STATE0 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_0 ++ jz .Lad_out_0 + + add $0x50, SRC +- jmp .Lad_u_loop ++ jmp .Lad_loop + + /* store the state: */ + .Lad_out_0: +@@ -337,7 +246,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_1: +@@ -346,7 +254,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_2: +@@ -355,7 +262,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE0, 0x20(STATEP) + 
movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_3: +@@ -364,7 +270,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_4: +@@ -373,41 +278,38 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) +- FRAME_END +- RET +- + .Lad_out: +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_ad) ++SYM_FUNC_END(aegis128_aesni_ad) + +-.macro encrypt_block a s0 s1 s2 s3 s4 i +- movdq\a (\i * 0x10)(SRC), MSG ++.macro encrypt_block s0 s1 s2 s3 s4 i ++ movdqu (\i * 0x10)(SRC), MSG + movdqa MSG, T0 + pxor \s1, T0 + pxor \s4, T0 + movdqa \s2, T1 + pand \s3, T1 + pxor T1, T0 +- movdq\a T0, (\i * 0x10)(DST) ++ movdqu T0, (\i * 0x10)(DST) + + aegis128_update + pxor MSG, \s4 + + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lenc_out_\i ++ jz .Lenc_out_\i + .endm + + /* +- * void crypto_aegis128_aesni_enc(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst, ++ * unsigned int len); ++ * ++ * len must be nonzero and a multiple of 16. + */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) +- FRAME_BEGIN +- +- cmp $0x10, LEN +- jb .Lenc_out ++SYM_FUNC_START(aegis128_aesni_enc) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -416,34 +318,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + +- mov SRC, %r8 +- or DST, %r8 +- and $0xF, %r8 +- jnz .Lenc_u_loop +- + .align 8 +-.Lenc_a_loop: +- encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 +- encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 +- encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 +- encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 +- encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 ++.Lenc_loop: ++ encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 ++ encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 ++ encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 ++ encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 ++ encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST +- jmp .Lenc_a_loop +- +-.align 8 +-.Lenc_u_loop: +- encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 +- encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 +- encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 +- encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 +- encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 +- +- add $0x50, SRC +- add $0x50, DST +- jmp .Lenc_u_loop ++ jmp .Lenc_loop + + /* store the state: */ + .Lenc_out_0: +@@ -452,7 +337,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- FRAME_END + RET + + .Lenc_out_1: +@@ -461,7 +345,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) +- FRAME_END + RET + + .Lenc_out_2: +@@ -470,7 +353,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) +- FRAME_END + RET + + .Lenc_out_3: +@@ -479,7 +361,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) 
+- FRAME_END + RET + + .Lenc_out_4: +@@ -488,20 +369,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) +- FRAME_END +- RET +- + .Lenc_out: +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_enc) ++SYM_FUNC_END(aegis128_aesni_enc) + + /* +- * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src, ++ * u8 *dst, unsigned int len); + */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_enc_tail) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -511,7 +391,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) + movdqu 0x40(STATEP), STATE4 + + /* encrypt message: */ +- call __load_partial ++ mov LEN, %r9d ++ load_partial + + movdqa MSG, T0 + pxor STATE1, T0 +@@ -520,7 +401,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) + pand STATE3, T1 + pxor T1, T0 + +- call __store_partial ++ mov %r9d, LEN ++ store_partial T0 + + aegis128_update + pxor MSG, STATE4 +@@ -531,37 +413,36 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) ++SYM_FUNC_END(aegis128_aesni_enc_tail) + +-.macro decrypt_block a s0 s1 s2 s3 s4 i +- movdq\a (\i * 0x10)(SRC), MSG ++.macro decrypt_block s0 s1 s2 s3 s4 i ++ movdqu (\i * 0x10)(SRC), MSG + pxor \s1, MSG + pxor \s4, MSG + movdqa \s2, T1 + pand \s3, T1 + pxor T1, MSG +- movdq\a MSG, (\i * 0x10)(DST) ++ movdqu MSG, (\i * 0x10)(DST) + + aegis128_update + pxor MSG, \s4 + + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Ldec_out_\i ++ jz .Ldec_out_\i + .endm + + /* +- * void crypto_aegis128_aesni_dec(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst, ++ * unsigned int len); ++ * ++ * len must be nonzero and a multiple of 16. 
+ */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) +- FRAME_BEGIN +- +- cmp $0x10, LEN +- jb .Ldec_out ++SYM_FUNC_START(aegis128_aesni_dec) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -570,34 +451,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + +- mov SRC, %r8 +- or DST, %r8 +- and $0xF, %r8 +- jnz .Ldec_u_loop +- + .align 8 +-.Ldec_a_loop: +- decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 +- decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 +- decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 +- decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 +- decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 ++.Ldec_loop: ++ decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 ++ decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 ++ decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 ++ decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 ++ decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST +- jmp .Ldec_a_loop +- +-.align 8 +-.Ldec_u_loop: +- decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 +- decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 +- decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 +- decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 +- decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 +- +- add $0x50, SRC +- add $0x50, DST +- jmp .Ldec_u_loop ++ jmp .Ldec_loop + + /* store the state: */ + .Ldec_out_0: +@@ -606,7 +470,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_1: +@@ -615,7 +478,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_2: +@@ -624,7 +486,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_3: +@@ -633,7 +494,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_4: +@@ -642,20 +502,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) +- FRAME_END +- RET +- + .Ldec_out: +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_dec) ++SYM_FUNC_END(aegis128_aesni_dec) + + /* +- * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src, ++ * u8 *dst, unsigned int len); + */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_dec_tail) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -665,7 +524,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) + movdqu 0x40(STATEP), STATE4 + + /* decrypt message: */ +- call __load_partial ++ mov LEN, %r9d ++ load_partial + + pxor STATE1, MSG + pxor STATE4, MSG +@@ -673,17 +533,13 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) + pand STATE3, T1 + pxor T1, MSG + +- movdqa MSG, T0 +- call __store_partial ++ mov %r9d, LEN ++ 
store_partial MSG + + /* mask with byte count: */ +- movq LEN, T0 +- punpcklbw T0, T0 +- punpcklbw T0, T0 +- punpcklbw T0, T0 +- punpcklbw T0, T0 +- movdqa .Laegis128_counter(%rip), T1 +- pcmpgtb T1, T0 ++ lea .Lzeropad_mask+16(%rip), %rax ++ sub %r9, %rax ++ movdqu (%rax), T0 + pand T0, MSG + + aegis128_update +@@ -695,17 +551,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) ++SYM_FUNC_END(aegis128_aesni_dec_tail) + + /* +- * void crypto_aegis128_aesni_final(void *state, void *tag_xor, +- * u64 assoclen, u64 cryptlen); ++ * void aegis128_aesni_final(struct aegis_state *state, ++ * struct aegis_block *tag_xor, ++ * unsigned int assoclen, unsigned int cryptlen); + */ +-SYM_FUNC_START(crypto_aegis128_aesni_final) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_final) ++ .set STATEP, %rdi ++ .set TAG_XOR, %rsi ++ .set ASSOCLEN, %edx ++ .set CRYPTLEN, %ecx + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -715,10 +573,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) + movdqu 0x40(STATEP), STATE4 + + /* prepare length block: */ +- movq %rdx, MSG +- movq %rcx, T0 +- pslldq $8, T0 +- pxor T0, MSG ++ movd ASSOCLEN, MSG ++ pinsrd $2, CRYPTLEN, MSG + psllq $3, MSG /* multiply by 8 (to get bit count) */ + + pxor STATE3, MSG +@@ -733,7 +589,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) + aegis128_update; pxor MSG, STATE3 + + /* xor tag: */ +- movdqu (%rsi), MSG ++ movdqu (TAG_XOR), MSG + + pxor STATE0, MSG + pxor STATE1, MSG +@@ -741,8 +597,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) + pxor STATE3, MSG + pxor STATE4, MSG + +- movdqu MSG, (%rsi) +- +- FRAME_END ++ movdqu MSG, (TAG_XOR) + RET +-SYM_FUNC_END(crypto_aegis128_aesni_final) ++SYM_FUNC_END(aegis128_aesni_final) +diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c +index 4623189000d8..c19d8e3d96a3 100644 +--- a/arch/x86/crypto/aegis128-aesni-glue.c ++++ b/arch/x86/crypto/aegis128-aesni-glue.c +@@ -1,7 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0-or-later + /* + * The AEGIS-128 Authenticated-Encryption Algorithm +- * Glue for AES-NI + SSE2 implementation ++ * Glue for AES-NI + SSE4.1 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 
+@@ -23,27 +23,6 @@ + #define AEGIS128_MIN_AUTH_SIZE 8 + #define AEGIS128_MAX_AUTH_SIZE 16 + +-asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv); +- +-asmlinkage void crypto_aegis128_aesni_ad( +- void *state, unsigned int length, const void *data); +- +-asmlinkage void crypto_aegis128_aesni_enc( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_dec( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_enc_tail( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_dec_tail( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_final( +- void *state, void *tag_xor, unsigned int cryptlen, +- unsigned int assoclen); +- + struct aegis_block { + u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); + }; +@@ -56,15 +35,31 @@ struct aegis_ctx { + struct aegis_block key; + }; + +-struct aegis_crypt_ops { +- int (*skcipher_walk_init)(struct skcipher_walk *walk, +- struct aead_request *req, bool atomic); ++asmlinkage void aegis128_aesni_init(struct aegis_state *state, ++ const struct aegis_block *key, ++ const u8 iv[AEGIS128_NONCE_SIZE]); + +- void (*crypt_blocks)(void *state, unsigned int length, const void *src, +- void *dst); +- void (*crypt_tail)(void *state, unsigned int length, const void *src, +- void *dst); +-}; ++asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, ++ unsigned int len); ++ ++asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, ++ u8 *dst, unsigned int len); ++ ++asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, ++ u8 *dst, unsigned int len); ++ ++asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state, ++ const u8 *src, u8 *dst, ++ unsigned int len); ++ ++asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state, ++ const u8 *src, u8 *dst, ++ unsigned int len); ++ ++asmlinkage void aegis128_aesni_final(struct aegis_state *state, ++ struct aegis_block *tag_xor, ++ unsigned int assoclen, ++ unsigned int cryptlen); + + static void crypto_aegis128_aesni_process_ad( + struct aegis_state *state, struct scatterlist *sg_src, +@@ -85,16 +80,15 @@ static void crypto_aegis128_aesni_process_ad( + if (pos > 0) { + unsigned int fill = AEGIS128_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); +- crypto_aegis128_aesni_ad(state, +- AEGIS128_BLOCK_SIZE, +- buf.bytes); ++ aegis128_aesni_ad(state, buf.bytes, ++ AEGIS128_BLOCK_SIZE); + pos = 0; + left -= fill; + src += fill; + } + +- crypto_aegis128_aesni_ad(state, left, src); +- ++ aegis128_aesni_ad(state, src, ++ left & ~(AEGIS128_BLOCK_SIZE - 1)); + src += left & ~(AEGIS128_BLOCK_SIZE - 1); + left &= AEGIS128_BLOCK_SIZE - 1; + } +@@ -110,24 +104,37 @@ static void crypto_aegis128_aesni_process_ad( + + if (pos > 0) { + memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos); +- crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes); ++ aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE); + } + } + +-static void crypto_aegis128_aesni_process_crypt( +- struct aegis_state *state, struct skcipher_walk *walk, +- const struct aegis_crypt_ops *ops) ++static __always_inline void ++crypto_aegis128_aesni_process_crypt(struct aegis_state *state, ++ struct skcipher_walk *walk, bool enc) + { + while (walk->nbytes >= AEGIS128_BLOCK_SIZE) { +- ops->crypt_blocks(state, +- 
round_down(walk->nbytes, AEGIS128_BLOCK_SIZE), +- walk->src.virt.addr, walk->dst.virt.addr); ++ if (enc) ++ aegis128_aesni_enc(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ round_down(walk->nbytes, ++ AEGIS128_BLOCK_SIZE)); ++ else ++ aegis128_aesni_dec(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ round_down(walk->nbytes, ++ AEGIS128_BLOCK_SIZE)); + skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE); + } + + if (walk->nbytes) { +- ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr, +- walk->dst.virt.addr); ++ if (enc) ++ aegis128_aesni_enc_tail(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ walk->nbytes); ++ else ++ aegis128_aesni_dec_tail(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ walk->nbytes); + skcipher_walk_done(walk, 0); + } + } +@@ -162,42 +169,39 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, + return 0; + } + +-static void crypto_aegis128_aesni_crypt(struct aead_request *req, +- struct aegis_block *tag_xor, +- unsigned int cryptlen, +- const struct aegis_crypt_ops *ops) ++static __always_inline void ++crypto_aegis128_aesni_crypt(struct aead_request *req, ++ struct aegis_block *tag_xor, ++ unsigned int cryptlen, bool enc) + { + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); + struct skcipher_walk walk; + struct aegis_state state; + +- ops->skcipher_walk_init(&walk, req, true); ++ if (enc) ++ skcipher_walk_aead_encrypt(&walk, req, true); ++ else ++ skcipher_walk_aead_decrypt(&walk, req, true); + + kernel_fpu_begin(); + +- crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); ++ aegis128_aesni_init(&state, &ctx->key, req->iv); + crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); +- crypto_aegis128_aesni_process_crypt(&state, &walk, ops); +- crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); ++ crypto_aegis128_aesni_process_crypt(&state, &walk, enc); ++ aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); + } + + static int crypto_aegis128_aesni_encrypt(struct aead_request *req) + { +- static const struct aegis_crypt_ops OPS = { +- .skcipher_walk_init = skcipher_walk_aead_encrypt, +- .crypt_blocks = crypto_aegis128_aesni_enc, +- .crypt_tail = crypto_aegis128_aesni_enc_tail, +- }; +- + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + +- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); ++ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); +@@ -208,12 +212,6 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) + { + static const struct aegis_block zeros = {}; + +- static const struct aegis_crypt_ops OPS = { +- .skcipher_walk_init = skcipher_walk_aead_decrypt, +- .crypt_blocks = crypto_aegis128_aesni_dec, +- .crypt_tail = crypto_aegis128_aesni_dec_tail, +- }; +- + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); +@@ -222,27 +220,16 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + +- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); ++ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false); + + return crypto_memneq(tag.bytes, 
zeros.bytes, authsize) ? -EBADMSG : 0; + } + +-static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead) +-{ +- return 0; +-} +- +-static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) +-{ +-} +- + static struct aead_alg crypto_aegis128_aesni_alg = { + .setkey = crypto_aegis128_aesni_setkey, + .setauthsize = crypto_aegis128_aesni_setauthsize, + .encrypt = crypto_aegis128_aesni_encrypt, + .decrypt = crypto_aegis128_aesni_decrypt, +- .init = crypto_aegis128_aesni_init_tfm, +- .exit = crypto_aegis128_aesni_exit_tfm, + + .ivsize = AEGIS128_NONCE_SIZE, + .maxauthsize = AEGIS128_MAX_AUTH_SIZE, +@@ -267,7 +254,7 @@ static struct simd_aead_alg *simd_alg; + + static int __init crypto_aegis128_aesni_module_init(void) + { +- if (!boot_cpu_has(X86_FEATURE_XMM2) || ++ if (!boot_cpu_has(X86_FEATURE_XMM4_1) || + !boot_cpu_has(X86_FEATURE_AES) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) + return -ENODEV; +@@ -286,6 +273,6 @@ module_exit(crypto_aegis128_aesni_module_exit); + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Ondrej Mosnacek "); +-MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation"); ++MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation"); + MODULE_ALIAS_CRYPTO("aegis128"); + MODULE_ALIAS_CRYPTO("aegis128-aesni"); +diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c +index feccb5254c7e..52c5d47ef5a1 100644 +--- a/arch/x86/crypto/crc32c-intel_glue.c ++++ b/arch/x86/crypto/crc32c-intel_glue.c +@@ -41,7 +41,7 @@ + */ + #define CRC32C_PCL_BREAKEVEN 512 + +-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, ++asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len, + unsigned int crc_init); + #endif /* CONFIG_X86_64 */ + +diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +index bbcff1fb78cb..752812bc4991 100644 +--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S ++++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +@@ -7,6 +7,7 @@ + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf + * + * Copyright (C) 2012 Intel Corporation. ++ * Copyright 2024 Google LLC + * + * Authors: + * Wajdi Feghali +@@ -44,185 +45,129 @@ + */ + + #include +-#include + + ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +-.macro LABEL prefix n +-.L\prefix\n\(): +-.endm +- +-.macro JMPTBL_ENTRY i +-.quad .Lcrc_\i +-.endm +- +-.macro JNC_LESS_THAN j +- jnc .Lless_than_\j +-.endm +- +-# Define threshold where buffers are considered "small" and routed to more +-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +-# SMALL_SIZE can be no larger than 255. +- ++# Define threshold below which buffers are considered "small" and routed to ++# regular CRC code that does not interleave the CRC instructions. 
+ #define SMALL_SIZE 200 + +-.if (SMALL_SIZE > 255) +-.error "SMALL_ SIZE must be < 256" +-.endif +- +-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); ++# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init); + + .text + SYM_FUNC_START(crc_pcl) +-#define bufp rdi +-#define bufp_dw %edi +-#define bufp_w %di +-#define bufp_b %dil +-#define bufptmp %rcx +-#define block_0 %rcx +-#define block_1 %rdx +-#define block_2 %r11 +-#define len %rsi +-#define len_dw %esi +-#define len_w %si +-#define len_b %sil +-#define crc_init_arg %rdx +-#define tmp %rbx +-#define crc_init %r8 +-#define crc_init_dw %r8d +-#define crc1 %r9 +-#define crc2 %r10 +- +- pushq %rbx +- pushq %rdi +- pushq %rsi +- +- ## Move crc_init for Linux to a different +- mov crc_init_arg, crc_init ++#define bufp %rdi ++#define bufp_d %edi ++#define len %esi ++#define crc_init %edx ++#define crc_init_q %rdx ++#define n_misaligned %ecx /* overlaps chunk_bytes! */ ++#define n_misaligned_q %rcx ++#define chunk_bytes %ecx /* overlaps n_misaligned! */ ++#define chunk_bytes_q %rcx ++#define crc1 %r8 ++#define crc2 %r9 ++ ++ cmp $SMALL_SIZE, len ++ jb .Lsmall + + ################################################################ + ## 1) ALIGN: + ################################################################ +- +- mov %bufp, bufptmp # rdi = *buf +- neg %bufp +- and $7, %bufp # calculate the unalignment amount of ++ mov bufp_d, n_misaligned ++ neg n_misaligned ++ and $7, n_misaligned # calculate the misalignment amount of + # the address +- je .Lproc_block # Skip if aligned +- +- ## If len is less than 8 and we're unaligned, we need to jump +- ## to special code to avoid reading beyond the end of the buffer +- cmp $8, len +- jae .Ldo_align +- # less_than_8 expects length in upper 3 bits of len_dw +- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +- shl $32-3+1, len_dw +- jmp .Lless_than_8_post_shl1 ++ je .Laligned # Skip if aligned + ++ # Process 1 <= n_misaligned <= 7 bytes individually in order to align ++ # the remaining data to an 8-byte boundary. 
+ .Ldo_align: +- #### Calculate CRC of unaligned bytes of the buffer (if any) +- movq (bufptmp), tmp # load a quadward from the buffer +- add %bufp, bufptmp # align buffer pointer for quadword +- # processing +- sub %bufp, len # update buffer length ++ movq (bufp), %rax ++ add n_misaligned_q, bufp ++ sub n_misaligned, len + .Lalign_loop: +- crc32b %bl, crc_init_dw # compute crc32 of 1-byte +- shr $8, tmp # get next byte +- dec %bufp ++ crc32b %al, crc_init # compute crc32 of 1-byte ++ shr $8, %rax # get next byte ++ dec n_misaligned + jne .Lalign_loop +- +-.Lproc_block: ++.Laligned: + + ################################################################ +- ## 2) PROCESS BLOCKS: ++ ## 2) PROCESS BLOCK: + ################################################################ + +- ## compute num of bytes to be processed +- movq len, tmp # save num bytes in tmp +- +- cmpq $128*24, len ++ cmp $128*24, len + jae .Lfull_block + +-.Lcontinue_block: +- cmpq $SMALL_SIZE, len +- jb .Lsmall +- +- ## len < 128*24 +- movq $2731, %rax # 2731 = ceil(2^16 / 24) +- mul len_dw +- shrq $16, %rax +- +- ## eax contains floor(bytes / 24) = num 24-byte chunks to do +- +- ## process rax 24-byte chunks (128 >= rax >= 0) +- +- ## compute end address of each block +- ## block 0 (base addr + RAX * 8) +- ## block 1 (base addr + RAX * 16) +- ## block 2 (base addr + RAX * 24) +- lea (bufptmp, %rax, 8), block_0 +- lea (block_0, %rax, 8), block_1 +- lea (block_1, %rax, 8), block_2 ++.Lpartial_block: ++ # Compute floor(len / 24) to get num qwords to process from each lane. ++ imul $2731, len, %eax # 2731 = ceil(2^16 / 24) ++ shr $16, %eax ++ jmp .Lcrc_3lanes + +- xor crc1, crc1 +- xor crc2, crc2 +- +- ## branch into array +- leaq jump_table(%rip), %bufp +- mov (%bufp,%rax,8), %bufp +- JMP_NOSPEC bufp +- +- ################################################################ +- ## 2a) PROCESS FULL BLOCKS: +- ################################################################ + .Lfull_block: +- movl $128,%eax +- lea 128*8*2(block_0), block_1 +- lea 128*8*3(block_0), block_2 +- add $128*8*1, block_0 +- +- xor crc1,crc1 +- xor crc2,crc2 +- +- # Fall through into top of crc array (crc_128) ++ # Processing 128 qwords from each lane. ++ mov $128, %eax + + ################################################################ +- ## 3) CRC Array: ++ ## 3) CRC each of three lanes: + ################################################################ + +- i=128 +-.rept 128-1 +-.altmacro +-LABEL crc_ %i +-.noaltmacro +- ENDBR +- crc32q -i*8(block_0), crc_init +- crc32q -i*8(block_1), crc1 +- crc32q -i*8(block_2), crc2 +- i=(i-1) +-.endr +- +-.altmacro +-LABEL crc_ %i +-.noaltmacro +- ENDBR +- crc32q -i*8(block_0), crc_init +- crc32q -i*8(block_1), crc1 +-# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet +- +- mov block_2, block_0 ++.Lcrc_3lanes: ++ xor crc1,crc1 ++ xor crc2,crc2 ++ mov %eax, chunk_bytes ++ shl $3, chunk_bytes # num bytes to process from each lane ++ sub $5, %eax # 4 for 4x_loop, 1 for special last iter ++ jl .Lcrc_3lanes_4x_done ++ ++ # Unroll the loop by a factor of 4 to reduce the overhead of the loop ++ # bookkeeping instructions, which can compete with crc32q for the ALUs. 
++.Lcrc_3lanes_4x_loop: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++ crc32q (bufp,chunk_bytes_q,2), crc2 ++ crc32q 8(bufp), crc_init_q ++ crc32q 8(bufp,chunk_bytes_q), crc1 ++ crc32q 8(bufp,chunk_bytes_q,2), crc2 ++ crc32q 16(bufp), crc_init_q ++ crc32q 16(bufp,chunk_bytes_q), crc1 ++ crc32q 16(bufp,chunk_bytes_q,2), crc2 ++ crc32q 24(bufp), crc_init_q ++ crc32q 24(bufp,chunk_bytes_q), crc1 ++ crc32q 24(bufp,chunk_bytes_q,2), crc2 ++ add $32, bufp ++ sub $4, %eax ++ jge .Lcrc_3lanes_4x_loop ++ ++.Lcrc_3lanes_4x_done: ++ add $4, %eax ++ jz .Lcrc_3lanes_last_qword ++ ++.Lcrc_3lanes_1x_loop: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++ crc32q (bufp,chunk_bytes_q,2), crc2 ++ add $8, bufp ++ dec %eax ++ jnz .Lcrc_3lanes_1x_loop ++ ++.Lcrc_3lanes_last_qword: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet + + ################################################################ + ## 4) Combine three results: + ################################################################ + +- lea (K_table-8)(%rip), %bufp # first entry is for idx 1 +- shlq $3, %rax # rax *= 8 +- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 +- leal (%eax,%eax,2), %eax # rax *= 3 (total *24) +- subq %rax, tmp # tmp -= rax*24 ++ lea (K_table-8)(%rip), %rax # first entry is for idx 1 ++ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 ++ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 ++ sub %eax, len # len -= chunk_bytes * 3 + +- movq crc_init, %xmm1 # CRC for block 1 ++ movq crc_init_q, %xmm1 # CRC for block 1 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 +@@ -230,103 +175,54 @@ LABEL crc_ %i + + pxor %xmm2,%xmm1 + movq %xmm1, %rax +- xor -i*8(block_2), %rax +- mov crc2, crc_init +- crc32 %rax, crc_init ++ xor (bufp,chunk_bytes_q,2), %rax ++ mov crc2, crc_init_q ++ crc32 %rax, crc_init_q ++ lea 8(bufp,chunk_bytes_q,2), bufp + + ################################################################ +- ## 5) Check for end: ++ ## 5) If more blocks remain, goto (2): + ################################################################ + +-LABEL crc_ 0 +- ENDBR +- mov tmp, len +- cmp $128*24, tmp +- jae .Lfull_block +- cmp $24, tmp +- jae .Lcontinue_block +- +-.Lless_than_24: +- shl $32-4, len_dw # less_than_16 expects length +- # in upper 4 bits of len_dw +- jnc .Lless_than_16 +- crc32q (bufptmp), crc_init +- crc32q 8(bufptmp), crc_init +- jz .Ldo_return +- add $16, bufptmp +- # len is less than 8 if we got here +- # less_than_8 expects length in upper 3 bits of len_dw +- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +- shl $2, len_dw +- jmp .Lless_than_8_post_shl1 ++ cmp $128*24, len ++ jae .Lfull_block ++ cmp $SMALL_SIZE, len ++ jae .Lpartial_block + + ####################################################################### +- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) ++ ## 6) Process any remainder without interleaving: + ####################################################################### + .Lsmall: +- shl $32-8, len_dw # Prepare len_dw for less_than_256 +- j=256 +-.rept 5 # j = {256, 128, 64, 32, 16} +-.altmacro +-LABEL less_than_ %j # less_than_j: Length should be in +- # upper lg(j) bits of len_dw +- j=(j/2) +- shl $1, len_dw # Get next MSB +- JNC_LESS_THAN %j +-.noaltmacro +- i=0 +-.rept (j/8) +- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data +- i=i+8 +-.endr +- jz .Ldo_return # 
Return if remaining length is zero +- add $j, bufptmp # Advance buf +-.endr +- +-.Lless_than_8: # Length should be stored in +- # upper 3 bits of len_dw +- shl $1, len_dw +-.Lless_than_8_post_shl1: +- jnc .Lless_than_4 +- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes +- jz .Ldo_return # return if remaining data is zero +- add $4, bufptmp +-.Lless_than_4: # Length should be stored in +- # upper 2 bits of len_dw +- shl $1, len_dw +- jnc .Lless_than_2 +- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes +- jz .Ldo_return # return if remaining data is zero +- add $2, bufptmp +-.Lless_than_2: # Length should be stored in the MSB +- # of len_dw +- shl $1, len_dw +- jnc .Lless_than_1 +- crc32b (bufptmp), crc_init_dw # CRC of 1 byte +-.Lless_than_1: # Length should be zero +-.Ldo_return: +- movq crc_init, %rax +- popq %rsi +- popq %rdi +- popq %rbx ++ test len, len ++ jz .Ldone ++ mov len, %eax ++ shr $3, %eax ++ jz .Ldo_dword ++.Ldo_qwords: ++ crc32q (bufp), crc_init_q ++ add $8, bufp ++ dec %eax ++ jnz .Ldo_qwords ++.Ldo_dword: ++ test $4, len ++ jz .Ldo_word ++ crc32l (bufp), crc_init ++ add $4, bufp ++.Ldo_word: ++ test $2, len ++ jz .Ldo_byte ++ crc32w (bufp), crc_init ++ add $2, bufp ++.Ldo_byte: ++ test $1, len ++ jz .Ldone ++ crc32b (bufp), crc_init ++.Ldone: ++ mov crc_init, %eax + RET + SYM_FUNC_END(crc_pcl) + + .section .rodata, "a", @progbits +- ################################################################ +- ## jump table Table is 129 entries x 2 bytes each +- ################################################################ +-.align 4 +-jump_table: +- i=0 +-.rept 129 +-.altmacro +-JMPTBL_ENTRY %i +-.noaltmacro +- i=i+1 +-.endr +- +- + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 words (8 bytes) each +-- +2.47.0 + +From 3032e50511074fb9d365a2a57b5905b9d0437af6 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 14:48:52 +0100 +Subject: [PATCH 07/13] fixes + +Signed-off-by: Peter Jung +--- + arch/Kconfig | 4 +-- + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 +++ + drivers/gpu/drm/drm_edid.c | 47 +++++++++++++++++++++++-- + drivers/misc/lkdtm/bugs.c | 2 +- + include/linux/compiler_attributes.h | 13 ------- + include/linux/compiler_types.h | 19 ++++++++++ + init/Kconfig | 8 +++++ + lib/overflow_kunit.c | 2 +- + mm/huge_memory.c | 21 ++++++++--- + mm/memcontrol.c | 3 +- + mm/page_alloc.c | 5 ++- + scripts/package/PKGBUILD | 5 +++ + 12 files changed, 105 insertions(+), 29 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig -index 975dd22a2dbd..de69b8f5b5be 100644 +index 00551f340dbe..833b2344ce79 100644 --- a/arch/Kconfig +++ b/arch/Kconfig -@@ -1050,7 +1050,7 @@ config ARCH_MMAP_RND_BITS +@@ -1128,7 +1128,7 @@ config ARCH_MMAP_RND_BITS int "Number of bits to use for ASLR of mmap base address" if EXPERT range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT @@ -12086,7 +12970,7 @@ index 975dd22a2dbd..de69b8f5b5be 100644 depends on HAVE_ARCH_MMAP_RND_BITS help This value can be used to select the number of bits to use to -@@ -1084,7 +1084,7 @@ config ARCH_MMAP_RND_COMPAT_BITS +@@ -1162,7 +1162,7 @@ config ARCH_MMAP_RND_COMPAT_BITS int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT @@ -12095,26 +12979,11 @@ index 975dd22a2dbd..de69b8f5b5be 100644 depends on 
HAVE_ARCH_MMAP_RND_COMPAT_BITS help This value can be used to select the number of bits to use to -diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c -index 2408e50743ca..73c54e92afa9 100644 ---- a/drivers/bluetooth/btusb.c -+++ b/drivers/bluetooth/btusb.c -@@ -692,6 +692,10 @@ static const struct usb_device_id quirks_table[] = { - { USB_DEVICE(0x0489, 0xe113), .driver_info = BTUSB_MEDIATEK | - BTUSB_WIDEBAND_SPEECH | - BTUSB_VALID_LE_STATES }, -+ { USB_DEVICE(0x0489, 0xe118), .driver_info = BTUSB_MEDIATEK | -+ BTUSB_WIDEBAND_SPEECH }, -+ { USB_DEVICE(0x0489, 0xe11e), .driver_info = BTUSB_MEDIATEK | -+ BTUSB_WIDEBAND_SPEECH }, - { USB_DEVICE(0x13d3, 0x3602), .driver_info = BTUSB_MEDIATEK | - BTUSB_WIDEBAND_SPEECH | - BTUSB_VALID_LE_STATES }, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -index 9c3b7b027485..ad5c05ee92f3 100644 +index 852e6f315576..f6a6fc6a4f5c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -@@ -3056,6 +3056,11 @@ static int __init amdgpu_init(void) +@@ -3078,6 +3078,11 @@ static int __init amdgpu_init(void) /* Ignore KFD init failures. Normal when CONFIG_HSA_AMD is not set. */ amdgpu_amdkfd_init(); @@ -12126,916 +12995,8 @@ index 9c3b7b027485..ad5c05ee92f3 100644 /* let modprobe override vga console setting */ return pci_register_driver(&amdgpu_kms_pci_driver); -diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -index 21442469791c..18eaab929540 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -@@ -140,7 +140,8 @@ int smu_set_soft_freq_range(struct smu_context *smu, - ret = smu->ppt_funcs->set_soft_freq_limited_range(smu, - clk_type, - min, -- max); -+ max, -+ false); - - return ret; - } -@@ -1257,7 +1258,6 @@ static int smu_sw_init(void *handle) - atomic_set(&smu->smu_power.power_gate.vpe_gated, 1); - atomic_set(&smu->smu_power.power_gate.umsch_mm_gated, 1); - -- smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT]; - smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT] = 0; - smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D] = 1; - smu->workload_prority[PP_SMC_POWER_PROFILE_POWERSAVING] = 2; -@@ -1266,6 +1266,12 @@ static int smu_sw_init(void *handle) - smu->workload_prority[PP_SMC_POWER_PROFILE_COMPUTE] = 5; - smu->workload_prority[PP_SMC_POWER_PROFILE_CUSTOM] = 6; - -+ if (smu->is_apu || -+ (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 1))) -+ smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT]; -+ else -+ smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D]; -+ - smu->workload_setting[0] = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; - smu->workload_setting[1] = PP_SMC_POWER_PROFILE_FULLSCREEN3D; - smu->workload_setting[2] = PP_SMC_POWER_PROFILE_POWERSAVING; -diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h -index b44a185d07e8..5eb4e5c75981 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h -+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h -@@ -1260,7 +1260,8 @@ struct pptable_funcs { - * @set_soft_freq_limited_range: Set the soft frequency range of a clock - * domain in MHz. 
- */ -- int (*set_soft_freq_limited_range)(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t min, uint32_t max); -+ int (*set_soft_freq_limited_range)(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t min, uint32_t max, -+ bool automatic); - - /** - * @set_power_source: Notify the SMU of the current power source. -diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v11_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v11_0.h -index c2ab336bb530..ed8304d82831 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v11_0.h -+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v11_0.h -@@ -255,7 +255,7 @@ int smu_v11_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type c - uint32_t *min, uint32_t *max); - - int smu_v11_0_set_soft_freq_limited_range(struct smu_context *smu, enum smu_clk_type clk_type, -- uint32_t min, uint32_t max); -+ uint32_t min, uint32_t max, bool automatic); - - int smu_v11_0_set_hard_freq_limited_range(struct smu_context *smu, - enum smu_clk_type clk_type, -diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v12_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v12_0.h -index 1ad2dff71090..0886d8cffbd0 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v12_0.h -+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v12_0.h -@@ -56,7 +56,7 @@ int smu_v12_0_set_default_dpm_tables(struct smu_context *smu); - int smu_v12_0_mode2_reset(struct smu_context *smu); - - int smu_v12_0_set_soft_freq_limited_range(struct smu_context *smu, enum smu_clk_type clk_type, -- uint32_t min, uint32_t max); -+ uint32_t min, uint32_t max, bool automatic); - - int smu_v12_0_set_driver_table_location(struct smu_context *smu); - -diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h -index e58220a7ee2f..044d6893b43e 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h -+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h -@@ -219,7 +219,7 @@ int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type c - uint32_t *min, uint32_t *max); - - int smu_v13_0_set_soft_freq_limited_range(struct smu_context *smu, enum smu_clk_type clk_type, -- uint32_t min, uint32_t max); -+ uint32_t min, uint32_t max, bool automatic); - - int smu_v13_0_set_hard_freq_limited_range(struct smu_context *smu, - enum smu_clk_type clk_type, -diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h -index 46b456590a08..6cada19a8482 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h -+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h -@@ -186,7 +186,7 @@ int smu_v14_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type c - uint32_t *min, uint32_t *max); - - int smu_v14_0_set_soft_freq_limited_range(struct smu_context *smu, enum smu_clk_type clk_type, -- uint32_t min, uint32_t max); -+ uint32_t min, uint32_t max, bool automatic); - - int smu_v14_0_set_hard_freq_limited_range(struct smu_context *smu, - enum smu_clk_type clk_type, -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c -index 076620fa3ef5..306a07b366a8 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c -@@ -1689,7 +1689,7 @@ static int navi10_force_clk_levels(struct smu_context *smu, - if (ret) - return 0; - -- ret = smu_v11_0_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq); -+ ret = smu_v11_0_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq, false); - if (ret) - return 0; - break; 
-diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c -index 0d3e1a121b67..cbd5fcbb5547 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c -@@ -1469,7 +1469,7 @@ static int sienna_cichlid_force_clk_levels(struct smu_context *smu, - if (ret) - goto forec_level_out; - -- ret = smu_v11_0_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq); -+ ret = smu_v11_0_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq, false); - if (ret) - goto forec_level_out; - break; -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c -index 16fcd9dcd202..16e7959879d4 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c -@@ -1763,7 +1763,8 @@ int smu_v11_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type c - int smu_v11_0_set_soft_freq_limited_range(struct smu_context *smu, - enum smu_clk_type clk_type, - uint32_t min, -- uint32_t max) -+ uint32_t max, -+ bool automatic) - { - int ret = 0, clk_id = 0; - uint32_t param; -@@ -1778,7 +1779,10 @@ int smu_v11_0_set_soft_freq_limited_range(struct smu_context *smu, - return clk_id; - - if (max > 0) { -- param = (uint32_t)((clk_id << 16) | (max & 0xffff)); -+ if (automatic) -+ param = (uint32_t)((clk_id << 16) | 0xffff); -+ else -+ param = (uint32_t)((clk_id << 16) | (max & 0xffff)); - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetSoftMaxByFreq, - param, NULL); - if (ret) -@@ -1786,7 +1790,10 @@ int smu_v11_0_set_soft_freq_limited_range(struct smu_context *smu, - } - - if (min > 0) { -- param = (uint32_t)((clk_id << 16) | (min & 0xffff)); -+ if (automatic) -+ param = (uint32_t)((clk_id << 16) | 0); -+ else -+ param = (uint32_t)((clk_id << 16) | (min & 0xffff)); - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetSoftMinByFreq, - param, NULL); - if (ret) -@@ -1854,6 +1861,7 @@ int smu_v11_0_set_performance_level(struct smu_context *smu, - uint32_t mclk_min = 0, mclk_max = 0; - uint32_t socclk_min = 0, socclk_max = 0; - int ret = 0; -+ bool auto_level = false; - - switch (level) { - case AMD_DPM_FORCED_LEVEL_HIGH: -@@ -1873,6 +1881,7 @@ int smu_v11_0_set_performance_level(struct smu_context *smu, - mclk_max = mem_table->max; - socclk_min = soc_table->min; - socclk_max = soc_table->max; -+ auto_level = true; - break; - case AMD_DPM_FORCED_LEVEL_PROFILE_STANDARD: - sclk_min = sclk_max = pstate_table->gfxclk_pstate.standard; -@@ -1905,13 +1914,15 @@ int smu_v11_0_set_performance_level(struct smu_context *smu, - if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(11, 0, 2)) { - mclk_min = mclk_max = 0; - socclk_min = socclk_max = 0; -+ auto_level = false; - } - - if (sclk_min && sclk_max) { - ret = smu_v11_0_set_soft_freq_limited_range(smu, - SMU_GFXCLK, - sclk_min, -- sclk_max); -+ sclk_max, -+ auto_level); - if (ret) - return ret; - } -@@ -1920,7 +1931,8 @@ int smu_v11_0_set_performance_level(struct smu_context *smu, - ret = smu_v11_0_set_soft_freq_limited_range(smu, - SMU_MCLK, - mclk_min, -- mclk_max); -+ mclk_max, -+ auto_level); - if (ret) - return ret; - } -@@ -1929,7 +1941,8 @@ int smu_v11_0_set_performance_level(struct smu_context *smu, - ret = smu_v11_0_set_soft_freq_limited_range(smu, - SMU_SOCCLK, - socclk_min, -- socclk_max); -+ socclk_max, -+ auto_level); - if (ret) - return ret; - } -diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c -index 22737b11b1bf..a333ab827f48 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c -@@ -1091,9 +1091,10 @@ static int vangogh_set_power_profile_mode(struct smu_context *smu, long *input, - } - - static int vangogh_set_soft_freq_limited_range(struct smu_context *smu, -- enum smu_clk_type clk_type, -- uint32_t min, -- uint32_t max) -+ enum smu_clk_type clk_type, -+ uint32_t min, -+ uint32_t max, -+ bool automatic) - { - int ret = 0; - -@@ -1299,7 +1300,7 @@ static int vangogh_force_dpm_limit_value(struct smu_context *smu, bool highest) - return ret; - - force_freq = highest ? max_freq : min_freq; -- ret = vangogh_set_soft_freq_limited_range(smu, clk_type, force_freq, force_freq); -+ ret = vangogh_set_soft_freq_limited_range(smu, clk_type, force_freq, force_freq, false); - if (ret) - return ret; - } -@@ -1335,7 +1336,7 @@ static int vangogh_unforce_dpm_levels(struct smu_context *smu) - if (ret) - return ret; - -- ret = vangogh_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq); -+ ret = vangogh_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq, false); - - if (ret) - return ret; -@@ -1354,7 +1355,7 @@ static int vangogh_set_peak_clock_by_device(struct smu_context *smu) - if (ret) - return ret; - -- ret = vangogh_set_soft_freq_limited_range(smu, SMU_FCLK, fclk_freq, fclk_freq); -+ ret = vangogh_set_soft_freq_limited_range(smu, SMU_FCLK, fclk_freq, fclk_freq, false); - if (ret) - return ret; - -@@ -1362,7 +1363,7 @@ static int vangogh_set_peak_clock_by_device(struct smu_context *smu) - if (ret) - return ret; - -- ret = vangogh_set_soft_freq_limited_range(smu, SMU_SOCCLK, socclk_freq, socclk_freq); -+ ret = vangogh_set_soft_freq_limited_range(smu, SMU_SOCCLK, socclk_freq, socclk_freq, false); - if (ret) - return ret; - -@@ -1370,7 +1371,7 @@ static int vangogh_set_peak_clock_by_device(struct smu_context *smu) - if (ret) - return ret; - -- ret = vangogh_set_soft_freq_limited_range(smu, SMU_VCLK, vclk_freq, vclk_freq); -+ ret = vangogh_set_soft_freq_limited_range(smu, SMU_VCLK, vclk_freq, vclk_freq, false); - if (ret) - return ret; - -@@ -1378,7 +1379,7 @@ static int vangogh_set_peak_clock_by_device(struct smu_context *smu) - if (ret) - return ret; - -- ret = vangogh_set_soft_freq_limited_range(smu, SMU_DCLK, dclk_freq, dclk_freq); -+ ret = vangogh_set_soft_freq_limited_range(smu, SMU_DCLK, dclk_freq, dclk_freq, false); - if (ret) - return ret; - -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c -index cc0504b063fa..0b210b1f2628 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c -@@ -707,7 +707,7 @@ static int renoir_force_dpm_limit_value(struct smu_context *smu, bool highest) - return ret; - - force_freq = highest ? 
max_freq : min_freq; -- ret = smu_v12_0_set_soft_freq_limited_range(smu, clk_type, force_freq, force_freq); -+ ret = smu_v12_0_set_soft_freq_limited_range(smu, clk_type, force_freq, force_freq, false); - if (ret) - return ret; - } -@@ -740,7 +740,7 @@ static int renoir_unforce_dpm_levels(struct smu_context *smu) { - if (ret) - return ret; - -- ret = smu_v12_0_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq); -+ ret = smu_v12_0_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq, false); - if (ret) - return ret; - } -@@ -911,7 +911,7 @@ static int renoir_set_peak_clock_by_device(struct smu_context *smu) - if (ret) - return ret; - -- ret = smu_v12_0_set_soft_freq_limited_range(smu, SMU_SCLK, sclk_freq, sclk_freq); -+ ret = smu_v12_0_set_soft_freq_limited_range(smu, SMU_SCLK, sclk_freq, sclk_freq, false); - if (ret) - return ret; - -@@ -919,7 +919,7 @@ static int renoir_set_peak_clock_by_device(struct smu_context *smu) - if (ret) - return ret; - -- ret = smu_v12_0_set_soft_freq_limited_range(smu, SMU_UCLK, uclk_freq, uclk_freq); -+ ret = smu_v12_0_set_soft_freq_limited_range(smu, SMU_UCLK, uclk_freq, uclk_freq, false); - if (ret) - return ret; - -@@ -961,13 +961,13 @@ static int renior_set_dpm_profile_freq(struct smu_context *smu, - } - - if (sclk) -- ret = smu_v12_0_set_soft_freq_limited_range(smu, SMU_SCLK, sclk, sclk); -+ ret = smu_v12_0_set_soft_freq_limited_range(smu, SMU_SCLK, sclk, sclk, false); - - if (socclk) -- ret = smu_v12_0_set_soft_freq_limited_range(smu, SMU_SOCCLK, socclk, socclk); -+ ret = smu_v12_0_set_soft_freq_limited_range(smu, SMU_SOCCLK, socclk, socclk, false); - - if (fclk) -- ret = smu_v12_0_set_soft_freq_limited_range(smu, SMU_FCLK, fclk, fclk); -+ ret = smu_v12_0_set_soft_freq_limited_range(smu, SMU_FCLK, fclk, fclk, false); - - return ret; - } -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c -index ed15f5a0fd11..3d3cd546f0ad 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c -@@ -211,7 +211,7 @@ int smu_v12_0_mode2_reset(struct smu_context *smu) - } - - int smu_v12_0_set_soft_freq_limited_range(struct smu_context *smu, enum smu_clk_type clk_type, -- uint32_t min, uint32_t max) -+ uint32_t min, uint32_t max, bool automatic) - { - int ret = 0; - -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c -index 2c35eb31475a..f6b029354327 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c -@@ -1297,9 +1297,10 @@ static int aldebaran_set_performance_level(struct smu_context *smu, - } - - static int aldebaran_set_soft_freq_limited_range(struct smu_context *smu, -- enum smu_clk_type clk_type, -- uint32_t min, -- uint32_t max) -+ enum smu_clk_type clk_type, -+ uint32_t min, -+ uint32_t max, -+ bool automatic) - { - struct smu_dpm_context *smu_dpm = &(smu->smu_dpm); - struct smu_13_0_dpm_context *dpm_context = smu_dpm->dpm_context; -@@ -1328,7 +1329,7 @@ static int aldebaran_set_soft_freq_limited_range(struct smu_context *smu, - return 0; - - ret = smu_v13_0_set_soft_freq_limited_range(smu, SMU_GFXCLK, -- min, max); -+ min, max, false); - if (!ret) { - pstate_table->gfxclk_pstate.curr.min = min; - pstate_table->gfxclk_pstate.curr.max = max; -@@ -1348,7 +1349,7 @@ static int aldebaran_set_soft_freq_limited_range(struct smu_context *smu, - /* Restore default min/max clocks and enable determinism */ - 
min_clk = dpm_context->dpm_tables.gfx_table.min; - max_clk = dpm_context->dpm_tables.gfx_table.max; -- ret = smu_v13_0_set_soft_freq_limited_range(smu, SMU_GFXCLK, min_clk, max_clk); -+ ret = smu_v13_0_set_soft_freq_limited_range(smu, SMU_GFXCLK, min_clk, max_clk, false); - if (!ret) { - usleep_range(500, 1000); - ret = smu_cmn_send_smc_msg_with_param(smu, -@@ -1422,7 +1423,7 @@ static int aldebaran_usr_edit_dpm_table(struct smu_context *smu, enum PP_OD_DPM_ - min_clk = dpm_context->dpm_tables.gfx_table.min; - max_clk = dpm_context->dpm_tables.gfx_table.max; - -- return aldebaran_set_soft_freq_limited_range(smu, SMU_GFXCLK, min_clk, max_clk); -+ return aldebaran_set_soft_freq_limited_range(smu, SMU_GFXCLK, min_clk, max_clk, false); - } - break; - case PP_OD_COMMIT_DPM_TABLE: -@@ -1441,7 +1442,7 @@ static int aldebaran_usr_edit_dpm_table(struct smu_context *smu, enum PP_OD_DPM_ - min_clk = pstate_table->gfxclk_pstate.custom.min; - max_clk = pstate_table->gfxclk_pstate.custom.max; - -- return aldebaran_set_soft_freq_limited_range(smu, SMU_GFXCLK, min_clk, max_clk); -+ return aldebaran_set_soft_freq_limited_range(smu, SMU_GFXCLK, min_clk, max_clk, false); - } - break; - default: -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c -index e17466cc1952..6cfd66363915 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c -@@ -1608,7 +1608,8 @@ int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type c - int smu_v13_0_set_soft_freq_limited_range(struct smu_context *smu, - enum smu_clk_type clk_type, - uint32_t min, -- uint32_t max) -+ uint32_t max, -+ bool automatic) - { - int ret = 0, clk_id = 0; - uint32_t param; -@@ -1623,7 +1624,10 @@ int smu_v13_0_set_soft_freq_limited_range(struct smu_context *smu, - return clk_id; - - if (max > 0) { -- param = (uint32_t)((clk_id << 16) | (max & 0xffff)); -+ if (automatic) -+ param = (uint32_t)((clk_id << 16) | 0xffff); -+ else -+ param = (uint32_t)((clk_id << 16) | (max & 0xffff)); - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetSoftMaxByFreq, - param, NULL); - if (ret) -@@ -1631,7 +1635,10 @@ int smu_v13_0_set_soft_freq_limited_range(struct smu_context *smu, - } - - if (min > 0) { -- param = (uint32_t)((clk_id << 16) | (min & 0xffff)); -+ if (automatic) -+ param = (uint32_t)((clk_id << 16) | 0); -+ else -+ param = (uint32_t)((clk_id << 16) | (min & 0xffff)); - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetSoftMinByFreq, - param, NULL); - if (ret) -@@ -1708,6 +1715,7 @@ int smu_v13_0_set_performance_level(struct smu_context *smu, - uint32_t dclk_min = 0, dclk_max = 0; - uint32_t fclk_min = 0, fclk_max = 0; - int ret = 0, i; -+ bool auto_level = false; - - switch (level) { - case AMD_DPM_FORCED_LEVEL_HIGH: -@@ -1739,6 +1747,7 @@ int smu_v13_0_set_performance_level(struct smu_context *smu, - dclk_max = dclk_table->max; - fclk_min = fclk_table->min; - fclk_max = fclk_table->max; -+ auto_level = true; - break; - case AMD_DPM_FORCED_LEVEL_PROFILE_STANDARD: - sclk_min = sclk_max = pstate_table->gfxclk_pstate.standard; -@@ -1780,13 +1789,15 @@ int smu_v13_0_set_performance_level(struct smu_context *smu, - vclk_min = vclk_max = 0; - dclk_min = dclk_max = 0; - fclk_min = fclk_max = 0; -+ auto_level = false; - } - - if (sclk_min && sclk_max) { - ret = smu_v13_0_set_soft_freq_limited_range(smu, - SMU_GFXCLK, - sclk_min, -- sclk_max); -+ sclk_max, -+ auto_level); - if (ret) - return ret; - -@@ -1798,7 +1809,8 @@ int 
smu_v13_0_set_performance_level(struct smu_context *smu, - ret = smu_v13_0_set_soft_freq_limited_range(smu, - SMU_MCLK, - mclk_min, -- mclk_max); -+ mclk_max, -+ auto_level); - if (ret) - return ret; - -@@ -1810,7 +1822,8 @@ int smu_v13_0_set_performance_level(struct smu_context *smu, - ret = smu_v13_0_set_soft_freq_limited_range(smu, - SMU_SOCCLK, - socclk_min, -- socclk_max); -+ socclk_max, -+ auto_level); - if (ret) - return ret; - -@@ -1825,7 +1838,8 @@ int smu_v13_0_set_performance_level(struct smu_context *smu, - ret = smu_v13_0_set_soft_freq_limited_range(smu, - i ? SMU_VCLK1 : SMU_VCLK, - vclk_min, -- vclk_max); -+ vclk_max, -+ auto_level); - if (ret) - return ret; - } -@@ -1840,7 +1854,8 @@ int smu_v13_0_set_performance_level(struct smu_context *smu, - ret = smu_v13_0_set_soft_freq_limited_range(smu, - i ? SMU_DCLK1 : SMU_DCLK, - dclk_min, -- dclk_max); -+ dclk_max, -+ auto_level); - if (ret) - return ret; - } -@@ -1852,7 +1867,8 @@ int smu_v13_0_set_performance_level(struct smu_context *smu, - ret = smu_v13_0_set_soft_freq_limited_range(smu, - SMU_FCLK, - fclk_min, -- fclk_max); -+ fclk_max, -+ auto_level); - if (ret) - return ret; - -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c -index cb923e33fd6f..f69fe75352de 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c -@@ -1975,7 +1975,8 @@ static int smu_v13_0_0_force_clk_levels(struct smu_context *smu, - ret = smu_v13_0_set_soft_freq_limited_range(smu, - clk_type, - min_freq, -- max_freq); -+ max_freq, -+ false); - break; - case SMU_DCEFCLK: - case SMU_PCIE: -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_5_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_5_ppt.c -index 9c2c43bfed0b..a71b7c0803f1 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_5_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_5_ppt.c -@@ -811,9 +811,10 @@ static int smu_v13_0_5_get_dpm_ultimate_freq(struct smu_context *smu, - } - - static int smu_v13_0_5_set_soft_freq_limited_range(struct smu_context *smu, -- enum smu_clk_type clk_type, -- uint32_t min, -- uint32_t max) -+ enum smu_clk_type clk_type, -+ uint32_t min, -+ uint32_t max, -+ bool automatic) - { - enum smu_message_type msg_set_min, msg_set_max; - uint32_t min_clk = min; -@@ -950,7 +951,7 @@ static int smu_v13_0_5_force_clk_levels(struct smu_context *smu, - if (ret) - goto force_level_out; - -- ret = smu_v13_0_5_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq); -+ ret = smu_v13_0_5_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq, false); - if (ret) - goto force_level_out; - break; -@@ -1046,9 +1047,10 @@ static int smu_v13_0_5_set_performance_level(struct smu_context *smu, - - if (sclk_min && sclk_max) { - ret = smu_v13_0_5_set_soft_freq_limited_range(smu, -- SMU_SCLK, -- sclk_min, -- sclk_max); -+ SMU_SCLK, -+ sclk_min, -+ sclk_max, -+ false); - if (ret) - return ret; - -@@ -1060,7 +1062,8 @@ static int smu_v13_0_5_set_performance_level(struct smu_context *smu, - ret = smu_v13_0_5_set_soft_freq_limited_range(smu, - SMU_VCLK, - vclk_min, -- vclk_max); -+ vclk_max, -+ false); - if (ret) - return ret; - } -@@ -1069,7 +1072,8 @@ static int smu_v13_0_5_set_performance_level(struct smu_context *smu, - ret = smu_v13_0_5_set_soft_freq_limited_range(smu, - SMU_DCLK, - dclk_min, -- dclk_max); -+ dclk_max, -+ false); - if (ret) - return ret; - } -diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c -index 9974c9f8135e..8d2ccd8a8b0c 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c -@@ -1739,7 +1739,7 @@ static int smu_v13_0_6_set_performance_level(struct smu_context *smu, - if (uclk_table->max != pstate_table->uclk_pstate.curr.max) { - /* Min UCLK is not expected to be changed */ - ret = smu_v13_0_set_soft_freq_limited_range( -- smu, SMU_UCLK, 0, uclk_table->max); -+ smu, SMU_UCLK, 0, uclk_table->max, false); - if (ret) - return ret; - pstate_table->uclk_pstate.curr.max = uclk_table->max; -@@ -1758,7 +1758,8 @@ static int smu_v13_0_6_set_performance_level(struct smu_context *smu, - - static int smu_v13_0_6_set_soft_freq_limited_range(struct smu_context *smu, - enum smu_clk_type clk_type, -- uint32_t min, uint32_t max) -+ uint32_t min, uint32_t max, -+ bool automatic) - { - struct smu_dpm_context *smu_dpm = &(smu->smu_dpm); - struct smu_13_0_dpm_context *dpm_context = smu_dpm->dpm_context; -@@ -1806,7 +1807,7 @@ static int smu_v13_0_6_set_soft_freq_limited_range(struct smu_context *smu, - return -EOPNOTSUPP; - /* Only max clock limiting is allowed for UCLK */ - ret = smu_v13_0_set_soft_freq_limited_range( -- smu, SMU_UCLK, 0, max); -+ smu, SMU_UCLK, 0, max, false); - if (!ret) - pstate_table->uclk_pstate.curr.max = max; - } -@@ -1946,7 +1947,7 @@ static int smu_v13_0_6_usr_edit_dpm_table(struct smu_context *smu, - max_clk = dpm_context->dpm_tables.gfx_table.max; - - ret = smu_v13_0_6_set_soft_freq_limited_range( -- smu, SMU_GFXCLK, min_clk, max_clk); -+ smu, SMU_GFXCLK, min_clk, max_clk, false); - - if (ret) - return ret; -@@ -1954,7 +1955,7 @@ static int smu_v13_0_6_usr_edit_dpm_table(struct smu_context *smu, - min_clk = dpm_context->dpm_tables.uclk_table.min; - max_clk = dpm_context->dpm_tables.uclk_table.max; - ret = smu_v13_0_6_set_soft_freq_limited_range( -- smu, SMU_UCLK, min_clk, max_clk); -+ smu, SMU_UCLK, min_clk, max_clk, false); - if (ret) - return ret; - pstate_table->uclk_pstate.custom.max = 0; -@@ -1978,7 +1979,7 @@ static int smu_v13_0_6_usr_edit_dpm_table(struct smu_context *smu, - max_clk = pstate_table->gfxclk_pstate.custom.max; - - ret = smu_v13_0_6_set_soft_freq_limited_range( -- smu, SMU_GFXCLK, min_clk, max_clk); -+ smu, SMU_GFXCLK, min_clk, max_clk, false); - - if (ret) - return ret; -@@ -1989,7 +1990,7 @@ static int smu_v13_0_6_usr_edit_dpm_table(struct smu_context *smu, - min_clk = pstate_table->uclk_pstate.curr.min; - max_clk = pstate_table->uclk_pstate.custom.max; - return smu_v13_0_6_set_soft_freq_limited_range( -- smu, SMU_UCLK, min_clk, max_clk); -+ smu, SMU_UCLK, min_clk, max_clk, false); - } - break; - default: -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c -index b891a5e0a396..2077506ef336 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c -@@ -1964,7 +1964,8 @@ static int smu_v13_0_7_force_clk_levels(struct smu_context *smu, - ret = smu_v13_0_set_soft_freq_limited_range(smu, - clk_type, - min_freq, -- max_freq); -+ max_freq, -+ false); - break; - case SMU_DCEFCLK: - case SMU_PCIE: -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c -index 260c339f89c5..71d58c8c8cc0 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c -+++ 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c -@@ -945,9 +945,10 @@ static int yellow_carp_get_dpm_ultimate_freq(struct smu_context *smu, - } - - static int yellow_carp_set_soft_freq_limited_range(struct smu_context *smu, -- enum smu_clk_type clk_type, -- uint32_t min, -- uint32_t max) -+ enum smu_clk_type clk_type, -+ uint32_t min, -+ uint32_t max, -+ bool automatic) - { - enum smu_message_type msg_set_min, msg_set_max; - uint32_t min_clk = min; -@@ -1134,7 +1135,7 @@ static int yellow_carp_force_clk_levels(struct smu_context *smu, - if (ret) - goto force_level_out; - -- ret = yellow_carp_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq); -+ ret = yellow_carp_set_soft_freq_limited_range(smu, clk_type, min_freq, max_freq, false); - if (ret) - goto force_level_out; - break; -@@ -1254,9 +1255,10 @@ static int yellow_carp_set_performance_level(struct smu_context *smu, - - if (sclk_min && sclk_max) { - ret = yellow_carp_set_soft_freq_limited_range(smu, -- SMU_SCLK, -- sclk_min, -- sclk_max); -+ SMU_SCLK, -+ sclk_min, -+ sclk_max, -+ false); - if (ret) - return ret; - -@@ -1266,18 +1268,20 @@ static int yellow_carp_set_performance_level(struct smu_context *smu, - - if (fclk_min && fclk_max) { - ret = yellow_carp_set_soft_freq_limited_range(smu, -- SMU_FCLK, -- fclk_min, -- fclk_max); -+ SMU_FCLK, -+ fclk_min, -+ fclk_max, -+ false); - if (ret) - return ret; - } - - if (socclk_min && socclk_max) { - ret = yellow_carp_set_soft_freq_limited_range(smu, -- SMU_SOCCLK, -- socclk_min, -- socclk_max); -+ SMU_SOCCLK, -+ socclk_min, -+ socclk_max, -+ false); - if (ret) - return ret; - } -@@ -1286,7 +1290,8 @@ static int yellow_carp_set_performance_level(struct smu_context *smu, - ret = yellow_carp_set_soft_freq_limited_range(smu, - SMU_VCLK, - vclk_min, -- vclk_max); -+ vclk_max, -+ false); - if (ret) - return ret; - } -@@ -1295,7 +1300,8 @@ static int yellow_carp_set_performance_level(struct smu_context *smu, - ret = yellow_carp_set_soft_freq_limited_range(smu, - SMU_DCLK, - dclk_min, -- dclk_max); -+ dclk_max, -+ false); - if (ret) - return ret; - } -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c -index 09973615f210..a7a6c4eea153 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c -@@ -1093,7 +1093,8 @@ int smu_v14_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type c - int smu_v14_0_set_soft_freq_limited_range(struct smu_context *smu, - enum smu_clk_type clk_type, - uint32_t min, -- uint32_t max) -+ uint32_t max, -+ bool automatic) - { - int ret = 0, clk_id = 0; - uint32_t param; -@@ -1108,7 +1109,10 @@ int smu_v14_0_set_soft_freq_limited_range(struct smu_context *smu, - return clk_id; - - if (max > 0) { -- param = (uint32_t)((clk_id << 16) | (max & 0xffff)); -+ if (automatic) -+ param = (uint32_t)((clk_id << 16) | 0xffff); -+ else -+ param = (uint32_t)((clk_id << 16) | (max & 0xffff)); - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetSoftMaxByFreq, - param, NULL); - if (ret) -@@ -1116,7 +1120,10 @@ int smu_v14_0_set_soft_freq_limited_range(struct smu_context *smu, - } - - if (min > 0) { -- param = (uint32_t)((clk_id << 16) | (min & 0xffff)); -+ if (automatic) -+ param = (uint32_t)((clk_id << 16) | 0); -+ else -+ param = (uint32_t)((clk_id << 16) | (min & 0xffff)); - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetSoftMinByFreq, - param, NULL); - if (ret) -@@ -1193,6 +1200,7 @@ int smu_v14_0_set_performance_level(struct 
smu_context *smu, - uint32_t dclk_min = 0, dclk_max = 0; - uint32_t fclk_min = 0, fclk_max = 0; - int ret = 0, i; -+ bool auto_level = false; - - switch (level) { - case AMD_DPM_FORCED_LEVEL_HIGH: -@@ -1224,6 +1232,7 @@ int smu_v14_0_set_performance_level(struct smu_context *smu, - dclk_max = dclk_table->max; - fclk_min = fclk_table->min; - fclk_max = fclk_table->max; -+ auto_level = true; - break; - case AMD_DPM_FORCED_LEVEL_PROFILE_STANDARD: - sclk_min = sclk_max = pstate_table->gfxclk_pstate.standard; -@@ -1259,7 +1268,8 @@ int smu_v14_0_set_performance_level(struct smu_context *smu, - ret = smu_v14_0_set_soft_freq_limited_range(smu, - SMU_GFXCLK, - sclk_min, -- sclk_max); -+ sclk_max, -+ auto_level); - if (ret) - return ret; - -@@ -1271,7 +1281,8 @@ int smu_v14_0_set_performance_level(struct smu_context *smu, - ret = smu_v14_0_set_soft_freq_limited_range(smu, - SMU_MCLK, - mclk_min, -- mclk_max); -+ mclk_max, -+ auto_level); - if (ret) - return ret; - -@@ -1283,7 +1294,8 @@ int smu_v14_0_set_performance_level(struct smu_context *smu, - ret = smu_v14_0_set_soft_freq_limited_range(smu, - SMU_SOCCLK, - socclk_min, -- socclk_max); -+ socclk_max, -+ auto_level); - if (ret) - return ret; - -@@ -1298,7 +1310,8 @@ int smu_v14_0_set_performance_level(struct smu_context *smu, - ret = smu_v14_0_set_soft_freq_limited_range(smu, - i ? SMU_VCLK1 : SMU_VCLK, - vclk_min, -- vclk_max); -+ vclk_max, -+ auto_level); - if (ret) - return ret; - } -@@ -1313,7 +1326,8 @@ int smu_v14_0_set_performance_level(struct smu_context *smu, - ret = smu_v14_0_set_soft_freq_limited_range(smu, - i ? SMU_DCLK1 : SMU_DCLK, - dclk_min, -- dclk_max); -+ dclk_max, -+ auto_level); - if (ret) - return ret; - } -@@ -1325,7 +1339,8 @@ int smu_v14_0_set_performance_level(struct smu_context *smu, - ret = smu_v14_0_set_soft_freq_limited_range(smu, - SMU_FCLK, - fclk_min, -- fclk_max); -+ fclk_max, -+ auto_level); - if (ret) - return ret; - -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c -index ba17d01e6439..6c0f3505bb55 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c -@@ -1245,7 +1245,8 @@ static int smu_v14_0_2_force_clk_levels(struct smu_context *smu, - ret = smu_v14_0_set_soft_freq_limited_range(smu, - clk_type, - min_freq, -- max_freq); -+ max_freq, -+ false); - break; - case SMU_DCEFCLK: - case SMU_PCIE: diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c -index f68a41eeb1fa..6cd386d0fccb 100644 +index 855beafb76ff..ad78059ee954 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -94,6 +94,8 @@ static int oui(u8 first, u8 second, u8 third) @@ -13060,7 +13021,7 @@ index f68a41eeb1fa..6cd386d0fccb 100644 /* Sony PVM-2541A does up to 12 bpc, but only reports max 8 bpc */ EDID_QUIRK('S', 'N', 'Y', 0x2541, EDID_QUIRK_FORCE_12BPC), -@@ -6757,7 +6765,37 @@ static void update_display_info(struct drm_connector *connector, +@@ -6753,7 +6761,37 @@ static void update_display_info(struct drm_connector *connector, drm_edid_to_eld(connector, drm_edid); } @@ -13099,7 +13060,7 @@ index f68a41eeb1fa..6cd386d0fccb 100644 struct displayid_detailed_timings_1 *timings, bool type_7) { -@@ -6776,7 +6814,7 @@ static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *d +@@ -6772,7 +6810,7 @@ static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *d bool hsync_positive = (timings->hsync[1] >> 7) & 0x1; bool 
vsync_positive = (timings->vsync[1] >> 7) & 0x1; @@ -13108,7 +13069,7 @@ index f68a41eeb1fa..6cd386d0fccb 100644 if (!mode) return NULL; -@@ -6799,6 +6837,9 @@ static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *d +@@ -6795,6 +6833,9 @@ static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *d if (timings->flags & 0x80) mode->type |= DRM_MODE_TYPE_PREFERRED; @@ -13118,7 +13079,7 @@ index f68a41eeb1fa..6cd386d0fccb 100644 drm_mode_set_name(mode); return mode; -@@ -6821,7 +6862,7 @@ static int add_displayid_detailed_1_modes(struct drm_connector *connector, +@@ -6817,7 +6858,7 @@ static int add_displayid_detailed_1_modes(struct drm_connector *connector, for (i = 0; i < num_timings; i++) { struct displayid_detailed_timings_1 *timings = &det->timings[i]; @@ -13127,672 +13088,206 @@ index f68a41eeb1fa..6cd386d0fccb 100644 if (!newmode) continue; -diff --git a/drivers/net/wireless/realtek/rtw89/pci.c b/drivers/net/wireless/realtek/rtw89/pci.c -index 02afeb3acce4..5aef7fa37878 100644 ---- a/drivers/net/wireless/realtek/rtw89/pci.c -+++ b/drivers/net/wireless/realtek/rtw89/pci.c -@@ -3026,24 +3026,54 @@ static void rtw89_pci_declaim_device(struct rtw89_dev *rtwdev, - pci_disable_device(pdev); - } +diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c +index 62ba01525479..376047beea3d 100644 +--- a/drivers/misc/lkdtm/bugs.c ++++ b/drivers/misc/lkdtm/bugs.c +@@ -445,7 +445,7 @@ static void lkdtm_FAM_BOUNDS(void) --static void rtw89_pci_cfg_dac(struct rtw89_dev *rtwdev) -+static bool rtw89_pci_chip_is_manual_dac(struct rtw89_dev *rtwdev) - { -- struct rtw89_pci *rtwpci = (struct rtw89_pci *)rtwdev->priv; - const struct rtw89_chip_info *chip = rtwdev->chip; + pr_err("FAIL: survived access of invalid flexible array member index!\n"); -- if (!rtwpci->enable_dac) -- return; +- if (!__has_attribute(__counted_by__)) ++ if (!IS_ENABLED(CONFIG_CC_HAS_COUNTED_BY)) + pr_warn("This is expected since this %s was built with a compiler that does not support __counted_by\n", + lkdtm_kernel_info); + else if (IS_ENABLED(CONFIG_UBSAN_BOUNDS)) +diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h +index 32284cd26d52..c16d4199bf92 100644 +--- a/include/linux/compiler_attributes.h ++++ b/include/linux/compiler_attributes.h +@@ -94,19 +94,6 @@ + # define __copy(symbol) + #endif + +-/* +- * Optional: only supported since gcc >= 15 +- * Optional: only supported since clang >= 18 +- * +- * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 +- * clang: https://github.com/llvm/llvm-project/pull/76348 +- */ +-#if __has_attribute(__counted_by__) +-# define __counted_by(member) __attribute__((__counted_by__(member))) +-#else +-# define __counted_by(member) +-#endif - - switch (chip->chip_id) { - case RTL8852A: - case RTL8852B: - case RTL8851B: - case RTL8852BT: -- break; -+ return true; - default: -- return; -+ return false; -+ } -+} -+ -+static bool rtw89_pci_is_dac_compatible_bridge(struct rtw89_dev *rtwdev) -+{ -+ struct rtw89_pci *rtwpci = (struct rtw89_pci *)rtwdev->priv; -+ struct pci_dev *bridge = pci_upstream_bridge(rtwpci->pdev); -+ -+ if (!rtw89_pci_chip_is_manual_dac(rtwdev)) -+ return true; -+ -+ if (!bridge) -+ return false; -+ -+ switch (bridge->vendor) { -+ case PCI_VENDOR_ID_INTEL: -+ return true; -+ case PCI_VENDOR_ID_ASMEDIA: -+ if (bridge->device == 0x2806) -+ return true; -+ break; - } - -+ return false; -+} -+ -+static void rtw89_pci_cfg_dac(struct rtw89_dev *rtwdev) -+{ -+ struct rtw89_pci *rtwpci = (struct 
rtw89_pci *)rtwdev->priv; -+ -+ if (!rtwpci->enable_dac) -+ return; -+ -+ if (!rtw89_pci_chip_is_manual_dac(rtwdev)) -+ return; -+ - rtw89_pci_config_byte_set(rtwdev, RTW89_PCIE_L1_CTRL, RTW89_PCIE_BIT_EN_64BITS); - } - -@@ -3061,6 +3091,9 @@ static int rtw89_pci_setup_mapping(struct rtw89_dev *rtwdev, - goto err; - } - -+ if (!rtw89_pci_is_dac_compatible_bridge(rtwdev)) -+ goto no_dac; -+ - ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(36)); - if (!ret) { - rtwpci->enable_dac = true; -@@ -3073,6 +3106,7 @@ static int rtw89_pci_setup_mapping(struct rtw89_dev *rtwdev, - goto err_release_regions; - } - } -+no_dac: - - resource_len = pci_resource_len(pdev, bar_id); - rtwpci->mmap = pci_iomap(pdev, bar_id, resource_len); -diff --git a/drivers/platform/x86/dell/dell-wmi-base.c b/drivers/platform/x86/dell/dell-wmi-base.c -index 502783a7adb1..24fd7ffadda9 100644 ---- a/drivers/platform/x86/dell/dell-wmi-base.c -+++ b/drivers/platform/x86/dell/dell-wmi-base.c -@@ -264,6 +264,15 @@ static const struct key_entry dell_wmi_keymap_type_0010[] = { - /*Speaker Mute*/ - { KE_KEY, 0x109, { KEY_MUTE} }, - -+ /* S2Idle screen off */ -+ { KE_IGNORE, 0x120, { KEY_RESERVED }}, -+ -+ /* Leaving S4 or S2Idle suspend */ -+ { KE_IGNORE, 0x130, { KEY_RESERVED }}, -+ -+ /* Entering S2Idle suspend */ -+ { KE_IGNORE, 0x140, { KEY_RESERVED }}, -+ - /* Mic mute */ - { KE_KEY, 0x150, { KEY_MICMUTE } }, - -diff --git a/mm/mmap.c b/mm/mmap.c -index 18fddcce03b8..d84d6dd8771c 100644 ---- a/mm/mmap.c -+++ b/mm/mmap.c -@@ -1952,10 +1952,6 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, - - if (get_area) { - addr = get_area(file, addr, len, pgoff, flags); -- } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { -- /* Ensures that larger anonymous mappings are THP aligned. 
*/ -- addr = thp_get_unmapped_area_vmflags(file, addr, len, -- pgoff, flags, vm_flags); - } else { - addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, - pgoff, flags, vm_flags); -diff --git a/mm/shrinker.c b/mm/shrinker.c -index dc5d2a6fcfc4..e4b795ee6d2e 100644 ---- a/mm/shrinker.c -+++ b/mm/shrinker.c -@@ -87,8 +87,10 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) - if (!info) - goto err; - info->map_nr_max = shrinker_nr_max; -- if (shrinker_unit_alloc(info, NULL, nid)) -+ if (shrinker_unit_alloc(info, NULL, nid)) { -+ kvfree(info); - goto err; -+ } - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); - } - mutex_unlock(&shrinker_mutex); -diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c -index d80abd6ccaf8..6dcf4bc7e30b 100644 ---- a/net/netfilter/xt_NFLOG.c -+++ b/net/netfilter/xt_NFLOG.c -@@ -79,7 +79,7 @@ static struct xt_target nflog_tg_reg[] __read_mostly = { - { - .name = "NFLOG", - .revision = 0, -- .family = NFPROTO_IPV4, -+ .family = NFPROTO_IPV6, - .checkentry = nflog_tg_check, - .destroy = nflog_tg_destroy, - .target = nflog_tg, -diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c -index f3fa4f11348c..a642ff09fc8e 100644 ---- a/net/netfilter/xt_TRACE.c -+++ b/net/netfilter/xt_TRACE.c -@@ -49,6 +49,7 @@ static struct xt_target trace_tg_reg[] __read_mostly = { - .target = trace_tg, - .checkentry = trace_tg_check, - .destroy = trace_tg_destroy, -+ .me = THIS_MODULE, - }, + /* + * Optional: not supported by gcc + * Optional: only supported since clang >= 14.0 +diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h +index 1a957ea2f4fe..639be0f30b45 100644 +--- a/include/linux/compiler_types.h ++++ b/include/linux/compiler_types.h +@@ -323,6 +323,25 @@ struct ftrace_likely_data { + #define __no_sanitize_or_inline __always_inline #endif - }; -diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c -index f76fe04fc9a4..65b965ca40ea 100644 ---- a/net/netfilter/xt_mark.c -+++ b/net/netfilter/xt_mark.c -@@ -62,7 +62,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { - { - .name = "MARK", - .revision = 2, -- .family = NFPROTO_IPV4, -+ .family = NFPROTO_IPV6, - .target = mark_tg, - .targetsize = sizeof(struct xt_mark_tginfo2), - .me = THIS_MODULE, --- -2.47.0.rc0 - -From 13dcfcc62c4c4467d7f8c9d1436097cdd70c0cec Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:52:43 +0800 -Subject: [PATCH 07/13] intel-pstate - -Signed-off-by: Eric Naim ---- - arch/x86/include/asm/topology.h | 13 ++ - arch/x86/kernel/cpu/aperfmperf.c | 89 +++++++++++- - drivers/cpufreq/intel_pstate.c | 232 ++++++++++++++++++++++++++++++- - 3 files changed, 328 insertions(+), 6 deletions(-) - -diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h -index 94d9832a5bc8..9f9376db64e3 100644 ---- a/arch/x86/include/asm/topology.h -+++ b/arch/x86/include/asm/topology.h -@@ -291,9 +291,22 @@ static inline long arch_scale_freq_capacity(int cpu) - } - #define arch_scale_freq_capacity arch_scale_freq_capacity -+bool arch_enable_hybrid_capacity_scale(void); -+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, -+ unsigned long cap_freq, unsigned long base_freq); -+ -+unsigned long arch_scale_cpu_capacity(int cpu); -+#define arch_scale_cpu_capacity arch_scale_cpu_capacity -+ - extern void arch_set_max_freq_ratio(bool turbo_disabled); - extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); - #else -+static inline bool 
arch_enable_hybrid_capacity_scale(void) { return false; } -+static inline void arch_set_cpu_capacity(int cpu, unsigned long cap, -+ unsigned long max_cap, -+ unsigned long cap_freq, -+ unsigned long base_freq) { } -+ - static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } - static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } - #endif -diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c -index 0b69bfbf345d..ec07678c641b 100644 ---- a/arch/x86/kernel/cpu/aperfmperf.c -+++ b/arch/x86/kernel/cpu/aperfmperf.c -@@ -349,9 +349,89 @@ static DECLARE_WORK(disable_freq_invariance_work, - DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; - EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); - -+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key); -+ -+struct arch_hybrid_cpu_scale { -+ unsigned long capacity; -+ unsigned long freq_ratio; -+}; -+ -+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale; -+ -+/** -+ * arch_enable_hybrid_capacity_scale - Enable hybrid CPU capacity scaling -+ * -+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling, -+ * initialize it and set the static key controlling its code paths. -+ * -+ * Must be called before arch_set_cpu_capacity(). -+ */ -+bool arch_enable_hybrid_capacity_scale(void) -+{ -+ int cpu; -+ -+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) { -+ WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled"); -+ return true; -+ } -+ -+ arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale); -+ if (!arch_cpu_scale) -+ return false; -+ -+ for_each_possible_cpu(cpu) { -+ per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE; -+ per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio; -+ } -+ -+ static_branch_enable(&arch_hybrid_cap_scale_key); -+ -+ pr_info("Hybrid CPU capacity scaling enabled\n"); -+ -+ return true; -+} -+ -+/** -+ * arch_set_cpu_capacity - Set scale-invariance parameters for a CPU -+ * @cpu: Target CPU. -+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap. -+ * @max_cap: System-wide maximum CPU capacity. -+ * @cap_freq: Frequency of @cpu corresponding to @cap. -+ * @base_freq: Frequency of @cpu at which MPERF counts. -+ * -+ * The units in which @cap and @max_cap are expressed do not matter, so long -+ * as they are consistent, because the former is effectively divided by the -+ * latter. Analogously for @cap_freq and @base_freq. -+ * -+ * After calling this function for all CPUs, call arch_rebuild_sched_domains() -+ * to let the scheduler know that capacity-aware scheduling can be used going -+ * forward. 
-+ */ -+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, -+ unsigned long cap_freq, unsigned long base_freq) -+{ -+ if (static_branch_likely(&arch_hybrid_cap_scale_key)) { -+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity, -+ div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap)); -+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio, -+ div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq)); -+ } else { -+ WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled"); -+ } -+} -+ -+unsigned long arch_scale_cpu_capacity(int cpu) -+{ -+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) -+ return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity); -+ -+ return SCHED_CAPACITY_SCALE; -+} -+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity); -+ - static void scale_freq_tick(u64 acnt, u64 mcnt) - { -- u64 freq_scale; -+ u64 freq_scale, freq_ratio; - - if (!arch_scale_freq_invariant()) - return; -@@ -359,7 +439,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt) - if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) - goto error; - -- if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) -+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) -+ freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio); -+ else -+ freq_ratio = arch_max_freq_ratio; -+ -+ if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt) - goto error; - - freq_scale = div64_u64(acnt, mcnt); -diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c -index 348a330678bd..c11be253bfa3 100644 ---- a/drivers/cpufreq/intel_pstate.c -+++ b/drivers/cpufreq/intel_pstate.c -@@ -16,6 +16,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -215,6 +216,7 @@ struct global_params { - * @hwp_req_cached: Cached value of the last HWP Request MSR - * @hwp_cap_cached: Cached value of the last HWP Capabilities MSR - * @last_io_update: Last time when IO wake flag was set -+ * @capacity_perf: Highest perf used for scale invariance - * @sched_flags: Store scheduler flags for possible cross CPU update - * @hwp_boost_min: Last HWP boosted min performance - * @suspended: Whether or not the driver has been suspended. -@@ -253,6 +255,7 @@ struct cpudata { - u64 hwp_req_cached; - u64 hwp_cap_cached; - u64 last_io_update; -+ unsigned int capacity_perf; - unsigned int sched_flags; - u32 hwp_boost_min; - bool suspended; -@@ -295,6 +298,7 @@ static int hwp_mode_bdw __ro_after_init; - static bool per_cpu_limits __ro_after_init; - static bool hwp_forced __ro_after_init; - static bool hwp_boost __read_mostly; -+static bool hwp_is_hybrid; - - static struct cpufreq_driver *intel_pstate_driver __read_mostly; - -@@ -934,6 +938,135 @@ static struct freq_attr *hwp_cpufreq_attrs[] = { - NULL, - }; - -+static struct cpudata *hybrid_max_perf_cpu __read_mostly; +/* -+ * Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata, -+ * and the x86 arch scale-invariance information from concurrent updates. 
++ * Optional: only supported since gcc >= 15 ++ * Optional: only supported since clang >= 18 ++ * ++ * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 ++ * clang: https://github.com/llvm/llvm-project/pull/76348 ++ * ++ * __bdos on clang < 19.1.2 can erroneously return 0: ++ * https://github.com/llvm/llvm-project/pull/110497 ++ * ++ * __bdos on clang < 19.1.3 can be off by 4: ++ * https://github.com/llvm/llvm-project/pull/112636 + */ -+static DEFINE_MUTEX(hybrid_capacity_lock); ++#ifdef CONFIG_CC_HAS_COUNTED_BY ++# define __counted_by(member) __attribute__((__counted_by__(member))) ++#else ++# define __counted_by(member) ++#endif + -+static void hybrid_set_cpu_capacity(struct cpudata *cpu) -+{ -+ arch_set_cpu_capacity(cpu->cpu, cpu->capacity_perf, -+ hybrid_max_perf_cpu->capacity_perf, -+ cpu->capacity_perf, -+ cpu->pstate.max_pstate_physical); + /* + * Apply __counted_by() when the Endianness matches to increase test coverage. + */ +diff --git a/init/Kconfig b/init/Kconfig +index 38dbd16da6a9..504e8a7c4e2a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -120,6 +120,14 @@ config CC_HAS_ASM_INLINE + config CC_HAS_NO_PROFILE_FN_ATTR + def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror) + ++# clang needs to be at least 19.1.3 to avoid __bdos miscalculations ++# https://github.com/llvm/llvm-project/pull/110497 ++# https://github.com/llvm/llvm-project/pull/112636 ++# TODO: when gcc 15 is released remove the build test and add gcc version check ++config CC_HAS_COUNTED_BY ++ def_bool $(success,echo 'struct flex { int count; int array[] __attribute__((__counted_by__(count))); };' | $(CC) $(CLANG_FLAGS) -x c - -c -o /dev/null -Werror) ++ depends on !(CC_IS_CLANG && CLANG_VERSION < 190103) + -+ pr_debug("CPU%d: perf = %u, max. perf = %u, base perf = %d\n", cpu->cpu, -+ cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf, -+ cpu->pstate.max_pstate_physical); -+} -+ -+static void hybrid_clear_cpu_capacity(unsigned int cpunum) -+{ -+ arch_set_cpu_capacity(cpunum, 1, 1, 1, 1); -+} -+ -+static void hybrid_get_capacity_perf(struct cpudata *cpu) -+{ -+ if (READ_ONCE(global.no_turbo)) { -+ cpu->capacity_perf = cpu->pstate.max_pstate_physical; -+ return; -+ } -+ -+ cpu->capacity_perf = HWP_HIGHEST_PERF(READ_ONCE(cpu->hwp_cap_cached)); -+} -+ -+static void hybrid_set_capacity_of_cpus(void) -+{ -+ int cpunum; -+ -+ for_each_online_cpu(cpunum) { -+ struct cpudata *cpu = all_cpu_data[cpunum]; -+ -+ if (cpu) -+ hybrid_set_cpu_capacity(cpu); -+ } -+} -+ -+static void hybrid_update_cpu_scaling(void) -+{ -+ struct cpudata *max_perf_cpu = NULL; -+ unsigned int max_cap_perf = 0; -+ int cpunum; -+ -+ for_each_online_cpu(cpunum) { -+ struct cpudata *cpu = all_cpu_data[cpunum]; -+ -+ if (!cpu) -+ continue; -+ -+ /* -+ * During initialization, CPU performance at full capacity needs -+ * to be determined. -+ */ -+ if (!hybrid_max_perf_cpu) -+ hybrid_get_capacity_perf(cpu); -+ -+ /* -+ * If hybrid_max_perf_cpu is not NULL at this point, it is -+ * being replaced, so don't take it into account when looking -+ * for the new one. -+ */ -+ if (cpu == hybrid_max_perf_cpu) -+ continue; -+ -+ if (cpu->capacity_perf > max_cap_perf) { -+ max_cap_perf = cpu->capacity_perf; -+ max_perf_cpu = cpu; -+ } -+ } -+ -+ if (max_perf_cpu) { -+ hybrid_max_perf_cpu = max_perf_cpu; -+ hybrid_set_capacity_of_cpus(); -+ } else { -+ pr_info("Found no CPUs with nonzero maximum performance\n"); -+ /* Revert to the flat CPU capacity structure. 
*/ -+ for_each_online_cpu(cpunum) -+ hybrid_clear_cpu_capacity(cpunum); -+ } -+} -+ -+static void __hybrid_init_cpu_scaling(void) -+{ -+ hybrid_max_perf_cpu = NULL; -+ hybrid_update_cpu_scaling(); -+} -+ -+static void hybrid_init_cpu_scaling(void) -+{ -+ bool disable_itmt = false; -+ -+ mutex_lock(&hybrid_capacity_lock); -+ -+ /* -+ * If hybrid_max_perf_cpu is set at this point, the hybrid CPU capacity -+ * scaling has been enabled already and the driver is just changing the -+ * operation mode. -+ */ -+ if (hybrid_max_perf_cpu) { -+ __hybrid_init_cpu_scaling(); -+ goto unlock; -+ } -+ -+ /* -+ * On hybrid systems, use asym capacity instead of ITMT, but because -+ * the capacity of SMT threads is not deterministic even approximately, -+ * do not do that when SMT is in use. -+ */ -+ if (hwp_is_hybrid && !sched_smt_active() && arch_enable_hybrid_capacity_scale()) { -+ __hybrid_init_cpu_scaling(); -+ disable_itmt = true; -+ } -+ -+unlock: -+ mutex_unlock(&hybrid_capacity_lock); -+ -+ if (disable_itmt) -+ sched_clear_itmt_support(); -+} -+ - static void __intel_pstate_get_hwp_cap(struct cpudata *cpu) + config PAHOLE_VERSION + int + default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE)) +diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c +index 2abc78367dd1..5222c6393f11 100644 +--- a/lib/overflow_kunit.c ++++ b/lib/overflow_kunit.c +@@ -1187,7 +1187,7 @@ static void DEFINE_FLEX_test(struct kunit *test) { - u64 cap; -@@ -962,6 +1095,43 @@ static void intel_pstate_get_hwp_cap(struct cpudata *cpu) - } - } + /* Using _RAW_ on a __counted_by struct will initialize "counter" to zero */ + DEFINE_RAW_FLEX(struct foo, two_but_zero, array, 2); +-#if __has_attribute(__counted_by__) ++#ifdef CONFIG_CC_HAS_COUNTED_BY + int expected_raw_size = sizeof(struct foo); + #else + int expected_raw_size = sizeof(struct foo) + 2 * sizeof(s16); +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 2fb328880b50..a1d345f1680c 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -3718,8 +3718,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + unsigned long flags; + LIST_HEAD(list); +- struct folio *folio, *next; +- int split = 0; ++ struct folio *folio, *next, *prev = NULL; ++ int split = 0, removed = 0; -+static void hybrid_update_capacity(struct cpudata *cpu) -+{ -+ unsigned int max_cap_perf; -+ -+ mutex_lock(&hybrid_capacity_lock); -+ -+ if (!hybrid_max_perf_cpu) -+ goto unlock; -+ -+ /* -+ * The maximum performance of the CPU may have changed, but assume -+ * that the performance of the other CPUs has not changed. -+ */ -+ max_cap_perf = hybrid_max_perf_cpu->capacity_perf; -+ -+ intel_pstate_get_hwp_cap(cpu); -+ -+ hybrid_get_capacity_perf(cpu); -+ /* Should hybrid_max_perf_cpu be replaced by this CPU? */ -+ if (cpu->capacity_perf > max_cap_perf) { -+ hybrid_max_perf_cpu = cpu; -+ hybrid_set_capacity_of_cpus(); -+ goto unlock; -+ } -+ -+ /* If this CPU is hybrid_max_perf_cpu, should it be replaced? 
*/ -+ if (cpu == hybrid_max_perf_cpu && cpu->capacity_perf < max_cap_perf) { -+ hybrid_update_cpu_scaling(); -+ goto unlock; -+ } -+ -+ hybrid_set_cpu_capacity(cpu); -+ -+unlock: -+ mutex_unlock(&hybrid_capacity_lock); -+} -+ - static void intel_pstate_hwp_set(unsigned int cpu) - { - struct cpudata *cpu_data = all_cpu_data[cpu]; -@@ -1070,6 +1240,22 @@ static void intel_pstate_hwp_offline(struct cpudata *cpu) - value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE); - - wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); -+ -+ mutex_lock(&hybrid_capacity_lock); -+ -+ if (!hybrid_max_perf_cpu) { -+ mutex_unlock(&hybrid_capacity_lock); -+ -+ return; -+ } -+ -+ if (hybrid_max_perf_cpu == cpu) -+ hybrid_update_cpu_scaling(); -+ -+ mutex_unlock(&hybrid_capacity_lock); -+ -+ /* Reset the capacity of the CPU going offline to the initial value. */ -+ hybrid_clear_cpu_capacity(cpu->cpu); - } - - #define POWER_CTL_EE_ENABLE 1 -@@ -1165,21 +1351,46 @@ static void __intel_pstate_update_max_freq(struct cpudata *cpudata, - static void intel_pstate_update_limits(unsigned int cpu) - { - struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu); -+ struct cpudata *cpudata; - - if (!policy) - return; - -- __intel_pstate_update_max_freq(all_cpu_data[cpu], policy); -+ cpudata = all_cpu_data[cpu]; -+ -+ __intel_pstate_update_max_freq(cpudata, policy); -+ -+ /* Prevent the driver from being unregistered now. */ -+ mutex_lock(&intel_pstate_driver_lock); - - cpufreq_cpu_release(policy); -+ -+ hybrid_update_capacity(cpudata); -+ -+ mutex_unlock(&intel_pstate_driver_lock); - } - - static void intel_pstate_update_limits_for_all(void) - { - int cpu; - -- for_each_possible_cpu(cpu) -- intel_pstate_update_limits(cpu); -+ for_each_possible_cpu(cpu) { -+ struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu); -+ -+ if (!policy) -+ continue; -+ -+ __intel_pstate_update_max_freq(all_cpu_data[cpu], policy); -+ -+ cpufreq_cpu_release(policy); -+ } -+ -+ mutex_lock(&hybrid_capacity_lock); -+ -+ if (hybrid_max_perf_cpu) -+ __hybrid_init_cpu_scaling(); -+ -+ mutex_unlock(&hybrid_capacity_lock); - } - - /************************** sysfs begin ************************/ -@@ -1618,6 +1829,13 @@ static void intel_pstate_notify_work(struct work_struct *work) - __intel_pstate_update_max_freq(cpudata, policy); - - cpufreq_cpu_release(policy); -+ -+ /* -+ * The driver will not be unregistered while this function is -+ * running, so update the capacity without acquiring the driver -+ * lock. 
-+ */ -+ hybrid_update_capacity(cpudata); - } - - wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0); -@@ -2034,8 +2252,10 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) - - if (pstate_funcs.get_cpu_scaling) { - cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu); -- if (cpu->pstate.scaling != perf_ctl_scaling) -+ if (cpu->pstate.scaling != perf_ctl_scaling) { - intel_pstate_hybrid_hwp_adjust(cpu); -+ hwp_is_hybrid = true; -+ } - } else { - cpu->pstate.scaling = perf_ctl_scaling; - } -@@ -2703,6 +2923,8 @@ static int intel_pstate_cpu_online(struct cpufreq_policy *policy) + #ifdef CONFIG_MEMCG + if (sc->memcg) +@@ -3775,15 +3775,28 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, */ - intel_pstate_hwp_reenable(cpu); - cpu->suspended = false; -+ -+ hybrid_update_capacity(cpu); + if (!did_split && !folio_test_partially_mapped(folio)) { + list_del_init(&folio->_deferred_list); +- ds_queue->split_queue_len--; ++ removed++; ++ } else { ++ /* ++ * That unlocked list_del_init() above would be unsafe, ++ * unless its folio is separated from any earlier folios ++ * left on the list (which may be concurrently unqueued) ++ * by one safe folio with refcount still raised. ++ */ ++ swap(folio, prev); + } +- folio_put(folio); ++ if (folio) ++ folio_put(folio); } - return 0; -@@ -3143,6 +3365,8 @@ static int intel_pstate_register_driver(struct cpufreq_driver *driver) + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + list_splice_tail(&list, &ds_queue->split_queue); ++ ds_queue->split_queue_len -= removed; + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - global.min_perf_pct = min_perf_pct_min(); - -+ hybrid_init_cpu_scaling(); ++ if (prev) ++ folio_put(prev); + - return 0; - } + /* + * Stop shrinker if we didn't split any page, but the queue is empty. + * This can happen if pages were freed under us. 
+diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 7845c64a2c57..2703227cce88 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -4631,8 +4631,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON_FOLIO(folio_order(folio) > 1 && + !folio_test_hugetlb(folio) && +- !list_empty(&folio->_deferred_list) && +- folio_test_partially_mapped(folio), folio); ++ !list_empty(&folio->_deferred_list), folio); + /* + * Nobody should be changing or seriously looking at +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 43f8f7290bf0..e6e51d4a6f3e 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -965,9 +965,8 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) + break; + case 2: + /* the second tail page: deferred_list overlaps ->mapping */ +- if (unlikely(!list_empty(&folio->_deferred_list) && +- folio_test_partially_mapped(folio))) { +- bad_page(page, "partially mapped folio on deferred list"); ++ if (unlikely(!list_empty(&folio->_deferred_list))) { ++ bad_page(page, "on deferred list"); + goto out; + } + break; +diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD +index f83493838cf9..4010899652b8 100644 +--- a/scripts/package/PKGBUILD ++++ b/scripts/package/PKGBUILD +@@ -91,6 +91,11 @@ _package-headers() { + "${srctree}/scripts/package/install-extmod-build" "${builddir}" + fi + ++ # required when DEBUG_INFO_BTF_MODULES is enabled ++ if [ -f tools/bpf/resolve_btfids/resolve_btfids ]; then ++ install -Dt "$builddir/tools/bpf/resolve_btfids" tools/bpf/resolve_btfids/resolve_btfids ++ fi ++ + echo "Installing System.map and config..." + mkdir -p "${builddir}" + cp System.map "${builddir}/System.map" -- -2.47.0.rc0 +2.47.0 -From ac05835cead1a0a20b6297fdb0f7c47326d0ff71 Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:52:52 +0800 +From 09b968fac790022322af9311ce41ac43e80a1d89 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 14:49:08 +0100 Subject: [PATCH 08/13] ksm -Signed-off-by: Eric Naim +Signed-off-by: Peter Jung --- arch/alpha/kernel/syscalls/syscall.tbl | 3 + arch/arm/tools/syscall.tbl | 3 + @@ -13987,7 +13482,7 @@ index 67083fc1b2f5..c1aecee4ad9b 100644 +464 common process_ksm_disable sys_process_ksm_disable +465 common process_ksm_status sys_process_ksm_status diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h -index 4bcf6754738d..b3ea08e920f7 100644 +index 5758104921e6..cc9c4fac2412 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); @@ -14022,10 +13517,10 @@ index 5bf6148cac2b..613e559ad6e0 100644 /* * 32 bit systems traditionally used different diff --git a/kernel/sys.c b/kernel/sys.c -index 3a2df1bd9f64..bc77dc784527 100644 +index 4da31f28fda8..fcd3aeaddd05 100644 --- a/kernel/sys.c +++ b/kernel/sys.c -@@ -2789,6 +2789,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, +@@ -2791,6 +2791,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return error; } @@ -14218,14 +13713,14 @@ index 01071182763e..7394bad8178e 100644 +464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable +465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status -- -2.47.0.rc0 +2.47.0 -From 727728c9e456e4484ef8e1a05a66f78a90e1c24a Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:53:03 +0800 
+From ef72e883f6dfa297aa8690dd76f1365a31cde616 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 14:49:23 +0100 Subject: [PATCH 09/13] ntsync -Signed-off-by: Eric Naim +Signed-off-by: Peter Jung --- Documentation/userspace-api/index.rst | 1 + Documentation/userspace-api/ntsync.rst | 398 +++++ @@ -14662,10 +14157,10 @@ index 000000000000..767844637a7d + ``objs`` and in ``alert``. If this is attempted, the function fails + with ``EINVAL``. diff --git a/MAINTAINERS b/MAINTAINERS -index 2ba00c0cd701..0bcfbc58a9ab 100644 +index 919f01186c11..6113837d502a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -16327,6 +16327,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git +@@ -16500,6 +16500,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git F: Documentation/filesystems/ntfs3.rst F: fs/ntfs3/ @@ -14682,10 +14177,10 @@ index 2ba00c0cd701..0bcfbc58a9ab 100644 M: Finn Thain L: linux-m68k@lists.linux-m68k.org diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig -index 41c54051347a..bde398e12696 100644 +index 3fe7e2a9bd29..6c8b999a5e08 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig -@@ -507,7 +507,6 @@ config OPEN_DICE +@@ -517,7 +517,6 @@ config OPEN_DICE config NTSYNC tristate "NT synchronization primitive emulation" @@ -14694,7 +14189,7 @@ index 41c54051347a..bde398e12696 100644 This module provides kernel support for emulation of Windows NT synchronization primitives. It is not a hardware driver. diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c -index 3c2f743c58b0..87a24798a5c7 100644 +index 4954553b7baa..3fac06270549 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -6,11 +6,17 @@ @@ -15365,7 +14860,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 default: return -ENOIOCTLCMD; } -@@ -141,6 +717,9 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev, +@@ -140,6 +716,9 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev, obj->dev = dev; get_file(dev->file); spin_lock_init(&obj->lock); @@ -15375,7 +14870,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 return obj; } -@@ -191,6 +770,400 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) +@@ -190,6 +769,400 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) return put_user(fd, &user_args->sem); } @@ -15776,7 +15271,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 static int ntsync_char_open(struct inode *inode, struct file *file) { struct ntsync_device *dev; -@@ -199,6 +1172,8 @@ static int ntsync_char_open(struct inode *inode, struct file *file) +@@ -198,6 +1171,8 @@ static int ntsync_char_open(struct inode *inode, struct file *file) if (!dev) return -ENOMEM; @@ -15785,7 +15280,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 file->private_data = dev; dev->file = file; return nonseekable_open(inode, file); -@@ -220,8 +1195,16 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd, +@@ -219,8 +1194,16 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd, void __user *argp = (void __user *)parm; switch (cmd) { @@ -15855,10 +15350,10 @@ index dcfa38fdc93c..4a8095a3fc34 100644 #endif diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile -index bc8fe9e8f7f2..b1296bd8eb3f 100644 +index 363d031a16f7..ff18c0361e38 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile -@@ -17,6 +17,7 @@ TARGETS += devices/error_logs +@@ -18,6 +18,7 @@ TARGETS += devices/error_logs TARGETS += devices/probe TARGETS += 
dmabuf-heaps TARGETS += drivers/dma-buf @@ -17307,22 +16802,9972 @@ index 000000000000..5fa2c9a0768c + +TEST_HARNESS_MAIN -- -2.47.0.rc0 +2.47.0 -From 94b78b402e239cd15095fa2d3e07f99a060a4b45 Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:53:18 +0800 -Subject: [PATCH 10/13] perf-per-core +From a4bf3a3e048257500b1ffca9827570c7dfd10aff Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 14:49:50 +0100 +Subject: [PATCH 10/13] openvpn-dco -Signed-off-by: Eric Naim +Signed-off-by: Peter Jung +--- + Documentation/netlink/specs/ovpn.yaml | 362 +++ + MAINTAINERS | 11 + + drivers/net/Kconfig | 14 + + drivers/net/Makefile | 1 + + drivers/net/ovpn/Makefile | 22 + + drivers/net/ovpn/bind.c | 54 + + drivers/net/ovpn/bind.h | 117 + + drivers/net/ovpn/crypto.c | 214 ++ + drivers/net/ovpn/crypto.h | 145 + + drivers/net/ovpn/crypto_aead.c | 386 +++ + drivers/net/ovpn/crypto_aead.h | 33 + + drivers/net/ovpn/io.c | 462 ++++ + drivers/net/ovpn/io.h | 25 + + drivers/net/ovpn/main.c | 337 +++ + drivers/net/ovpn/main.h | 24 + + drivers/net/ovpn/netlink-gen.c | 212 ++ + drivers/net/ovpn/netlink-gen.h | 41 + + drivers/net/ovpn/netlink.c | 1135 ++++++++ + drivers/net/ovpn/netlink.h | 18 + + drivers/net/ovpn/ovpnstruct.h | 61 + + drivers/net/ovpn/packet.h | 40 + + drivers/net/ovpn/peer.c | 1201 +++++++++ + drivers/net/ovpn/peer.h | 165 ++ + drivers/net/ovpn/pktid.c | 130 + + drivers/net/ovpn/pktid.h | 87 + + drivers/net/ovpn/proto.h | 104 + + drivers/net/ovpn/skb.h | 56 + + drivers/net/ovpn/socket.c | 178 ++ + drivers/net/ovpn/socket.h | 55 + + drivers/net/ovpn/stats.c | 21 + + drivers/net/ovpn/stats.h | 47 + + drivers/net/ovpn/tcp.c | 506 ++++ + drivers/net/ovpn/tcp.h | 44 + + drivers/net/ovpn/udp.c | 406 +++ + drivers/net/ovpn/udp.h | 26 + + include/net/netlink.h | 1 + + include/uapi/linux/if_link.h | 15 + + include/uapi/linux/ovpn.h | 109 + + include/uapi/linux/udp.h | 1 + + tools/net/ynl/ynl-gen-c.py | 2 + + tools/testing/selftests/Makefile | 1 + + tools/testing/selftests/net/ovpn/.gitignore | 2 + + tools/testing/selftests/net/ovpn/Makefile | 17 + + tools/testing/selftests/net/ovpn/config | 10 + + tools/testing/selftests/net/ovpn/data64.key | 5 + + tools/testing/selftests/net/ovpn/ovpn-cli.c | 2370 +++++++++++++++++ + .../testing/selftests/net/ovpn/tcp_peers.txt | 5 + + .../selftests/net/ovpn/test-chachapoly.sh | 9 + + .../testing/selftests/net/ovpn/test-float.sh | 9 + + tools/testing/selftests/net/ovpn/test-tcp.sh | 9 + + tools/testing/selftests/net/ovpn/test.sh | 183 ++ + .../testing/selftests/net/ovpn/udp_peers.txt | 5 + + 52 files changed, 9493 insertions(+) + create mode 100644 Documentation/netlink/specs/ovpn.yaml + create mode 100644 drivers/net/ovpn/Makefile + create mode 100644 drivers/net/ovpn/bind.c + create mode 100644 drivers/net/ovpn/bind.h + create mode 100644 drivers/net/ovpn/crypto.c + create mode 100644 drivers/net/ovpn/crypto.h + create mode 100644 drivers/net/ovpn/crypto_aead.c + create mode 100644 drivers/net/ovpn/crypto_aead.h + create mode 100644 drivers/net/ovpn/io.c + create mode 100644 drivers/net/ovpn/io.h + create mode 100644 drivers/net/ovpn/main.c + create mode 100644 drivers/net/ovpn/main.h + create mode 100644 drivers/net/ovpn/netlink-gen.c + create mode 100644 drivers/net/ovpn/netlink-gen.h + create mode 100644 drivers/net/ovpn/netlink.c + create mode 100644 drivers/net/ovpn/netlink.h + create mode 100644 drivers/net/ovpn/ovpnstruct.h + create mode 100644 drivers/net/ovpn/packet.h + create mode 100644 drivers/net/ovpn/peer.c + create mode 
100644 drivers/net/ovpn/peer.h + create mode 100644 drivers/net/ovpn/pktid.c + create mode 100644 drivers/net/ovpn/pktid.h + create mode 100644 drivers/net/ovpn/proto.h + create mode 100644 drivers/net/ovpn/skb.h + create mode 100644 drivers/net/ovpn/socket.c + create mode 100644 drivers/net/ovpn/socket.h + create mode 100644 drivers/net/ovpn/stats.c + create mode 100644 drivers/net/ovpn/stats.h + create mode 100644 drivers/net/ovpn/tcp.c + create mode 100644 drivers/net/ovpn/tcp.h + create mode 100644 drivers/net/ovpn/udp.c + create mode 100644 drivers/net/ovpn/udp.h + create mode 100644 include/uapi/linux/ovpn.h + create mode 100644 tools/testing/selftests/net/ovpn/.gitignore + create mode 100644 tools/testing/selftests/net/ovpn/Makefile + create mode 100644 tools/testing/selftests/net/ovpn/config + create mode 100644 tools/testing/selftests/net/ovpn/data64.key + create mode 100644 tools/testing/selftests/net/ovpn/ovpn-cli.c + create mode 100644 tools/testing/selftests/net/ovpn/tcp_peers.txt + create mode 100755 tools/testing/selftests/net/ovpn/test-chachapoly.sh + create mode 100755 tools/testing/selftests/net/ovpn/test-float.sh + create mode 100755 tools/testing/selftests/net/ovpn/test-tcp.sh + create mode 100755 tools/testing/selftests/net/ovpn/test.sh + create mode 100644 tools/testing/selftests/net/ovpn/udp_peers.txt + +diff --git a/Documentation/netlink/specs/ovpn.yaml b/Documentation/netlink/specs/ovpn.yaml +new file mode 100644 +index 000000000000..79339c25d607 +--- /dev/null ++++ b/Documentation/netlink/specs/ovpn.yaml +@@ -0,0 +1,362 @@ ++# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) ++# ++# Author: Antonio Quartulli ++# ++# Copyright (c) 2024, OpenVPN Inc. ++# ++ ++name: ovpn ++ ++protocol: genetlink ++ ++doc: Netlink protocol to control OpenVPN network devices ++ ++definitions: ++ - ++ type: const ++ name: nonce-tail-size ++ value: 8 ++ - ++ type: enum ++ name: cipher-alg ++ entries: [ none, aes-gcm, chacha20-poly1305 ] ++ - ++ type: enum ++ name: del-peer-reason ++ entries: [ teardown, userspace, expired, transport-error, transport-disconnect ] ++ - ++ type: enum ++ name: key-slot ++ entries: [ primary, secondary ] ++ ++attribute-sets: ++ - ++ name: peer ++ attributes: ++ - ++ name: id ++ type: u32 ++ doc: | ++ The unique ID of the peer. 
To be used to identify peers during ++ operations ++ checks: ++ max: 0xFFFFFF ++ - ++ name: remote-ipv4 ++ type: u32 ++ doc: The remote IPv4 address of the peer ++ byte-order: big-endian ++ display-hint: ipv4 ++ - ++ name: remote-ipv6 ++ type: binary ++ doc: The remote IPv6 address of the peer ++ display-hint: ipv6 ++ checks: ++ exact-len: 16 ++ - ++ name: remote-ipv6-scope-id ++ type: u32 ++ doc: The scope id of the remote IPv6 address of the peer (RFC2553) ++ - ++ name: remote-port ++ type: u16 ++ doc: The remote port of the peer ++ byte-order: big-endian ++ checks: ++ min: 1 ++ - ++ name: socket ++ type: u32 ++ doc: The socket to be used to communicate with the peer ++ - ++ name: vpn-ipv4 ++ type: u32 ++ doc: The IPv4 address assigned to the peer by the server ++ byte-order: big-endian ++ display-hint: ipv4 ++ - ++ name: vpn-ipv6 ++ type: binary ++ doc: The IPv6 address assigned to the peer by the server ++ display-hint: ipv6 ++ checks: ++ exact-len: 16 ++ - ++ name: local-ipv4 ++ type: u32 ++ doc: The local IPv4 to be used to send packets to the peer (UDP only) ++ byte-order: big-endian ++ display-hint: ipv4 ++ - ++ name: local-ipv6 ++ type: binary ++ doc: The local IPv6 to be used to send packets to the peer (UDP only) ++ display-hint: ipv6 ++ checks: ++ exact-len: 16 ++ - ++ name: local-port ++ type: u16 ++ doc: The local port to be used to send packets to the peer (UDP only) ++ byte-order: big-endian ++ checks: ++ min: 1 ++ - ++ name: keepalive-interval ++ type: u32 ++ doc: | ++ The number of seconds after which a keep alive message is sent to the ++ peer ++ - ++ name: keepalive-timeout ++ type: u32 ++ doc: | ++ The number of seconds from the last activity after which the peer is ++ assumed dead ++ - ++ name: del-reason ++ type: u32 ++ doc: The reason why a peer was deleted ++ enum: del-peer-reason ++ - ++ name: vpn-rx-bytes ++ type: uint ++ doc: Number of bytes received over the tunnel ++ - ++ name: vpn-tx-bytes ++ type: uint ++ doc: Number of bytes transmitted over the tunnel ++ - ++ name: vpn-rx-packets ++ type: uint ++ doc: Number of packets received over the tunnel ++ - ++ name: vpn-tx-packets ++ type: uint ++ doc: Number of packets transmitted over the tunnel ++ - ++ name: link-rx-bytes ++ type: uint ++ doc: Number of bytes received at the transport level ++ - ++ name: link-tx-bytes ++ type: uint ++ doc: Number of bytes transmitted at the transport level ++ - ++ name: link-rx-packets ++ type: u32 ++ doc: Number of packets received at the transport level ++ - ++ name: link-tx-packets ++ type: u32 ++ doc: Number of packets transmitted at the transport level ++ - ++ name: keyconf ++ attributes: ++ - ++ name: peer-id ++ type: u32 ++ doc: | ++ The unique ID of the peer. To be used to identify peers during ++ key operations ++ checks: ++ max: 0xFFFFFF ++ - ++ name: slot ++ type: u32 ++ doc: The slot where the key should be stored ++ enum: key-slot ++ - ++ name: key-id ++ doc: | ++ The unique ID of the key. 
Used to fetch the correct key upon ++ decryption ++ type: u32 ++ checks: ++ max: 7 ++ - ++ name: cipher-alg ++ type: u32 ++ doc: The cipher to be used when communicating with the peer ++ enum: cipher-alg ++ - ++ name: encrypt-dir ++ type: nest ++ doc: Key material for encrypt direction ++ nested-attributes: keydir ++ - ++ name: decrypt-dir ++ type: nest ++ doc: Key material for decrypt direction ++ nested-attributes: keydir ++ - ++ name: keydir ++ attributes: ++ - ++ name: cipher-key ++ type: binary ++ doc: The actual key to be used by the cipher ++ checks: ++ max-len: 256 ++ - ++ name: nonce-tail ++ type: binary ++ doc: | ++ Random nonce to be concatenated to the packet ID, in order to ++ obtain the actual cipher IV ++ checks: ++ exact-len: nonce-tail-size ++ - ++ name: ovpn ++ attributes: ++ - ++ name: ifindex ++ type: u32 ++ doc: Index of the ovpn interface to operate on ++ - ++ name: ifname ++ type: string ++ doc: Name of the ovpn interface ++ - ++ name: peer ++ type: nest ++ doc: | ++ The peer object containing the attributed of interest for the specific ++ operation ++ nested-attributes: peer ++ - ++ name: keyconf ++ type: nest ++ doc: Peer specific cipher configuration ++ nested-attributes: keyconf ++ ++operations: ++ list: ++ - ++ name: peer-new ++ attribute-set: ovpn ++ flags: [ admin-perm ] ++ doc: Add a remote peer ++ do: ++ pre: ovpn-nl-pre-doit ++ post: ovpn-nl-post-doit ++ request: ++ attributes: ++ - ifindex ++ - peer ++ - ++ name: peer-set ++ attribute-set: ovpn ++ flags: [ admin-perm ] ++ doc: modify a remote peer ++ do: ++ pre: ovpn-nl-pre-doit ++ post: ovpn-nl-post-doit ++ request: ++ attributes: ++ - ifindex ++ - peer ++ - ++ name: peer-get ++ attribute-set: ovpn ++ flags: [ admin-perm ] ++ doc: Retrieve data about existing remote peers (or a specific one) ++ do: ++ pre: ovpn-nl-pre-doit ++ post: ovpn-nl-post-doit ++ request: ++ attributes: ++ - ifindex ++ - peer ++ reply: ++ attributes: ++ - peer ++ dump: ++ request: ++ attributes: ++ - ifindex ++ reply: ++ attributes: ++ - peer ++ - ++ name: peer-del ++ attribute-set: ovpn ++ flags: [ admin-perm ] ++ doc: Delete existing remote peer ++ do: ++ pre: ovpn-nl-pre-doit ++ post: ovpn-nl-post-doit ++ request: ++ attributes: ++ - ifindex ++ - peer ++ - ++ name: peer-del-ntf ++ doc: Notification about a peer being deleted ++ notify: peer-get ++ mcgrp: peers ++ ++ - ++ name: key-new ++ attribute-set: ovpn ++ flags: [ admin-perm ] ++ doc: Add a cipher key for a specific peer ++ do: ++ pre: ovpn-nl-pre-doit ++ post: ovpn-nl-post-doit ++ request: ++ attributes: ++ - ifindex ++ - keyconf ++ - ++ name: key-get ++ attribute-set: ovpn ++ flags: [ admin-perm ] ++ doc: Retrieve non-sensitive data about peer key and cipher ++ do: ++ pre: ovpn-nl-pre-doit ++ post: ovpn-nl-post-doit ++ request: ++ attributes: ++ - ifindex ++ - keyconf ++ reply: ++ attributes: ++ - keyconf ++ - ++ name: key-swap ++ attribute-set: ovpn ++ flags: [ admin-perm ] ++ doc: Swap primary and secondary session keys for a specific peer ++ do: ++ pre: ovpn-nl-pre-doit ++ post: ovpn-nl-post-doit ++ request: ++ attributes: ++ - ifindex ++ - keyconf ++ - ++ name: key-swap-ntf ++ notify: key-get ++ doc: | ++ Notification about key having exhausted its IV space and requiring ++ renegotiation ++ mcgrp: peers ++ - ++ name: key-del ++ attribute-set: ovpn ++ flags: [ admin-perm ] ++ doc: Delete cipher key for a specific peer ++ do: ++ pre: ovpn-nl-pre-doit ++ post: ovpn-nl-post-doit ++ request: ++ attributes: ++ - ifindex ++ - keyconf ++ ++mcast-groups: ++ list: ++ - ++ name: 
peers +diff --git a/MAINTAINERS b/MAINTAINERS +index 6113837d502a..271b59a9c585 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -17361,6 +17361,17 @@ F: arch/openrisc/ + F: drivers/irqchip/irq-ompic.c + F: drivers/irqchip/irq-or1k-* + ++OPENVPN DATA CHANNEL OFFLOAD ++M: Antonio Quartulli ++L: openvpn-devel@lists.sourceforge.net (moderated for non-subscribers) ++L: netdev@vger.kernel.org ++S: Supported ++T: git https://github.com/OpenVPN/linux-kernel-ovpn.git ++F: Documentation/netlink/specs/ovpn.yaml ++F: drivers/net/ovpn/ ++F: include/uapi/linux/ovpn.h ++F: tools/testing/selftests/net/ovpn/ ++ + OPENVSWITCH + M: Pravin B Shelar + L: netdev@vger.kernel.org +diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig +index 9920b3a68ed1..ddc65bc1e218 100644 +--- a/drivers/net/Kconfig ++++ b/drivers/net/Kconfig +@@ -115,6 +115,20 @@ config WIREGUARD_DEBUG + + Say N here unless you know what you're doing. + ++config OVPN ++ tristate "OpenVPN data channel offload" ++ depends on NET && INET ++ select STREAM_PARSER ++ select NET_UDP_TUNNEL ++ select DST_CACHE ++ select CRYPTO ++ select CRYPTO_AES ++ select CRYPTO_GCM ++ select CRYPTO_CHACHA20POLY1305 ++ help ++ This module enhances the performance of the OpenVPN userspace software ++ by offloading the data channel processing to kernelspace. ++ + config EQUALIZER + tristate "EQL (serial line load balancing) support" + help +diff --git a/drivers/net/Makefile b/drivers/net/Makefile +index 13743d0e83b5..5152b3330e28 100644 +--- a/drivers/net/Makefile ++++ b/drivers/net/Makefile +@@ -11,6 +11,7 @@ obj-$(CONFIG_IPVLAN) += ipvlan/ + obj-$(CONFIG_IPVTAP) += ipvlan/ + obj-$(CONFIG_DUMMY) += dummy.o + obj-$(CONFIG_WIREGUARD) += wireguard/ ++obj-$(CONFIG_OVPN) += ovpn/ + obj-$(CONFIG_EQUALIZER) += eql.o + obj-$(CONFIG_IFB) += ifb.o + obj-$(CONFIG_MACSEC) += macsec.o +diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile +new file mode 100644 +index 000000000000..f4d4bd87c851 +--- /dev/null ++++ b/drivers/net/ovpn/Makefile +@@ -0,0 +1,22 @@ ++# SPDX-License-Identifier: GPL-2.0 ++# ++# ovpn -- OpenVPN data channel offload in kernel space ++# ++# Copyright (C) 2020-2024 OpenVPN, Inc. ++# ++# Author: Antonio Quartulli ++ ++obj-$(CONFIG_OVPN) := ovpn.o ++ovpn-y += bind.o ++ovpn-y += crypto.o ++ovpn-y += crypto_aead.o ++ovpn-y += main.o ++ovpn-y += io.o ++ovpn-y += netlink.o ++ovpn-y += netlink-gen.o ++ovpn-y += peer.o ++ovpn-y += pktid.o ++ovpn-y += socket.o ++ovpn-y += stats.o ++ovpn-y += tcp.o ++ovpn-y += udp.o +diff --git a/drivers/net/ovpn/bind.c b/drivers/net/ovpn/bind.c +new file mode 100644 +index 000000000000..d17d078c5730 +--- /dev/null ++++ b/drivers/net/ovpn/bind.c +@@ -0,0 +1,54 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2012-2024 OpenVPN, Inc. 
++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#include ++#include ++ ++#include "ovpnstruct.h" ++#include "bind.h" ++#include "peer.h" ++ ++/** ++ * ovpn_bind_from_sockaddr - retrieve binding matching sockaddr ++ * @ss: the sockaddr to match ++ * ++ * Return: the bind matching the passed sockaddr if found, NULL otherwise ++ */ ++struct ovpn_bind *ovpn_bind_from_sockaddr(const struct sockaddr_storage *ss) ++{ ++ struct ovpn_bind *bind; ++ size_t sa_len; ++ ++ if (ss->ss_family == AF_INET) ++ sa_len = sizeof(struct sockaddr_in); ++ else if (ss->ss_family == AF_INET6) ++ sa_len = sizeof(struct sockaddr_in6); ++ else ++ return ERR_PTR(-EAFNOSUPPORT); ++ ++ bind = kzalloc(sizeof(*bind), GFP_ATOMIC); ++ if (unlikely(!bind)) ++ return ERR_PTR(-ENOMEM); ++ ++ memcpy(&bind->remote, ss, sa_len); ++ ++ return bind; ++} ++ ++/** ++ * ovpn_bind_reset - assign new binding to peer ++ * @peer: the peer whose binding has to be replaced ++ * @new: the new bind to assign ++ */ ++void ovpn_bind_reset(struct ovpn_peer *peer, struct ovpn_bind *new) ++ __must_hold(&peer->lock) ++{ ++ kfree_rcu(rcu_replace_pointer(peer->bind, new, ++ lockdep_is_held(&peer->lock)), rcu); ++} +diff --git a/drivers/net/ovpn/bind.h b/drivers/net/ovpn/bind.h +new file mode 100644 +index 000000000000..859213d5040d +--- /dev/null ++++ b/drivers/net/ovpn/bind.h +@@ -0,0 +1,117 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2012-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#ifndef _NET_OVPN_OVPNBIND_H_ ++#define _NET_OVPN_OVPNBIND_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct ovpn_peer; ++ ++/** ++ * union ovpn_sockaddr - basic transport layer address ++ * @in4: IPv4 address ++ * @in6: IPv6 address ++ */ ++union ovpn_sockaddr { ++ struct sockaddr_in in4; ++ struct sockaddr_in6 in6; ++}; ++ ++/** ++ * struct ovpn_bind - remote peer binding ++ * @remote: the remote peer sockaddress ++ * @local: local endpoint used to talk to the peer ++ * @local.ipv4: local IPv4 used to talk to the peer ++ * @local.ipv6: local IPv6 used to talk to the peer ++ * @rcu: used to schedule RCU cleanup job ++ */ ++struct ovpn_bind { ++ union ovpn_sockaddr remote; /* remote sockaddr */ ++ ++ union { ++ struct in_addr ipv4; ++ struct in6_addr ipv6; ++ } local; ++ ++ struct rcu_head rcu; ++}; ++ ++/** ++ * skb_protocol_to_family - translate skb->protocol to AF_INET or AF_INET6 ++ * @skb: the packet sk_buff to inspect ++ * ++ * Return: AF_INET, AF_INET6 or 0 in case of unknown protocol ++ */ ++static inline unsigned short skb_protocol_to_family(const struct sk_buff *skb) ++{ ++ switch (skb->protocol) { ++ case htons(ETH_P_IP): ++ return AF_INET; ++ case htons(ETH_P_IPV6): ++ return AF_INET6; ++ default: ++ return 0; ++ } ++} ++ ++/** ++ * ovpn_bind_skb_src_match - match packet source with binding ++ * @bind: the binding to match ++ * @skb: the packet to match ++ * ++ * Return: true if the packet source matches the remote peer sockaddr ++ * in the binding ++ */ ++static inline bool ovpn_bind_skb_src_match(const struct ovpn_bind *bind, ++ const struct sk_buff *skb) ++{ ++ const unsigned short family = skb_protocol_to_family(skb); ++ const union ovpn_sockaddr *remote; ++ ++ if (unlikely(!bind)) ++ return false; ++ ++ remote = &bind->remote; ++ ++ if (unlikely(remote->in4.sin_family != family)) ++ return false; ++ ++ switch (family) { ++ case AF_INET: ++ if (unlikely(remote->in4.sin_addr.s_addr != ip_hdr(skb)->saddr)) ++ 
return false; ++ ++ if (unlikely(remote->in4.sin_port != udp_hdr(skb)->source)) ++ return false; ++ break; ++ case AF_INET6: ++ if (unlikely(!ipv6_addr_equal(&remote->in6.sin6_addr, ++ &ipv6_hdr(skb)->saddr))) ++ return false; ++ ++ if (unlikely(remote->in6.sin6_port != udp_hdr(skb)->source)) ++ return false; ++ break; ++ default: ++ return false; ++ } ++ ++ return true; ++} ++ ++struct ovpn_bind *ovpn_bind_from_sockaddr(const struct sockaddr_storage *sa); ++void ovpn_bind_reset(struct ovpn_peer *peer, struct ovpn_bind *bind); ++ ++#endif /* _NET_OVPN_OVPNBIND_H_ */ +diff --git a/drivers/net/ovpn/crypto.c b/drivers/net/ovpn/crypto.c +new file mode 100644 +index 000000000000..a2346bc630be +--- /dev/null ++++ b/drivers/net/ovpn/crypto.c +@@ -0,0 +1,214 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "ovpnstruct.h" ++#include "main.h" ++#include "packet.h" ++#include "pktid.h" ++#include "crypto_aead.h" ++#include "crypto.h" ++ ++static void ovpn_ks_destroy_rcu(struct rcu_head *head) ++{ ++ struct ovpn_crypto_key_slot *ks; ++ ++ ks = container_of(head, struct ovpn_crypto_key_slot, rcu); ++ ovpn_aead_crypto_key_slot_destroy(ks); ++} ++ ++void ovpn_crypto_key_slot_release(struct kref *kref) ++{ ++ struct ovpn_crypto_key_slot *ks; ++ ++ ks = container_of(kref, struct ovpn_crypto_key_slot, refcount); ++ call_rcu(&ks->rcu, ovpn_ks_destroy_rcu); ++} ++ ++/* can only be invoked when all peer references have been dropped (i.e. RCU ++ * release routine) ++ */ ++void ovpn_crypto_state_release(struct ovpn_crypto_state *cs) ++{ ++ struct ovpn_crypto_key_slot *ks; ++ ++ ks = rcu_access_pointer(cs->slots[0]); ++ if (ks) { ++ RCU_INIT_POINTER(cs->slots[0], NULL); ++ ovpn_crypto_key_slot_put(ks); ++ } ++ ++ ks = rcu_access_pointer(cs->slots[1]); ++ if (ks) { ++ RCU_INIT_POINTER(cs->slots[1], NULL); ++ ovpn_crypto_key_slot_put(ks); ++ } ++} ++ ++/* removes the key matching the specified id from the crypto context */ ++void ovpn_crypto_kill_key(struct ovpn_crypto_state *cs, u8 key_id) ++{ ++ struct ovpn_crypto_key_slot *ks = NULL; ++ ++ spin_lock_bh(&cs->lock); ++ if (rcu_access_pointer(cs->slots[0])->key_id == key_id) { ++ ks = rcu_replace_pointer(cs->slots[0], NULL, ++ lockdep_is_held(&cs->lock)); ++ } else if (rcu_access_pointer(cs->slots[1])->key_id == key_id) { ++ ks = rcu_replace_pointer(cs->slots[1], NULL, ++ lockdep_is_held(&cs->lock)); ++ } ++ spin_unlock_bh(&cs->lock); ++ ++ if (ks) ++ ovpn_crypto_key_slot_put(ks); ++} ++ ++/* Reset the ovpn_crypto_state object in a way that is atomic ++ * to RCU readers. 
++ */ ++int ovpn_crypto_state_reset(struct ovpn_crypto_state *cs, ++ const struct ovpn_peer_key_reset *pkr) ++{ ++ struct ovpn_crypto_key_slot *old = NULL, *new; ++ u8 idx; ++ ++ if (pkr->slot != OVPN_KEY_SLOT_PRIMARY && ++ pkr->slot != OVPN_KEY_SLOT_SECONDARY) ++ return -EINVAL; ++ ++ new = ovpn_aead_crypto_key_slot_new(&pkr->key); ++ if (IS_ERR(new)) ++ return PTR_ERR(new); ++ ++ spin_lock_bh(&cs->lock); ++ idx = cs->primary_idx; ++ switch (pkr->slot) { ++ case OVPN_KEY_SLOT_PRIMARY: ++ old = rcu_replace_pointer(cs->slots[idx], new, ++ lockdep_is_held(&cs->lock)); ++ break; ++ case OVPN_KEY_SLOT_SECONDARY: ++ old = rcu_replace_pointer(cs->slots[!idx], new, ++ lockdep_is_held(&cs->lock)); ++ break; ++ } ++ spin_unlock_bh(&cs->lock); ++ ++ if (old) ++ ovpn_crypto_key_slot_put(old); ++ ++ return 0; ++} ++ ++void ovpn_crypto_key_slot_delete(struct ovpn_crypto_state *cs, ++ enum ovpn_key_slot slot) ++{ ++ struct ovpn_crypto_key_slot *ks = NULL; ++ u8 idx; ++ ++ if (slot != OVPN_KEY_SLOT_PRIMARY && ++ slot != OVPN_KEY_SLOT_SECONDARY) { ++ pr_warn("Invalid slot to release: %u\n", slot); ++ return; ++ } ++ ++ spin_lock_bh(&cs->lock); ++ idx = cs->primary_idx; ++ switch (slot) { ++ case OVPN_KEY_SLOT_PRIMARY: ++ ks = rcu_replace_pointer(cs->slots[idx], NULL, ++ lockdep_is_held(&cs->lock)); ++ break; ++ case OVPN_KEY_SLOT_SECONDARY: ++ ks = rcu_replace_pointer(cs->slots[!idx], NULL, ++ lockdep_is_held(&cs->lock)); ++ break; ++ } ++ spin_unlock_bh(&cs->lock); ++ ++ if (!ks) { ++ pr_debug("Key slot already released: %u\n", slot); ++ return; ++ } ++ ++ pr_debug("deleting key slot %u, key_id=%u\n", slot, ks->key_id); ++ ovpn_crypto_key_slot_put(ks); ++} ++ ++/* this swap is not atomic, but there will be a very short time frame where the ++ * old_secondary key won't be available. This should not be a big deal as most ++ * likely both peers are already using the new primary at this point. ++ */ ++void ovpn_crypto_key_slots_swap(struct ovpn_crypto_state *cs) ++{ ++ const struct ovpn_crypto_key_slot *old_primary, *old_secondary; ++ u8 idx; ++ ++ spin_lock_bh(&cs->lock); ++ idx = cs->primary_idx; ++ old_primary = rcu_dereference_protected(cs->slots[idx], ++ lockdep_is_held(&cs->lock)); ++ old_secondary = rcu_dereference_protected(cs->slots[!idx], ++ lockdep_is_held(&cs->lock)); ++ /* perform real swap by switching the index of the primary key */ ++ cs->primary_idx = !cs->primary_idx; ++ ++ pr_debug("key swapped: (old primary) %d <-> (new primary) %d\n", ++ old_primary ? old_primary->key_id : -1, ++ old_secondary ? 
old_secondary->key_id : -1); ++ ++ spin_unlock_bh(&cs->lock); ++} ++ ++/** ++ * ovpn_crypto_config_get - populate keyconf object with non-sensible key data ++ * @cs: the crypto state to extract the key data from ++ * @slot: the specific slot to inspect ++ * @keyconf: the output object to populate ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++int ovpn_crypto_config_get(struct ovpn_crypto_state *cs, ++ enum ovpn_key_slot slot, ++ struct ovpn_key_config *keyconf) ++{ ++ struct ovpn_crypto_key_slot *ks; ++ int idx; ++ ++ switch (slot) { ++ case OVPN_KEY_SLOT_PRIMARY: ++ idx = cs->primary_idx; ++ break; ++ case OVPN_KEY_SLOT_SECONDARY: ++ idx = !cs->primary_idx; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ rcu_read_lock(); ++ ks = rcu_dereference(cs->slots[idx]); ++ if (!ks || (ks && !ovpn_crypto_key_slot_hold(ks))) { ++ rcu_read_unlock(); ++ return -ENOENT; ++ } ++ rcu_read_unlock(); ++ ++ keyconf->cipher_alg = ovpn_aead_crypto_alg(ks); ++ keyconf->key_id = ks->key_id; ++ ++ ovpn_crypto_key_slot_put(ks); ++ ++ return 0; ++} +diff --git a/drivers/net/ovpn/crypto.h b/drivers/net/ovpn/crypto.h +new file mode 100644 +index 000000000000..b7a7be752d54 +--- /dev/null ++++ b/drivers/net/ovpn/crypto.h +@@ -0,0 +1,145 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#ifndef _NET_OVPN_OVPNCRYPTO_H_ ++#define _NET_OVPN_OVPNCRYPTO_H_ ++ ++#include "packet.h" ++#include "pktid.h" ++ ++/* info needed for both encrypt and decrypt directions */ ++struct ovpn_key_direction { ++ const u8 *cipher_key; ++ size_t cipher_key_size; ++ const u8 *nonce_tail; /* only needed for GCM modes */ ++ size_t nonce_tail_size; /* only needed for GCM modes */ ++}; ++ ++/* all info for a particular symmetric key (primary or secondary) */ ++struct ovpn_key_config { ++ enum ovpn_cipher_alg cipher_alg; ++ u8 key_id; ++ struct ovpn_key_direction encrypt; ++ struct ovpn_key_direction decrypt; ++}; ++ ++/* used to pass settings from netlink to the crypto engine */ ++struct ovpn_peer_key_reset { ++ enum ovpn_key_slot slot; ++ struct ovpn_key_config key; ++}; ++ ++struct ovpn_crypto_key_slot { ++ u8 key_id; ++ ++ struct crypto_aead *encrypt; ++ struct crypto_aead *decrypt; ++ struct ovpn_nonce_tail nonce_tail_xmit; ++ struct ovpn_nonce_tail nonce_tail_recv; ++ ++ struct ovpn_pktid_recv pid_recv ____cacheline_aligned_in_smp; ++ struct ovpn_pktid_xmit pid_xmit ____cacheline_aligned_in_smp; ++ struct kref refcount; ++ struct rcu_head rcu; ++}; ++ ++struct ovpn_crypto_state { ++ struct ovpn_crypto_key_slot __rcu *slots[2]; ++ u8 primary_idx; ++ ++ /* protects primary and secondary slots */ ++ spinlock_t lock; ++}; ++ ++static inline bool ovpn_crypto_key_slot_hold(struct ovpn_crypto_key_slot *ks) ++{ ++ return kref_get_unless_zero(&ks->refcount); ++} ++ ++static inline void ovpn_crypto_state_init(struct ovpn_crypto_state *cs) ++{ ++ RCU_INIT_POINTER(cs->slots[0], NULL); ++ RCU_INIT_POINTER(cs->slots[1], NULL); ++ cs->primary_idx = 0; ++ spin_lock_init(&cs->lock); ++} ++ ++static inline struct ovpn_crypto_key_slot * ++ovpn_crypto_key_id_to_slot(const struct ovpn_crypto_state *cs, u8 key_id) ++{ ++ struct ovpn_crypto_key_slot *ks; ++ u8 idx; ++ ++ if (unlikely(!cs)) ++ return NULL; ++ ++ rcu_read_lock(); ++ idx = cs->primary_idx; ++ ks = rcu_dereference(cs->slots[idx]); ++ if (ks && ks->key_id == key_id) { ++ if (unlikely(!ovpn_crypto_key_slot_hold(ks))) ++ ks = NULL; 
++ goto out; ++ } ++ ++ ks = rcu_dereference(cs->slots[idx ^ 1]); ++ if (ks && ks->key_id == key_id) { ++ if (unlikely(!ovpn_crypto_key_slot_hold(ks))) ++ ks = NULL; ++ goto out; ++ } ++ ++ /* when both key slots are occupied but no matching key ID is found, ks ++ * has to be reset to NULL to avoid carrying a stale pointer ++ */ ++ ks = NULL; ++out: ++ rcu_read_unlock(); ++ ++ return ks; ++} ++ ++static inline struct ovpn_crypto_key_slot * ++ovpn_crypto_key_slot_primary(const struct ovpn_crypto_state *cs) ++{ ++ struct ovpn_crypto_key_slot *ks; ++ ++ rcu_read_lock(); ++ ks = rcu_dereference(cs->slots[cs->primary_idx]); ++ if (unlikely(ks && !ovpn_crypto_key_slot_hold(ks))) ++ ks = NULL; ++ rcu_read_unlock(); ++ ++ return ks; ++} ++ ++void ovpn_crypto_key_slot_release(struct kref *kref); ++ ++static inline void ovpn_crypto_key_slot_put(struct ovpn_crypto_key_slot *ks) ++{ ++ kref_put(&ks->refcount, ovpn_crypto_key_slot_release); ++} ++ ++int ovpn_crypto_state_reset(struct ovpn_crypto_state *cs, ++ const struct ovpn_peer_key_reset *pkr); ++ ++void ovpn_crypto_key_slot_delete(struct ovpn_crypto_state *cs, ++ enum ovpn_key_slot slot); ++ ++void ovpn_crypto_state_release(struct ovpn_crypto_state *cs); ++ ++void ovpn_crypto_key_slots_swap(struct ovpn_crypto_state *cs); ++ ++int ovpn_crypto_config_get(struct ovpn_crypto_state *cs, ++ enum ovpn_key_slot slot, ++ struct ovpn_key_config *keyconf); ++ ++void ovpn_crypto_kill_key(struct ovpn_crypto_state *cs, u8 key_id); ++ ++#endif /* _NET_OVPN_OVPNCRYPTO_H_ */ +diff --git a/drivers/net/ovpn/crypto_aead.c b/drivers/net/ovpn/crypto_aead.c +new file mode 100644 +index 000000000000..25e4e4a453b2 +--- /dev/null ++++ b/drivers/net/ovpn/crypto_aead.c +@@ -0,0 +1,386 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "ovpnstruct.h" ++#include "main.h" ++#include "io.h" ++#include "packet.h" ++#include "pktid.h" ++#include "crypto_aead.h" ++#include "crypto.h" ++#include "peer.h" ++#include "proto.h" ++#include "skb.h" ++ ++#define AUTH_TAG_SIZE 16 ++ ++#define ALG_NAME_AES "gcm(aes)" ++#define ALG_NAME_CHACHAPOLY "rfc7539(chacha20,poly1305)" ++ ++static int ovpn_aead_encap_overhead(const struct ovpn_crypto_key_slot *ks) ++{ ++ return OVPN_OP_SIZE_V2 + /* OP header size */ ++ 4 + /* Packet ID */ ++ crypto_aead_authsize(ks->encrypt); /* Auth Tag */ ++} ++ ++int ovpn_aead_encrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, ++ struct sk_buff *skb) ++{ ++ const unsigned int tag_size = crypto_aead_authsize(ks->encrypt); ++ const unsigned int head_size = ovpn_aead_encap_overhead(ks); ++ struct aead_request *req; ++ struct sk_buff *trailer; ++ struct scatterlist *sg; ++ u8 iv[NONCE_SIZE]; ++ int nfrags, ret; ++ u32 pktid, op; ++ ++ ovpn_skb_cb(skb)->orig_len = skb->len; ++ ovpn_skb_cb(skb)->peer = peer; ++ ovpn_skb_cb(skb)->ks = ks; ++ ++ /* Sample AEAD header format: ++ * 48000001 00000005 7e7046bd 444a7e28 cc6387b1 64a4d6c1 380275a... ++ * [ OP32 ] [seq # ] [ auth tag ] [ payload ... 
] ++ * [4-byte ++ * IV head] ++ */ ++ ++ /* check that there's enough headroom in the skb for packet ++ * encapsulation, after adding network header and encryption overhead ++ */ ++ if (unlikely(skb_cow_head(skb, OVPN_HEAD_ROOM + head_size))) ++ return -ENOBUFS; ++ ++ /* get number of skb frags and ensure that packet data is writable */ ++ nfrags = skb_cow_data(skb, 0, &trailer); ++ if (unlikely(nfrags < 0)) ++ return nfrags; ++ ++ if (unlikely(nfrags + 2 > (MAX_SKB_FRAGS + 2))) ++ return -ENOSPC; ++ ++ ovpn_skb_cb(skb)->sg = kmalloc(sizeof(*ovpn_skb_cb(skb)->sg) * ++ (nfrags + 2), GFP_ATOMIC); ++ if (unlikely(!ovpn_skb_cb(skb)->sg)) ++ return -ENOMEM; ++ ++ sg = ovpn_skb_cb(skb)->sg; ++ ++ /* sg table: ++ * 0: op, wire nonce (AD, len=OVPN_OP_SIZE_V2+NONCE_WIRE_SIZE), ++ * 1, 2, 3, ..., n: payload, ++ * n+1: auth_tag (len=tag_size) ++ */ ++ sg_init_table(sg, nfrags + 2); ++ ++ /* build scatterlist to encrypt packet payload */ ++ ret = skb_to_sgvec_nomark(skb, sg + 1, 0, skb->len); ++ if (unlikely(nfrags != ret)) { ++ ret = -EINVAL; ++ goto free_sg; ++ } ++ ++ /* append auth_tag onto scatterlist */ ++ __skb_push(skb, tag_size); ++ sg_set_buf(sg + nfrags + 1, skb->data, tag_size); ++ ++ /* obtain packet ID, which is used both as a first ++ * 4 bytes of nonce and last 4 bytes of associated data. ++ */ ++ ret = ovpn_pktid_xmit_next(&ks->pid_xmit, &pktid); ++ if (unlikely(ret < 0)) ++ goto free_sg; ++ ++ /* concat 4 bytes packet id and 8 bytes nonce tail into 12 bytes ++ * nonce ++ */ ++ ovpn_pktid_aead_write(pktid, &ks->nonce_tail_xmit, iv); ++ ++ /* make space for packet id and push it to the front */ ++ __skb_push(skb, NONCE_WIRE_SIZE); ++ memcpy(skb->data, iv, NONCE_WIRE_SIZE); ++ ++ /* add packet op as head of additional data */ ++ op = ovpn_opcode_compose(OVPN_DATA_V2, ks->key_id, peer->id); ++ __skb_push(skb, OVPN_OP_SIZE_V2); ++ BUILD_BUG_ON(sizeof(op) != OVPN_OP_SIZE_V2); ++ *((__force __be32 *)skb->data) = htonl(op); ++ ++ /* AEAD Additional data */ ++ sg_set_buf(sg, skb->data, OVPN_OP_SIZE_V2 + NONCE_WIRE_SIZE); ++ ++ req = aead_request_alloc(ks->encrypt, GFP_ATOMIC); ++ if (unlikely(!req)) { ++ ret = -ENOMEM; ++ goto free_sg; ++ } ++ ++ ovpn_skb_cb(skb)->req = req; ++ ++ /* setup async crypto operation */ ++ aead_request_set_tfm(req, ks->encrypt); ++ aead_request_set_callback(req, 0, ovpn_encrypt_post, skb); ++ aead_request_set_crypt(req, sg, sg, skb->len - head_size, iv); ++ aead_request_set_ad(req, OVPN_OP_SIZE_V2 + NONCE_WIRE_SIZE); ++ ++ /* encrypt it */ ++ return crypto_aead_encrypt(req); ++free_sg: ++ kfree(ovpn_skb_cb(skb)->sg); ++ ovpn_skb_cb(skb)->sg = NULL; ++ return ret; ++} ++ ++int ovpn_aead_decrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, ++ struct sk_buff *skb) ++{ ++ const unsigned int tag_size = crypto_aead_authsize(ks->decrypt); ++ int ret, payload_len, nfrags; ++ unsigned int payload_offset; ++ struct aead_request *req; ++ struct sk_buff *trailer; ++ struct scatterlist *sg; ++ unsigned int sg_len; ++ u8 iv[NONCE_SIZE]; ++ ++ payload_offset = OVPN_OP_SIZE_V2 + NONCE_WIRE_SIZE + tag_size; ++ payload_len = skb->len - payload_offset; ++ ++ ovpn_skb_cb(skb)->orig_len = skb->len; ++ ovpn_skb_cb(skb)->payload_offset = payload_offset; ++ ovpn_skb_cb(skb)->peer = peer; ++ ovpn_skb_cb(skb)->ks = ks; ++ ++ /* sanity check on packet size, payload size must be >= 0 */ ++ if (unlikely(payload_len < 0)) ++ return -EINVAL; ++ ++ /* Prepare the skb data buffer to be accessed up until the auth tag. 
++ * This is required because this area is directly mapped into the sg ++ * list. ++ */ ++ if (unlikely(!pskb_may_pull(skb, payload_offset))) ++ return -ENODATA; ++ ++ /* get number of skb frags and ensure that packet data is writable */ ++ nfrags = skb_cow_data(skb, 0, &trailer); ++ if (unlikely(nfrags < 0)) ++ return nfrags; ++ ++ if (unlikely(nfrags + 2 > (MAX_SKB_FRAGS + 2))) ++ return -ENOSPC; ++ ++ ovpn_skb_cb(skb)->sg = kmalloc(sizeof(*ovpn_skb_cb(skb)->sg) * ++ (nfrags + 2), GFP_ATOMIC); ++ if (unlikely(!ovpn_skb_cb(skb)->sg)) ++ return -ENOMEM; ++ ++ sg = ovpn_skb_cb(skb)->sg; ++ ++ /* sg table: ++ * 0: op, wire nonce (AD, len=OVPN_OP_SIZE_V2+NONCE_WIRE_SIZE), ++ * 1, 2, 3, ..., n: payload, ++ * n+1: auth_tag (len=tag_size) ++ */ ++ sg_init_table(sg, nfrags + 2); ++ ++ /* packet op is head of additional data */ ++ sg_len = OVPN_OP_SIZE_V2 + NONCE_WIRE_SIZE; ++ sg_set_buf(sg, skb->data, sg_len); ++ ++ /* build scatterlist to decrypt packet payload */ ++ ret = skb_to_sgvec_nomark(skb, sg + 1, payload_offset, payload_len); ++ if (unlikely(nfrags != ret)) { ++ ret = -EINVAL; ++ goto free_sg; ++ } ++ ++ /* append auth_tag onto scatterlist */ ++ sg_set_buf(sg + nfrags + 1, skb->data + sg_len, tag_size); ++ ++ /* copy nonce into IV buffer */ ++ memcpy(iv, skb->data + OVPN_OP_SIZE_V2, NONCE_WIRE_SIZE); ++ memcpy(iv + NONCE_WIRE_SIZE, ks->nonce_tail_recv.u8, ++ sizeof(struct ovpn_nonce_tail)); ++ ++ req = aead_request_alloc(ks->decrypt, GFP_ATOMIC); ++ if (unlikely(!req)) { ++ ret = -ENOMEM; ++ goto free_sg; ++ } ++ ++ ovpn_skb_cb(skb)->req = req; ++ ++ /* setup async crypto operation */ ++ aead_request_set_tfm(req, ks->decrypt); ++ aead_request_set_callback(req, 0, ovpn_decrypt_post, skb); ++ aead_request_set_crypt(req, sg, sg, payload_len + tag_size, iv); ++ ++ aead_request_set_ad(req, NONCE_WIRE_SIZE + OVPN_OP_SIZE_V2); ++ ++ /* decrypt it */ ++ return crypto_aead_decrypt(req); ++free_sg: ++ kfree(ovpn_skb_cb(skb)->sg); ++ ovpn_skb_cb(skb)->sg = NULL; ++ return ret; ++} ++ ++/* Initialize a struct crypto_aead object */ ++struct crypto_aead *ovpn_aead_init(const char *title, const char *alg_name, ++ const unsigned char *key, ++ unsigned int keylen) ++{ ++ struct crypto_aead *aead; ++ int ret; ++ ++ aead = crypto_alloc_aead(alg_name, 0, 0); ++ if (IS_ERR(aead)) { ++ ret = PTR_ERR(aead); ++ pr_err("%s crypto_alloc_aead failed, err=%d\n", title, ret); ++ aead = NULL; ++ goto error; ++ } ++ ++ ret = crypto_aead_setkey(aead, key, keylen); ++ if (ret) { ++ pr_err("%s crypto_aead_setkey size=%u failed, err=%d\n", title, ++ keylen, ret); ++ goto error; ++ } ++ ++ ret = crypto_aead_setauthsize(aead, AUTH_TAG_SIZE); ++ if (ret) { ++ pr_err("%s crypto_aead_setauthsize failed, err=%d\n", title, ++ ret); ++ goto error; ++ } ++ ++ /* basic AEAD assumption */ ++ if (crypto_aead_ivsize(aead) != NONCE_SIZE) { ++ pr_err("%s IV size must be %d\n", title, NONCE_SIZE); ++ ret = -EINVAL; ++ goto error; ++ } ++ ++ pr_debug("********* Cipher %s (%s)\n", alg_name, title); ++ pr_debug("*** IV size=%u\n", crypto_aead_ivsize(aead)); ++ pr_debug("*** req size=%u\n", crypto_aead_reqsize(aead)); ++ pr_debug("*** block size=%u\n", crypto_aead_blocksize(aead)); ++ pr_debug("*** auth size=%u\n", crypto_aead_authsize(aead)); ++ pr_debug("*** alignmask=0x%x\n", crypto_aead_alignmask(aead)); ++ ++ return aead; ++ ++error: ++ crypto_free_aead(aead); ++ return ERR_PTR(ret); ++} ++ ++void ovpn_aead_crypto_key_slot_destroy(struct ovpn_crypto_key_slot *ks) ++{ ++ if (!ks) ++ return; ++ ++ crypto_free_aead(ks->encrypt); ++ 
crypto_free_aead(ks->decrypt); ++ kfree(ks); ++} ++ ++struct ovpn_crypto_key_slot * ++ovpn_aead_crypto_key_slot_new(const struct ovpn_key_config *kc) ++{ ++ struct ovpn_crypto_key_slot *ks = NULL; ++ const char *alg_name; ++ int ret; ++ ++ /* validate crypto alg */ ++ switch (kc->cipher_alg) { ++ case OVPN_CIPHER_ALG_AES_GCM: ++ alg_name = ALG_NAME_AES; ++ break; ++ case OVPN_CIPHER_ALG_CHACHA20_POLY1305: ++ alg_name = ALG_NAME_CHACHAPOLY; ++ break; ++ default: ++ return ERR_PTR(-EOPNOTSUPP); ++ } ++ ++ if (sizeof(struct ovpn_nonce_tail) != kc->encrypt.nonce_tail_size || ++ sizeof(struct ovpn_nonce_tail) != kc->decrypt.nonce_tail_size) ++ return ERR_PTR(-EINVAL); ++ ++ /* build the key slot */ ++ ks = kmalloc(sizeof(*ks), GFP_KERNEL); ++ if (!ks) ++ return ERR_PTR(-ENOMEM); ++ ++ ks->encrypt = NULL; ++ ks->decrypt = NULL; ++ kref_init(&ks->refcount); ++ ks->key_id = kc->key_id; ++ ++ ks->encrypt = ovpn_aead_init("encrypt", alg_name, ++ kc->encrypt.cipher_key, ++ kc->encrypt.cipher_key_size); ++ if (IS_ERR(ks->encrypt)) { ++ ret = PTR_ERR(ks->encrypt); ++ ks->encrypt = NULL; ++ goto destroy_ks; ++ } ++ ++ ks->decrypt = ovpn_aead_init("decrypt", alg_name, ++ kc->decrypt.cipher_key, ++ kc->decrypt.cipher_key_size); ++ if (IS_ERR(ks->decrypt)) { ++ ret = PTR_ERR(ks->decrypt); ++ ks->decrypt = NULL; ++ goto destroy_ks; ++ } ++ ++ memcpy(ks->nonce_tail_xmit.u8, kc->encrypt.nonce_tail, ++ sizeof(struct ovpn_nonce_tail)); ++ memcpy(ks->nonce_tail_recv.u8, kc->decrypt.nonce_tail, ++ sizeof(struct ovpn_nonce_tail)); ++ ++ /* init packet ID generation/validation */ ++ ovpn_pktid_xmit_init(&ks->pid_xmit); ++ ovpn_pktid_recv_init(&ks->pid_recv); ++ ++ return ks; ++ ++destroy_ks: ++ ovpn_aead_crypto_key_slot_destroy(ks); ++ return ERR_PTR(ret); ++} ++ ++enum ovpn_cipher_alg ovpn_aead_crypto_alg(struct ovpn_crypto_key_slot *ks) ++{ ++ const char *alg_name; ++ ++ if (!ks->encrypt) ++ return OVPN_CIPHER_ALG_NONE; ++ ++ alg_name = crypto_tfm_alg_name(crypto_aead_tfm(ks->encrypt)); ++ ++ if (!strcmp(alg_name, ALG_NAME_AES)) ++ return OVPN_CIPHER_ALG_AES_GCM; ++ else if (!strcmp(alg_name, ALG_NAME_CHACHAPOLY)) ++ return OVPN_CIPHER_ALG_CHACHA20_POLY1305; ++ else ++ return OVPN_CIPHER_ALG_NONE; ++} +diff --git a/drivers/net/ovpn/crypto_aead.h b/drivers/net/ovpn/crypto_aead.h +new file mode 100644 +index 000000000000..fb65be82436e +--- /dev/null ++++ b/drivers/net/ovpn/crypto_aead.h +@@ -0,0 +1,33 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. 
++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#ifndef _NET_OVPN_OVPNAEAD_H_ ++#define _NET_OVPN_OVPNAEAD_H_ ++ ++#include "crypto.h" ++ ++#include ++#include ++ ++struct crypto_aead *ovpn_aead_init(const char *title, const char *alg_name, ++ const unsigned char *key, ++ unsigned int keylen); ++ ++int ovpn_aead_encrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, ++ struct sk_buff *skb); ++int ovpn_aead_decrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, ++ struct sk_buff *skb); ++ ++struct ovpn_crypto_key_slot * ++ovpn_aead_crypto_key_slot_new(const struct ovpn_key_config *kc); ++void ovpn_aead_crypto_key_slot_destroy(struct ovpn_crypto_key_slot *ks); ++ ++enum ovpn_cipher_alg ovpn_aead_crypto_alg(struct ovpn_crypto_key_slot *ks); ++ ++#endif /* _NET_OVPN_OVPNAEAD_H_ */ +diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c +new file mode 100644 +index 000000000000..c04791a508e5 +--- /dev/null ++++ b/drivers/net/ovpn/io.c +@@ -0,0 +1,462 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2019-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "ovpnstruct.h" ++#include "peer.h" ++#include "io.h" ++#include "bind.h" ++#include "crypto.h" ++#include "crypto_aead.h" ++#include "netlink.h" ++#include "proto.h" ++#include "tcp.h" ++#include "udp.h" ++#include "skb.h" ++#include "socket.h" ++ ++const unsigned char ovpn_keepalive_message[OVPN_KEEPALIVE_SIZE] = { ++ 0x2a, 0x18, 0x7b, 0xf3, 0x64, 0x1e, 0xb4, 0xcb, ++ 0x07, 0xed, 0x2d, 0x0a, 0x98, 0x1f, 0xc7, 0x48 ++}; ++ ++/** ++ * ovpn_is_keepalive - check if skb contains a keepalive message ++ * @skb: packet to check ++ * ++ * Assumes that the first byte of skb->data is defined. ++ * ++ * Return: true if skb contains a keepalive or false otherwise ++ */ ++static bool ovpn_is_keepalive(struct sk_buff *skb) ++{ ++ if (*skb->data != ovpn_keepalive_message[0]) ++ return false; ++ ++ if (skb->len != OVPN_KEEPALIVE_SIZE) ++ return false; ++ ++ if (!pskb_may_pull(skb, OVPN_KEEPALIVE_SIZE)) ++ return false; ++ ++ return !memcmp(skb->data, ovpn_keepalive_message, OVPN_KEEPALIVE_SIZE); ++} ++ ++/* Called after decrypt to write the IP packet to the device. ++ * This method is expected to manage/free the skb. 
++ */ ++static void ovpn_netdev_write(struct ovpn_peer *peer, struct sk_buff *skb) ++{ ++ unsigned int pkt_len; ++ ++ /* we can't guarantee the packet wasn't corrupted before entering the ++ * VPN, therefore we give other layers a chance to check that ++ */ ++ skb->ip_summed = CHECKSUM_NONE; ++ ++ /* skb hash for transport packet no longer valid after decapsulation */ ++ skb_clear_hash(skb); ++ ++ /* post-decrypt scrub -- prepare to inject encapsulated packet onto the ++ * interface, based on __skb_tunnel_rx() in dst.h ++ */ ++ skb->dev = peer->ovpn->dev; ++ skb_set_queue_mapping(skb, 0); ++ skb_scrub_packet(skb, true); ++ ++ skb_reset_network_header(skb); ++ skb_reset_transport_header(skb); ++ skb_probe_transport_header(skb); ++ skb_reset_inner_headers(skb); ++ ++ memset(skb->cb, 0, sizeof(skb->cb)); ++ ++ /* cause packet to be "received" by the interface */ ++ pkt_len = skb->len; ++ if (likely(gro_cells_receive(&peer->ovpn->gro_cells, ++ skb) == NET_RX_SUCCESS)) ++ /* update RX stats with the size of decrypted packet */ ++ dev_sw_netstats_rx_add(peer->ovpn->dev, pkt_len); ++} ++ ++void ovpn_decrypt_post(void *data, int ret) ++{ ++ struct ovpn_crypto_key_slot *ks; ++ unsigned int payload_offset = 0; ++ struct sk_buff *skb = data; ++ struct ovpn_peer *peer; ++ unsigned int orig_len; ++ __be16 proto; ++ __be32 *pid; ++ ++ /* crypto is happening asynchronously. this function will be called ++ * again later by the crypto callback with a proper return code ++ */ ++ if (unlikely(ret == -EINPROGRESS)) ++ return; ++ ++ payload_offset = ovpn_skb_cb(skb)->payload_offset; ++ ks = ovpn_skb_cb(skb)->ks; ++ peer = ovpn_skb_cb(skb)->peer; ++ orig_len = ovpn_skb_cb(skb)->orig_len; ++ ++ /* crypto is done, cleanup skb CB and its members */ ++ ++ if (likely(ovpn_skb_cb(skb)->sg)) ++ kfree(ovpn_skb_cb(skb)->sg); ++ ++ if (likely(ovpn_skb_cb(skb)->req)) ++ aead_request_free(ovpn_skb_cb(skb)->req); ++ ++ if (unlikely(ret < 0)) ++ goto drop; ++ ++ /* PID sits after the op */ ++ pid = (__force __be32 *)(skb->data + OVPN_OP_SIZE_V2); ++ ret = ovpn_pktid_recv(&ks->pid_recv, ntohl(*pid), 0); ++ if (unlikely(ret < 0)) { ++ net_err_ratelimited("%s: PKT ID RX error: %d\n", ++ peer->ovpn->dev->name, ret); ++ goto drop; ++ } ++ ++ /* keep track of last received authenticated packet for keepalive */ ++ peer->last_recv = ktime_get_real_seconds(); ++ ++ if (peer->sock->sock->sk->sk_protocol == IPPROTO_UDP) { ++ /* check if this peer changed it's IP address and update ++ * state ++ */ ++ ovpn_peer_float(peer, skb); ++ /* update source endpoint for this peer */ ++ ovpn_peer_update_local_endpoint(peer, skb); ++ } ++ ++ /* point to encapsulated IP packet */ ++ __skb_pull(skb, payload_offset); ++ ++ /* check if this is a valid datapacket that has to be delivered to the ++ * ovpn interface ++ */ ++ skb_reset_network_header(skb); ++ proto = ovpn_ip_check_protocol(skb); ++ if (unlikely(!proto)) { ++ /* check if null packet */ ++ if (unlikely(!pskb_may_pull(skb, 1))) { ++ net_info_ratelimited("%s: NULL packet received from peer %u\n", ++ peer->ovpn->dev->name, peer->id); ++ goto drop; ++ } ++ ++ if (ovpn_is_keepalive(skb)) { ++ net_dbg_ratelimited("%s: ping received from peer %u\n", ++ peer->ovpn->dev->name, peer->id); ++ goto drop; ++ } ++ ++ net_info_ratelimited("%s: unsupported protocol received from peer %u\n", ++ peer->ovpn->dev->name, peer->id); ++ goto drop; ++ } ++ skb->protocol = proto; ++ ++ /* perform Reverse Path Filtering (RPF) */ ++ if (unlikely(!ovpn_peer_check_by_src(peer->ovpn, skb, peer))) { ++ if 
(skb_protocol_to_family(skb) == AF_INET6) ++ net_dbg_ratelimited("%s: RPF dropped packet from peer %u, src: %pI6c\n", ++ peer->ovpn->dev->name, peer->id, ++ &ipv6_hdr(skb)->saddr); ++ else ++ net_dbg_ratelimited("%s: RPF dropped packet from peer %u, src: %pI4\n", ++ peer->ovpn->dev->name, peer->id, ++ &ip_hdr(skb)->saddr); ++ goto drop; ++ } ++ ++ /* increment RX stats */ ++ ovpn_peer_stats_increment_rx(&peer->vpn_stats, skb->len); ++ ovpn_peer_stats_increment_rx(&peer->link_stats, orig_len); ++ ++ ovpn_netdev_write(peer, skb); ++ /* skb is passed to upper layer - don't free it */ ++ skb = NULL; ++drop: ++ if (unlikely(skb)) ++ dev_core_stats_rx_dropped_inc(peer->ovpn->dev); ++ if (likely(peer)) ++ ovpn_peer_put(peer); ++ if (likely(ks)) ++ ovpn_crypto_key_slot_put(ks); ++ kfree_skb(skb); ++} ++ ++/* pick next packet from RX queue, decrypt and forward it to the device */ ++void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb) ++{ ++ struct ovpn_crypto_key_slot *ks; ++ u8 key_id; ++ ++ /* get the key slot matching the key ID in the received packet */ ++ key_id = ovpn_key_id_from_skb(skb); ++ ks = ovpn_crypto_key_id_to_slot(&peer->crypto, key_id); ++ if (unlikely(!ks)) { ++ net_info_ratelimited("%s: no available key for peer %u, key-id: %u\n", ++ peer->ovpn->dev->name, peer->id, key_id); ++ dev_core_stats_rx_dropped_inc(peer->ovpn->dev); ++ kfree_skb(skb); ++ return; ++ } ++ ++ memset(ovpn_skb_cb(skb), 0, sizeof(struct ovpn_cb)); ++ ovpn_decrypt_post(skb, ovpn_aead_decrypt(peer, ks, skb)); ++} ++ ++void ovpn_encrypt_post(void *data, int ret) ++{ ++ struct ovpn_crypto_key_slot *ks; ++ struct sk_buff *skb = data; ++ struct ovpn_peer *peer; ++ unsigned int orig_len; ++ ++ /* encryption is happening asynchronously. This function will be ++ * called later by the crypto callback with a proper return value ++ */ ++ if (unlikely(ret == -EINPROGRESS)) ++ return; ++ ++ ks = ovpn_skb_cb(skb)->ks; ++ peer = ovpn_skb_cb(skb)->peer; ++ orig_len = ovpn_skb_cb(skb)->orig_len; ++ ++ /* crypto is done, cleanup skb CB and its members */ ++ ++ if (likely(ovpn_skb_cb(skb)->sg)) ++ kfree(ovpn_skb_cb(skb)->sg); ++ ++ if (likely(ovpn_skb_cb(skb)->req)) ++ aead_request_free(ovpn_skb_cb(skb)->req); ++ ++ if (unlikely(ret == -ERANGE)) { ++ /* we ran out of IVs and we must kill the key as it can't be ++ * use anymore ++ */ ++ netdev_warn(peer->ovpn->dev, ++ "killing key %u for peer %u\n", ks->key_id, ++ peer->id); ++ ovpn_crypto_kill_key(&peer->crypto, ks->key_id); ++ /* let userspace know so that a new key must be negotiated */ ++ ovpn_nl_key_swap_notify(peer, ks->key_id); ++ goto err; ++ } ++ ++ if (unlikely(ret < 0)) ++ goto err; ++ ++ skb_mark_not_on_list(skb); ++ ovpn_peer_stats_increment_tx(&peer->link_stats, skb->len); ++ ovpn_peer_stats_increment_tx(&peer->vpn_stats, orig_len); ++ ++ switch (peer->sock->sock->sk->sk_protocol) { ++ case IPPROTO_UDP: ++ ovpn_udp_send_skb(peer->ovpn, peer, skb); ++ break; ++ case IPPROTO_TCP: ++ ovpn_tcp_send_skb(peer, skb); ++ break; ++ default: ++ /* no transport configured yet */ ++ goto err; ++ } ++ ++ /* keep track of last sent packet for keepalive */ ++ peer->last_sent = ktime_get_real_seconds(); ++ ++ /* skb passed down the stack - don't free it */ ++ skb = NULL; ++err: ++ if (unlikely(skb)) ++ dev_core_stats_tx_dropped_inc(peer->ovpn->dev); ++ if (likely(peer)) ++ ovpn_peer_put(peer); ++ if (likely(ks)) ++ ovpn_crypto_key_slot_put(ks); ++ kfree_skb(skb); ++} ++ ++static bool ovpn_encrypt_one(struct ovpn_peer *peer, struct sk_buff *skb) ++{ ++ struct 
ovpn_crypto_key_slot *ks; ++ ++ if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL && ++ skb_checksum_help(skb))) { ++ net_warn_ratelimited("%s: cannot compute checksum for outgoing packet\n", ++ peer->ovpn->dev->name); ++ return false; ++ } ++ ++ /* get primary key to be used for encrypting data */ ++ ks = ovpn_crypto_key_slot_primary(&peer->crypto); ++ if (unlikely(!ks)) { ++ net_warn_ratelimited("%s: error while retrieving primary key slot for peer %u\n", ++ peer->ovpn->dev->name, peer->id); ++ return false; ++ } ++ ++ /* take a reference to the peer because the crypto code may run async. ++ * ovpn_encrypt_post() will release it upon completion ++ */ ++ if (unlikely(!ovpn_peer_hold(peer))) { ++ DEBUG_NET_WARN_ON_ONCE(1); ++ return false; ++ } ++ ++ memset(ovpn_skb_cb(skb), 0, sizeof(struct ovpn_cb)); ++ ovpn_encrypt_post(skb, ovpn_aead_encrypt(peer, ks, skb)); ++ return true; ++} ++ ++/* send skb to connected peer, if any */ ++static void ovpn_send(struct ovpn_struct *ovpn, struct sk_buff *skb, ++ struct ovpn_peer *peer) ++{ ++ struct sk_buff *curr, *next; ++ ++ if (likely(!peer)) ++ /* retrieve peer serving the destination IP of this packet */ ++ peer = ovpn_peer_get_by_dst(ovpn, skb); ++ if (unlikely(!peer)) { ++ net_dbg_ratelimited("%s: no peer to send data to\n", ++ ovpn->dev->name); ++ dev_core_stats_tx_dropped_inc(ovpn->dev); ++ goto drop; ++ } ++ ++ /* this might be a GSO-segmented skb list: process each skb ++ * independently ++ */ ++ skb_list_walk_safe(skb, curr, next) ++ if (unlikely(!ovpn_encrypt_one(peer, curr))) { ++ dev_core_stats_tx_dropped_inc(ovpn->dev); ++ kfree_skb(curr); ++ } ++ ++ /* skb passed over, no need to free */ ++ skb = NULL; ++drop: ++ if (likely(peer)) ++ ovpn_peer_put(peer); ++ kfree_skb_list(skb); ++} ++ ++/* Send user data to the network ++ */ ++netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct ovpn_struct *ovpn = netdev_priv(dev); ++ struct sk_buff *segments, *curr, *next; ++ struct sk_buff_head skb_list; ++ __be16 proto; ++ int ret; ++ ++ /* reset netfilter state */ ++ nf_reset_ct(skb); ++ ++ /* verify IP header size in network packet */ ++ proto = ovpn_ip_check_protocol(skb); ++ if (unlikely(!proto || skb->protocol != proto)) { ++ net_err_ratelimited("%s: dropping malformed payload packet\n", ++ dev->name); ++ dev_core_stats_tx_dropped_inc(ovpn->dev); ++ goto drop; ++ } ++ ++ if (skb_is_gso(skb)) { ++ segments = skb_gso_segment(skb, 0); ++ if (IS_ERR(segments)) { ++ ret = PTR_ERR(segments); ++ net_err_ratelimited("%s: cannot segment packet: %d\n", ++ dev->name, ret); ++ dev_core_stats_tx_dropped_inc(ovpn->dev); ++ goto drop; ++ } ++ ++ consume_skb(skb); ++ skb = segments; ++ } ++ ++ /* from this moment on, "skb" might be a list */ ++ ++ __skb_queue_head_init(&skb_list); ++ skb_list_walk_safe(skb, curr, next) { ++ skb_mark_not_on_list(curr); ++ ++ curr = skb_share_check(curr, GFP_ATOMIC); ++ if (unlikely(!curr)) { ++ net_err_ratelimited("%s: skb_share_check failed\n", ++ dev->name); ++ dev_core_stats_tx_dropped_inc(ovpn->dev); ++ continue; ++ } ++ ++ __skb_queue_tail(&skb_list, curr); ++ } ++ skb_list.prev->next = NULL; ++ ++ ovpn_send(ovpn, skb_list.next, NULL); ++ ++ return NETDEV_TX_OK; ++ ++drop: ++ skb_tx_error(skb); ++ kfree_skb_list(skb); ++ return NET_XMIT_DROP; ++} ++ ++/** ++ * ovpn_xmit_special - encrypt and transmit an out-of-band message to peer ++ * @peer: peer to send the message to ++ * @data: message content ++ * @len: message length ++ * ++ * Assumes that caller holds a reference to peer ++ */ ++void 
ovpn_xmit_special(struct ovpn_peer *peer, const void *data, ++ const unsigned int len) ++{ ++ struct ovpn_struct *ovpn; ++ struct sk_buff *skb; ++ ++ ovpn = peer->ovpn; ++ if (unlikely(!ovpn)) ++ return; ++ ++ skb = alloc_skb(256 + len, GFP_ATOMIC); ++ if (unlikely(!skb)) ++ return; ++ ++ skb_reserve(skb, 128); ++ skb->priority = TC_PRIO_BESTEFFORT; ++ __skb_put_data(skb, data, len); ++ ++ /* increase reference counter when passing peer to sending queue */ ++ if (!ovpn_peer_hold(peer)) { ++ netdev_dbg(ovpn->dev, "%s: cannot hold peer reference for sending special packet\n", ++ __func__); ++ kfree_skb(skb); ++ return; ++ } ++ ++ ovpn_send(ovpn, skb, peer); ++} +diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h +new file mode 100644 +index 000000000000..eb224114152c +--- /dev/null ++++ b/drivers/net/ovpn/io.h +@@ -0,0 +1,25 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2019-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#ifndef _NET_OVPN_OVPN_H_ ++#define _NET_OVPN_OVPN_H_ ++ ++#define OVPN_KEEPALIVE_SIZE 16 ++extern const unsigned char ovpn_keepalive_message[OVPN_KEEPALIVE_SIZE]; ++ ++netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev); ++ ++void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb); ++void ovpn_xmit_special(struct ovpn_peer *peer, const void *data, ++ const unsigned int len); ++ ++void ovpn_encrypt_post(void *data, int ret); ++void ovpn_decrypt_post(void *data, int ret); ++ ++#endif /* _NET_OVPN_OVPN_H_ */ +diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c +new file mode 100644 +index 000000000000..9dcf51ae1497 +--- /dev/null ++++ b/drivers/net/ovpn/main.c +@@ -0,0 +1,337 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. ++ * ++ * Author: Antonio Quartulli ++ * James Yonan ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "ovpnstruct.h" ++#include "main.h" ++#include "netlink.h" ++#include "io.h" ++#include "packet.h" ++#include "peer.h" ++#include "tcp.h" ++ ++/* Driver info */ ++#define DRV_DESCRIPTION "OpenVPN data channel offload (ovpn)" ++#define DRV_COPYRIGHT "(C) 2020-2024 OpenVPN, Inc." ++ ++static void ovpn_struct_free(struct net_device *net) ++{ ++ struct ovpn_struct *ovpn = netdev_priv(net); ++ ++ kfree(ovpn->peers); ++} ++ ++static int ovpn_net_init(struct net_device *dev) ++{ ++ struct ovpn_struct *ovpn = netdev_priv(dev); ++ ++ return gro_cells_init(&ovpn->gro_cells, dev); ++} ++ ++static void ovpn_net_uninit(struct net_device *dev) ++{ ++ struct ovpn_struct *ovpn = netdev_priv(dev); ++ ++ gro_cells_destroy(&ovpn->gro_cells); ++} ++ ++static int ovpn_net_open(struct net_device *dev) ++{ ++ /* ovpn keeps the carrier always on to avoid losing IP or route ++ * configuration upon disconnection. This way it can prevent leaks ++ * of traffic outside of the VPN tunnel. ++ * The user may override this behaviour by tearing down the interface ++ * manually. 
++ */ ++ netif_carrier_on(dev); ++ netif_tx_start_all_queues(dev); ++ return 0; ++} ++ ++static int ovpn_net_stop(struct net_device *dev) ++{ ++ netif_tx_stop_all_queues(dev); ++ return 0; ++} ++ ++static const struct net_device_ops ovpn_netdev_ops = { ++ .ndo_init = ovpn_net_init, ++ .ndo_uninit = ovpn_net_uninit, ++ .ndo_open = ovpn_net_open, ++ .ndo_stop = ovpn_net_stop, ++ .ndo_start_xmit = ovpn_net_xmit, ++}; ++ ++static const struct device_type ovpn_type = { ++ .name = OVPN_FAMILY_NAME, ++}; ++ ++static const struct nla_policy ovpn_policy[IFLA_OVPN_MAX + 1] = { ++ [IFLA_OVPN_MODE] = NLA_POLICY_RANGE(NLA_U8, OVPN_MODE_P2P, ++ OVPN_MODE_MP), ++}; ++ ++/** ++ * ovpn_dev_is_valid - check if the netdevice is of type 'ovpn' ++ * @dev: the interface to check ++ * ++ * Return: whether the netdevice is of type 'ovpn' ++ */ ++bool ovpn_dev_is_valid(const struct net_device *dev) ++{ ++ return dev->netdev_ops->ndo_start_xmit == ovpn_net_xmit; ++} ++ ++static void ovpn_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *info) ++{ ++ strscpy(info->driver, OVPN_FAMILY_NAME, sizeof(info->driver)); ++ strscpy(info->bus_info, "ovpn", sizeof(info->bus_info)); ++} ++ ++static const struct ethtool_ops ovpn_ethtool_ops = { ++ .get_drvinfo = ovpn_get_drvinfo, ++ .get_link = ethtool_op_get_link, ++ .get_ts_info = ethtool_op_get_ts_info, ++}; ++ ++static void ovpn_setup(struct net_device *dev) ++{ ++ /* compute the overhead considering AEAD encryption */ ++ const int overhead = sizeof(u32) + NONCE_WIRE_SIZE + 16 + ++ sizeof(struct udphdr) + ++ max(sizeof(struct ipv6hdr), sizeof(struct iphdr)); ++ ++ netdev_features_t feat = NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM | ++ NETIF_F_GSO | NETIF_F_GSO_SOFTWARE | ++ NETIF_F_HIGHDMA; ++ ++ dev->needs_free_netdev = true; ++ ++ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; ++ ++ dev->ethtool_ops = &ovpn_ethtool_ops; ++ dev->netdev_ops = &ovpn_netdev_ops; ++ ++ dev->priv_destructor = ovpn_struct_free; ++ ++ dev->hard_header_len = 0; ++ dev->addr_len = 0; ++ dev->mtu = ETH_DATA_LEN - overhead; ++ dev->min_mtu = IPV4_MIN_MTU; ++ dev->max_mtu = IP_MAX_MTU - overhead; ++ ++ dev->type = ARPHRD_NONE; ++ dev->flags = IFF_POINTOPOINT | IFF_NOARP; ++ dev->priv_flags |= IFF_NO_QUEUE; ++ ++ dev->lltx = true; ++ dev->features |= feat; ++ dev->hw_features |= feat; ++ dev->hw_enc_features |= feat; ++ ++ dev->needed_headroom = OVPN_HEAD_ROOM; ++ dev->needed_tailroom = OVPN_MAX_PADDING; ++ ++ SET_NETDEV_DEVTYPE(dev, &ovpn_type); ++} ++ ++static int ovpn_mp_alloc(struct ovpn_struct *ovpn) ++{ ++ struct in_device *dev_v4; ++ int i; ++ ++ if (ovpn->mode != OVPN_MODE_MP) ++ return 0; ++ ++ dev_v4 = __in_dev_get_rtnl(ovpn->dev); ++ if (dev_v4) { ++ /* disable redirects as Linux gets confused by ovpn ++ * handling same-LAN routing. ++ * This happens because a multipeer interface is used as ++ * relay point between hosts in the same subnet, while ++ * in a classic LAN this would not be needed because the ++ * two hosts would be able to talk directly. 
++ */ ++ IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false); ++ IPV4_DEVCONF_ALL(dev_net(ovpn->dev), SEND_REDIRECTS) = false; ++ } ++ ++ /* the peer container is fairly large, therefore we allocate it only in ++ * MP mode ++ */ ++ ovpn->peers = kzalloc(sizeof(*ovpn->peers), GFP_KERNEL); ++ if (!ovpn->peers) ++ return -ENOMEM; ++ ++ spin_lock_init(&ovpn->peers->lock); ++ ++ for (i = 0; i < ARRAY_SIZE(ovpn->peers->by_id); i++) { ++ INIT_HLIST_HEAD(&ovpn->peers->by_id[i]); ++ INIT_HLIST_NULLS_HEAD(&ovpn->peers->by_vpn_addr[i], i); ++ INIT_HLIST_NULLS_HEAD(&ovpn->peers->by_transp_addr[i], i); ++ } ++ ++ return 0; ++} ++ ++static int ovpn_newlink(struct net *src_net, struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct ovpn_struct *ovpn = netdev_priv(dev); ++ enum ovpn_mode mode = OVPN_MODE_P2P; ++ int err; ++ ++ if (data && data[IFLA_OVPN_MODE]) { ++ mode = nla_get_u8(data[IFLA_OVPN_MODE]); ++ netdev_dbg(dev, "setting device mode: %u\n", mode); ++ } ++ ++ ovpn->dev = dev; ++ ovpn->mode = mode; ++ spin_lock_init(&ovpn->lock); ++ INIT_DELAYED_WORK(&ovpn->keepalive_work, ovpn_peer_keepalive_work); ++ ++ err = ovpn_mp_alloc(ovpn); ++ if (err < 0) ++ return err; ++ ++ /* turn carrier explicitly off after registration, this way state is ++ * clearly defined ++ */ ++ netif_carrier_off(dev); ++ ++ return register_netdevice(dev); ++} ++ ++static struct rtnl_link_ops ovpn_link_ops = { ++ .kind = OVPN_FAMILY_NAME, ++ .netns_refund = false, ++ .priv_size = sizeof(struct ovpn_struct), ++ .setup = ovpn_setup, ++ .policy = ovpn_policy, ++ .maxtype = IFLA_OVPN_MAX, ++ .newlink = ovpn_newlink, ++ .dellink = unregister_netdevice_queue, ++}; ++ ++static int ovpn_netdev_notifier_call(struct notifier_block *nb, ++ unsigned long state, void *ptr) ++{ ++ struct net_device *dev = netdev_notifier_info_to_dev(ptr); ++ struct ovpn_struct *ovpn; ++ ++ if (!ovpn_dev_is_valid(dev)) ++ return NOTIFY_DONE; ++ ++ ovpn = netdev_priv(dev); ++ ++ switch (state) { ++ case NETDEV_REGISTER: ++ ovpn->registered = true; ++ break; ++ case NETDEV_UNREGISTER: ++ /* twiddle thumbs on netns device moves */ ++ if (dev->reg_state != NETREG_UNREGISTERING) ++ break; ++ ++ /* can be delivered multiple times, so check registered flag, ++ * then destroy the interface ++ */ ++ if (!ovpn->registered) ++ return NOTIFY_DONE; ++ ++ netif_carrier_off(dev); ++ ovpn->registered = false; ++ ++ cancel_delayed_work_sync(&ovpn->keepalive_work); ++ ++ switch (ovpn->mode) { ++ case OVPN_MODE_P2P: ++ ovpn_peer_release_p2p(ovpn); ++ break; ++ case OVPN_MODE_MP: ++ ovpn_peers_free(ovpn); ++ break; ++ } ++ break; ++ case NETDEV_POST_INIT: ++ case NETDEV_GOING_DOWN: ++ case NETDEV_DOWN: ++ case NETDEV_UP: ++ case NETDEV_PRE_UP: ++ break; ++ default: ++ return NOTIFY_DONE; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block ovpn_netdev_notifier = { ++ .notifier_call = ovpn_netdev_notifier_call, ++}; ++ ++static int __init ovpn_init(void) ++{ ++ int err = register_netdevice_notifier(&ovpn_netdev_notifier); ++ ++ if (err) { ++ pr_err("ovpn: can't register netdevice notifier: %d\n", err); ++ return err; ++ } ++ ++ err = rtnl_link_register(&ovpn_link_ops); ++ if (err) { ++ pr_err("ovpn: can't register rtnl link ops: %d\n", err); ++ goto unreg_netdev; ++ } ++ ++ err = ovpn_nl_register(); ++ if (err) { ++ pr_err("ovpn: can't register netlink family: %d\n", err); ++ goto unreg_rtnl; ++ } ++ ++ ovpn_tcp_init(); ++ ++ return 0; ++ ++unreg_rtnl: ++ rtnl_link_unregister(&ovpn_link_ops); 
++unreg_netdev: ++ unregister_netdevice_notifier(&ovpn_netdev_notifier); ++ return err; ++} ++ ++static __exit void ovpn_cleanup(void) ++{ ++ ovpn_nl_unregister(); ++ rtnl_link_unregister(&ovpn_link_ops); ++ unregister_netdevice_notifier(&ovpn_netdev_notifier); ++ ++ rcu_barrier(); ++} ++ ++module_init(ovpn_init); ++module_exit(ovpn_cleanup); ++ ++MODULE_DESCRIPTION(DRV_DESCRIPTION); ++MODULE_AUTHOR(DRV_COPYRIGHT); ++MODULE_LICENSE("GPL"); +diff --git a/drivers/net/ovpn/main.h b/drivers/net/ovpn/main.h +new file mode 100644 +index 000000000000..28e5c44816e1 +--- /dev/null ++++ b/drivers/net/ovpn/main.h +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2019-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#ifndef _NET_OVPN_MAIN_H_ ++#define _NET_OVPN_MAIN_H_ ++ ++bool ovpn_dev_is_valid(const struct net_device *dev); ++ ++#define SKB_HEADER_LEN \ ++ (max(sizeof(struct iphdr), sizeof(struct ipv6hdr)) + \ ++ sizeof(struct udphdr) + NET_SKB_PAD) ++ ++#define OVPN_HEAD_ROOM ALIGN(16 + SKB_HEADER_LEN, 4) ++#define OVPN_MAX_PADDING 16 ++ ++#define OVPN_QUEUE_LEN 1024 ++ ++#endif /* _NET_OVPN_MAIN_H_ */ +diff --git a/drivers/net/ovpn/netlink-gen.c b/drivers/net/ovpn/netlink-gen.c +new file mode 100644 +index 000000000000..6a43eab9a136 +--- /dev/null ++++ b/drivers/net/ovpn/netlink-gen.c +@@ -0,0 +1,212 @@ ++// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) ++/* Do not edit directly, auto-generated from: */ ++/* Documentation/netlink/specs/ovpn.yaml */ ++/* YNL-GEN kernel source */ ++ ++#include ++#include ++ ++#include "netlink-gen.h" ++ ++#include ++ ++/* Integer value ranges */ ++static const struct netlink_range_validation ovpn_a_peer_id_range = { ++ .max = 16777215ULL, ++}; ++ ++static const struct netlink_range_validation ovpn_a_keyconf_peer_id_range = { ++ .max = 16777215ULL, ++}; ++ ++/* Common nested types */ ++const struct nla_policy ovpn_keyconf_nl_policy[OVPN_A_KEYCONF_DECRYPT_DIR + 1] = { ++ [OVPN_A_KEYCONF_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_keyconf_peer_id_range), ++ [OVPN_A_KEYCONF_SLOT] = NLA_POLICY_MAX(NLA_U32, 1), ++ [OVPN_A_KEYCONF_KEY_ID] = NLA_POLICY_MAX(NLA_U32, 7), ++ [OVPN_A_KEYCONF_CIPHER_ALG] = NLA_POLICY_MAX(NLA_U32, 2), ++ [OVPN_A_KEYCONF_ENCRYPT_DIR] = NLA_POLICY_NESTED(ovpn_keydir_nl_policy), ++ [OVPN_A_KEYCONF_DECRYPT_DIR] = NLA_POLICY_NESTED(ovpn_keydir_nl_policy), ++}; ++ ++const struct nla_policy ovpn_keydir_nl_policy[OVPN_A_KEYDIR_NONCE_TAIL + 1] = { ++ [OVPN_A_KEYDIR_CIPHER_KEY] = NLA_POLICY_MAX_LEN(256), ++ [OVPN_A_KEYDIR_NONCE_TAIL] = NLA_POLICY_EXACT_LEN(OVPN_NONCE_TAIL_SIZE), ++}; ++ ++const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_LINK_TX_PACKETS + 1] = { ++ [OVPN_A_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_peer_id_range), ++ [OVPN_A_PEER_REMOTE_IPV4] = { .type = NLA_U32, }, ++ [OVPN_A_PEER_REMOTE_IPV6] = NLA_POLICY_EXACT_LEN(16), ++ [OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID] = { .type = NLA_U32, }, ++ [OVPN_A_PEER_REMOTE_PORT] = NLA_POLICY_MIN(NLA_U16, 1), ++ [OVPN_A_PEER_SOCKET] = { .type = NLA_U32, }, ++ [OVPN_A_PEER_VPN_IPV4] = { .type = NLA_U32, }, ++ [OVPN_A_PEER_VPN_IPV6] = NLA_POLICY_EXACT_LEN(16), ++ [OVPN_A_PEER_LOCAL_IPV4] = { .type = NLA_U32, }, ++ [OVPN_A_PEER_LOCAL_IPV6] = NLA_POLICY_EXACT_LEN(16), ++ [OVPN_A_PEER_LOCAL_PORT] = NLA_POLICY_MIN(NLA_U16, 1), ++ [OVPN_A_PEER_KEEPALIVE_INTERVAL] = { .type = NLA_U32, }, ++ [OVPN_A_PEER_KEEPALIVE_TIMEOUT] = { .type = NLA_U32, }, ++ 
[OVPN_A_PEER_DEL_REASON] = NLA_POLICY_MAX(NLA_U32, 4), ++ [OVPN_A_PEER_VPN_RX_BYTES] = { .type = NLA_UINT, }, ++ [OVPN_A_PEER_VPN_TX_BYTES] = { .type = NLA_UINT, }, ++ [OVPN_A_PEER_VPN_RX_PACKETS] = { .type = NLA_UINT, }, ++ [OVPN_A_PEER_VPN_TX_PACKETS] = { .type = NLA_UINT, }, ++ [OVPN_A_PEER_LINK_RX_BYTES] = { .type = NLA_UINT, }, ++ [OVPN_A_PEER_LINK_TX_BYTES] = { .type = NLA_UINT, }, ++ [OVPN_A_PEER_LINK_RX_PACKETS] = { .type = NLA_U32, }, ++ [OVPN_A_PEER_LINK_TX_PACKETS] = { .type = NLA_U32, }, ++}; ++ ++/* OVPN_CMD_PEER_NEW - do */ ++static const struct nla_policy ovpn_peer_new_nl_policy[OVPN_A_PEER + 1] = { ++ [OVPN_A_IFINDEX] = { .type = NLA_U32, }, ++ [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), ++}; ++ ++/* OVPN_CMD_PEER_SET - do */ ++static const struct nla_policy ovpn_peer_set_nl_policy[OVPN_A_PEER + 1] = { ++ [OVPN_A_IFINDEX] = { .type = NLA_U32, }, ++ [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), ++}; ++ ++/* OVPN_CMD_PEER_GET - do */ ++static const struct nla_policy ovpn_peer_get_do_nl_policy[OVPN_A_PEER + 1] = { ++ [OVPN_A_IFINDEX] = { .type = NLA_U32, }, ++ [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), ++}; ++ ++/* OVPN_CMD_PEER_GET - dump */ ++static const struct nla_policy ovpn_peer_get_dump_nl_policy[OVPN_A_IFINDEX + 1] = { ++ [OVPN_A_IFINDEX] = { .type = NLA_U32, }, ++}; ++ ++/* OVPN_CMD_PEER_DEL - do */ ++static const struct nla_policy ovpn_peer_del_nl_policy[OVPN_A_PEER + 1] = { ++ [OVPN_A_IFINDEX] = { .type = NLA_U32, }, ++ [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), ++}; ++ ++/* OVPN_CMD_KEY_NEW - do */ ++static const struct nla_policy ovpn_key_new_nl_policy[OVPN_A_KEYCONF + 1] = { ++ [OVPN_A_IFINDEX] = { .type = NLA_U32, }, ++ [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), ++}; ++ ++/* OVPN_CMD_KEY_GET - do */ ++static const struct nla_policy ovpn_key_get_nl_policy[OVPN_A_KEYCONF + 1] = { ++ [OVPN_A_IFINDEX] = { .type = NLA_U32, }, ++ [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), ++}; ++ ++/* OVPN_CMD_KEY_SWAP - do */ ++static const struct nla_policy ovpn_key_swap_nl_policy[OVPN_A_KEYCONF + 1] = { ++ [OVPN_A_IFINDEX] = { .type = NLA_U32, }, ++ [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), ++}; ++ ++/* OVPN_CMD_KEY_DEL - do */ ++static const struct nla_policy ovpn_key_del_nl_policy[OVPN_A_KEYCONF + 1] = { ++ [OVPN_A_IFINDEX] = { .type = NLA_U32, }, ++ [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), ++}; ++ ++/* Ops table for ovpn */ ++static const struct genl_split_ops ovpn_nl_ops[] = { ++ { ++ .cmd = OVPN_CMD_PEER_NEW, ++ .pre_doit = ovpn_nl_pre_doit, ++ .doit = ovpn_nl_peer_new_doit, ++ .post_doit = ovpn_nl_post_doit, ++ .policy = ovpn_peer_new_nl_policy, ++ .maxattr = OVPN_A_PEER, ++ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, ++ }, ++ { ++ .cmd = OVPN_CMD_PEER_SET, ++ .pre_doit = ovpn_nl_pre_doit, ++ .doit = ovpn_nl_peer_set_doit, ++ .post_doit = ovpn_nl_post_doit, ++ .policy = ovpn_peer_set_nl_policy, ++ .maxattr = OVPN_A_PEER, ++ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, ++ }, ++ { ++ .cmd = OVPN_CMD_PEER_GET, ++ .pre_doit = ovpn_nl_pre_doit, ++ .doit = ovpn_nl_peer_get_doit, ++ .post_doit = ovpn_nl_post_doit, ++ .policy = ovpn_peer_get_do_nl_policy, ++ .maxattr = OVPN_A_PEER, ++ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, ++ }, ++ { ++ .cmd = OVPN_CMD_PEER_GET, ++ .dumpit = ovpn_nl_peer_get_dumpit, ++ .policy = ovpn_peer_get_dump_nl_policy, ++ .maxattr = OVPN_A_IFINDEX, ++ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP, ++ }, ++ { ++ .cmd = 
OVPN_CMD_PEER_DEL, ++ .pre_doit = ovpn_nl_pre_doit, ++ .doit = ovpn_nl_peer_del_doit, ++ .post_doit = ovpn_nl_post_doit, ++ .policy = ovpn_peer_del_nl_policy, ++ .maxattr = OVPN_A_PEER, ++ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, ++ }, ++ { ++ .cmd = OVPN_CMD_KEY_NEW, ++ .pre_doit = ovpn_nl_pre_doit, ++ .doit = ovpn_nl_key_new_doit, ++ .post_doit = ovpn_nl_post_doit, ++ .policy = ovpn_key_new_nl_policy, ++ .maxattr = OVPN_A_KEYCONF, ++ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, ++ }, ++ { ++ .cmd = OVPN_CMD_KEY_GET, ++ .pre_doit = ovpn_nl_pre_doit, ++ .doit = ovpn_nl_key_get_doit, ++ .post_doit = ovpn_nl_post_doit, ++ .policy = ovpn_key_get_nl_policy, ++ .maxattr = OVPN_A_KEYCONF, ++ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, ++ }, ++ { ++ .cmd = OVPN_CMD_KEY_SWAP, ++ .pre_doit = ovpn_nl_pre_doit, ++ .doit = ovpn_nl_key_swap_doit, ++ .post_doit = ovpn_nl_post_doit, ++ .policy = ovpn_key_swap_nl_policy, ++ .maxattr = OVPN_A_KEYCONF, ++ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, ++ }, ++ { ++ .cmd = OVPN_CMD_KEY_DEL, ++ .pre_doit = ovpn_nl_pre_doit, ++ .doit = ovpn_nl_key_del_doit, ++ .post_doit = ovpn_nl_post_doit, ++ .policy = ovpn_key_del_nl_policy, ++ .maxattr = OVPN_A_KEYCONF, ++ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, ++ }, ++}; ++ ++static const struct genl_multicast_group ovpn_nl_mcgrps[] = { ++ [OVPN_NLGRP_PEERS] = { "peers", }, ++}; ++ ++struct genl_family ovpn_nl_family __ro_after_init = { ++ .name = OVPN_FAMILY_NAME, ++ .version = OVPN_FAMILY_VERSION, ++ .netnsok = true, ++ .parallel_ops = true, ++ .module = THIS_MODULE, ++ .split_ops = ovpn_nl_ops, ++ .n_split_ops = ARRAY_SIZE(ovpn_nl_ops), ++ .mcgrps = ovpn_nl_mcgrps, ++ .n_mcgrps = ARRAY_SIZE(ovpn_nl_mcgrps), ++}; +diff --git a/drivers/net/ovpn/netlink-gen.h b/drivers/net/ovpn/netlink-gen.h +new file mode 100644 +index 000000000000..66a4e4a0a055 +--- /dev/null ++++ b/drivers/net/ovpn/netlink-gen.h +@@ -0,0 +1,41 @@ ++/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ ++/* Do not edit directly, auto-generated from: */ ++/* Documentation/netlink/specs/ovpn.yaml */ ++/* YNL-GEN kernel header */ ++ ++#ifndef _LINUX_OVPN_GEN_H ++#define _LINUX_OVPN_GEN_H ++ ++#include ++#include ++ ++#include ++ ++/* Common nested types */ ++extern const struct nla_policy ovpn_keyconf_nl_policy[OVPN_A_KEYCONF_DECRYPT_DIR + 1]; ++extern const struct nla_policy ovpn_keydir_nl_policy[OVPN_A_KEYDIR_NONCE_TAIL + 1]; ++extern const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_LINK_TX_PACKETS + 1]; ++ ++int ovpn_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, ++ struct genl_info *info); ++void ++ovpn_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, ++ struct genl_info *info); ++ ++int ovpn_nl_peer_new_doit(struct sk_buff *skb, struct genl_info *info); ++int ovpn_nl_peer_set_doit(struct sk_buff *skb, struct genl_info *info); ++int ovpn_nl_peer_get_doit(struct sk_buff *skb, struct genl_info *info); ++int ovpn_nl_peer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); ++int ovpn_nl_peer_del_doit(struct sk_buff *skb, struct genl_info *info); ++int ovpn_nl_key_new_doit(struct sk_buff *skb, struct genl_info *info); ++int ovpn_nl_key_get_doit(struct sk_buff *skb, struct genl_info *info); ++int ovpn_nl_key_swap_doit(struct sk_buff *skb, struct genl_info *info); ++int ovpn_nl_key_del_doit(struct sk_buff *skb, struct genl_info *info); ++ ++enum { ++ OVPN_NLGRP_PEERS, ++}; ++ ++extern struct genl_family ovpn_nl_family; ++ ++#endif /* _LINUX_OVPN_GEN_H */ +diff --git 
a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c +new file mode 100644 +index 000000000000..4d7d835cb47f +--- /dev/null ++++ b/drivers/net/ovpn/netlink.c +@@ -0,0 +1,1135 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. ++ * ++ * Author: Antonio Quartulli ++ */ ++ ++#include ++#include ++#include ++ ++#include ++ ++#include "ovpnstruct.h" ++#include "main.h" ++#include "io.h" ++#include "netlink.h" ++#include "netlink-gen.h" ++#include "bind.h" ++#include "crypto.h" ++#include "packet.h" ++#include "peer.h" ++#include "socket.h" ++ ++MODULE_ALIAS_GENL_FAMILY(OVPN_FAMILY_NAME); ++ ++/** ++ * ovpn_get_dev_from_attrs - retrieve the ovpn private data from the netdevice ++ * a netlink message is targeting ++ * @net: network namespace where to look for the interface ++ * @info: generic netlink info from the user request ++ * ++ * Return: the ovpn private data, if found, or an error otherwise ++ */ ++static struct ovpn_struct * ++ovpn_get_dev_from_attrs(struct net *net, const struct genl_info *info) ++{ ++ struct ovpn_struct *ovpn; ++ struct net_device *dev; ++ int ifindex; ++ ++ if (GENL_REQ_ATTR_CHECK(info, OVPN_A_IFINDEX)) ++ return ERR_PTR(-EINVAL); ++ ++ ifindex = nla_get_u32(info->attrs[OVPN_A_IFINDEX]); ++ ++ rcu_read_lock(); ++ dev = dev_get_by_index_rcu(net, ifindex); ++ if (!dev) { ++ rcu_read_unlock(); ++ NL_SET_ERR_MSG_MOD(info->extack, ++ "ifindex does not match any interface"); ++ return ERR_PTR(-ENODEV); ++ } ++ ++ if (!ovpn_dev_is_valid(dev)) { ++ rcu_read_unlock(); ++ NL_SET_ERR_MSG_MOD(info->extack, ++ "specified interface is not ovpn"); ++ NL_SET_BAD_ATTR(info->extack, info->attrs[OVPN_A_IFINDEX]); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ ovpn = netdev_priv(dev); ++ netdev_hold(dev, &ovpn->dev_tracker, GFP_KERNEL); ++ rcu_read_unlock(); ++ ++ return ovpn; ++} ++ ++int ovpn_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, ++ struct genl_info *info) ++{ ++ struct ovpn_struct *ovpn = ovpn_get_dev_from_attrs(genl_info_net(info), ++ info); ++ ++ if (IS_ERR(ovpn)) ++ return PTR_ERR(ovpn); ++ ++ info->user_ptr[0] = ovpn; ++ ++ return 0; ++} ++ ++void ovpn_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, ++ struct genl_info *info) ++{ ++ struct ovpn_struct *ovpn = info->user_ptr[0]; ++ ++ if (ovpn) ++ netdev_put(ovpn->dev, &ovpn->dev_tracker); ++} ++ ++static int ovpn_nl_attr_sockaddr_remote(struct nlattr **attrs, ++ struct sockaddr_storage *ss) ++{ ++ struct sockaddr_in6 *sin6; ++ struct sockaddr_in *sin; ++ struct in6_addr *in6; ++ __be16 port = 0; ++ __be32 *in; ++ int af; ++ ++ ss->ss_family = AF_UNSPEC; ++ ++ if (attrs[OVPN_A_PEER_REMOTE_PORT]) ++ port = nla_get_be16(attrs[OVPN_A_PEER_REMOTE_PORT]); ++ ++ if (attrs[OVPN_A_PEER_REMOTE_IPV4]) { ++ af = AF_INET; ++ ss->ss_family = AF_INET; ++ in = nla_data(attrs[OVPN_A_PEER_REMOTE_IPV4]); ++ } else if (attrs[OVPN_A_PEER_REMOTE_IPV6]) { ++ af = AF_INET6; ++ ss->ss_family = AF_INET6; ++ in6 = nla_data(attrs[OVPN_A_PEER_REMOTE_IPV6]); ++ } else { ++ return AF_UNSPEC; ++ } ++ ++ switch (ss->ss_family) { ++ case AF_INET6: ++ /* If this is a regular IPv6 just break and move on, ++ * otherwise switch to AF_INET and extract the IPv4 accordingly ++ */ ++ if (!ipv6_addr_v4mapped(in6)) { ++ sin6 = (struct sockaddr_in6 *)ss; ++ sin6->sin6_port = port; ++ memcpy(&sin6->sin6_addr, in6, sizeof(*in6)); ++ break; ++ } ++ ++ /* v4-mapped-v6 address */ ++ ss->ss_family = AF_INET; ++ in = &in6->s6_addr32[3]; ++ fallthrough; ++ case 
AF_INET: ++ sin = (struct sockaddr_in *)ss; ++ sin->sin_port = port; ++ sin->sin_addr.s_addr = *in; ++ break; ++ } ++ ++ /* don't return ss->ss_family as it may have changed in case of ++ * v4-mapped-v6 address ++ */ ++ return af; ++} ++ ++static u8 *ovpn_nl_attr_local_ip(struct nlattr **attrs) ++{ ++ u8 *addr6; ++ ++ if (!attrs[OVPN_A_PEER_LOCAL_IPV4] && !attrs[OVPN_A_PEER_LOCAL_IPV6]) ++ return NULL; ++ ++ if (attrs[OVPN_A_PEER_LOCAL_IPV4]) ++ return nla_data(attrs[OVPN_A_PEER_LOCAL_IPV4]); ++ ++ addr6 = nla_data(attrs[OVPN_A_PEER_LOCAL_IPV6]); ++ /* this is an IPv4-mapped IPv6 address, therefore extract the actual ++ * v4 address from the last 4 bytes ++ */ ++ if (ipv6_addr_v4mapped((struct in6_addr *)addr6)) ++ return addr6 + 12; ++ ++ return addr6; ++} ++ ++static int ovpn_nl_peer_precheck(struct ovpn_struct *ovpn, ++ struct genl_info *info, ++ struct nlattr **attrs) ++{ ++ if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, ++ OVPN_A_PEER_ID)) ++ return -EINVAL; ++ ++ if (attrs[OVPN_A_PEER_REMOTE_IPV4] && attrs[OVPN_A_PEER_REMOTE_IPV6]) { ++ NL_SET_ERR_MSG_MOD(info->extack, ++ "cannot specify both remote IPv4 or IPv6 address"); ++ return -EINVAL; ++ } ++ ++ if (!attrs[OVPN_A_PEER_REMOTE_IPV4] && ++ !attrs[OVPN_A_PEER_REMOTE_IPV6] && attrs[OVPN_A_PEER_REMOTE_PORT]) { ++ NL_SET_ERR_MSG_MOD(info->extack, ++ "cannot specify remote port without IP address"); ++ return -EINVAL; ++ } ++ ++ if (!attrs[OVPN_A_PEER_REMOTE_IPV4] && ++ attrs[OVPN_A_PEER_LOCAL_IPV4]) { ++ NL_SET_ERR_MSG_MOD(info->extack, ++ "cannot specify local IPv4 address without remote"); ++ return -EINVAL; ++ } ++ ++ if (!attrs[OVPN_A_PEER_REMOTE_IPV6] && ++ attrs[OVPN_A_PEER_LOCAL_IPV6]) { ++ NL_SET_ERR_MSG_MOD(info->extack, ++ "cannot specify local IPV6 address without remote"); ++ return -EINVAL; ++ } ++ ++ if (!attrs[OVPN_A_PEER_REMOTE_IPV6] && ++ attrs[OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID]) { ++ NL_SET_ERR_MSG_MOD(info->extack, ++ "cannot specify scope id without remote IPv6 address"); ++ return -EINVAL; ++ } ++ ++ /* VPN IPs are needed only in MP mode for selecting the right peer */ ++ if (ovpn->mode == OVPN_MODE_P2P && (attrs[OVPN_A_PEER_VPN_IPV4] || ++ attrs[OVPN_A_PEER_VPN_IPV6])) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "VPN IP unexpected in P2P mode"); ++ return -EINVAL; ++ } ++ ++ if ((attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL] && ++ !attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]) || ++ (!attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL] && ++ attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT])) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "keepalive interval and timeout are required together"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/** ++ * ovpn_nl_peer_modify - modify the peer attributes according to the incoming msg ++ * @peer: the peer to modify ++ * @info: generic netlink info from the user request ++ * @attrs: the attributes from the user request ++ * ++ * Return: a negative error code in case of failure, 0 on success or 1 on ++ * success and the VPN IPs have been modified (requires rehashing in MP ++ * mode) ++ */ ++static int ovpn_nl_peer_modify(struct ovpn_peer *peer, struct genl_info *info, ++ struct nlattr **attrs) ++{ ++ struct sockaddr_storage ss = {}; ++ u32 sockfd, interv, timeout; ++ struct socket *sock = NULL; ++ u8 *local_ip = NULL; ++ bool rehash = false; ++ int ret; ++ ++ if (attrs[OVPN_A_PEER_SOCKET]) { ++ /* lookup the fd in the kernel table and extract the socket ++ * object ++ */ ++ sockfd = nla_get_u32(attrs[OVPN_A_PEER_SOCKET]); ++ /* sockfd_lookup() increases sock's refcounter */ ++ sock = 
sockfd_lookup(sockfd, &ret); ++ if (!sock) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "cannot lookup peer socket (fd=%u): %d", ++ sockfd, ret); ++ return -ENOTSOCK; ++ } ++ ++ /* Only when using UDP as transport protocol the remote endpoint ++ * can be configured so that ovpn knows where to send packets ++ * to. ++ * ++ * In case of TCP, the socket is connected to the peer and ovpn ++ * will just send bytes over it, without the need to specify a ++ * destination. ++ */ ++ if (sock->sk->sk_protocol != IPPROTO_UDP && ++ (attrs[OVPN_A_PEER_REMOTE_IPV4] || ++ attrs[OVPN_A_PEER_REMOTE_IPV6])) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "unexpected remote IP address for non UDP socket"); ++ sockfd_put(sock); ++ return -EINVAL; ++ } ++ ++ if (peer->sock) ++ ovpn_socket_put(peer->sock); ++ ++ peer->sock = ovpn_socket_new(sock, peer); ++ if (IS_ERR(peer->sock)) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "cannot encapsulate socket: %ld", ++ PTR_ERR(peer->sock)); ++ sockfd_put(sock); ++ peer->sock = NULL; ++ return -ENOTSOCK; ++ } ++ } ++ ++ if (ovpn_nl_attr_sockaddr_remote(attrs, &ss) != AF_UNSPEC) { ++ /* we carry the local IP in a generic container. ++ * ovpn_peer_reset_sockaddr() will properly interpret it ++ * based on ss.ss_family ++ */ ++ local_ip = ovpn_nl_attr_local_ip(attrs); ++ ++ spin_lock_bh(&peer->lock); ++ /* set peer sockaddr */ ++ ret = ovpn_peer_reset_sockaddr(peer, &ss, local_ip); ++ if (ret < 0) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "cannot set peer sockaddr: %d", ++ ret); ++ spin_unlock_bh(&peer->lock); ++ return ret; ++ } ++ spin_unlock_bh(&peer->lock); ++ } ++ ++ if (attrs[OVPN_A_PEER_VPN_IPV4]) { ++ rehash = true; ++ peer->vpn_addrs.ipv4.s_addr = ++ nla_get_in_addr(attrs[OVPN_A_PEER_VPN_IPV4]); ++ } ++ ++ if (attrs[OVPN_A_PEER_VPN_IPV6]) { ++ rehash = true; ++ peer->vpn_addrs.ipv6 = ++ nla_get_in6_addr(attrs[OVPN_A_PEER_VPN_IPV6]); ++ } ++ ++ /* when setting the keepalive, both parameters have to be configured */ ++ if (attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL] && ++ attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]) { ++ interv = nla_get_u32(attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL]); ++ timeout = nla_get_u32(attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]); ++ ovpn_peer_keepalive_set(peer, interv, timeout); ++ } ++ ++ netdev_dbg(peer->ovpn->dev, ++ "%s: peer id=%u endpoint=%pIScp/%s VPN-IPv4=%pI4 VPN-IPv6=%pI6c\n", ++ __func__, peer->id, &ss, ++ peer->sock->sock->sk->sk_prot_creator->name, ++ &peer->vpn_addrs.ipv4.s_addr, &peer->vpn_addrs.ipv6); ++ ++ return rehash ? 
1 : 0; ++} ++ ++int ovpn_nl_peer_new_doit(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; ++ struct ovpn_struct *ovpn = info->user_ptr[0]; ++ struct ovpn_peer *peer; ++ u32 peer_id; ++ int ret; ++ ++ if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) ++ return -EINVAL; ++ ++ ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], ++ ovpn_peer_nl_policy, info->extack); ++ if (ret) ++ return ret; ++ ++ ret = ovpn_nl_peer_precheck(ovpn, info, attrs); ++ if (ret < 0) ++ return ret; ++ ++ if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, ++ OVPN_A_PEER_SOCKET)) ++ return -EINVAL; ++ ++ peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); ++ peer = ovpn_peer_new(ovpn, peer_id); ++ if (IS_ERR(peer)) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "cannot create new peer object for peer %u: %ld", ++ peer_id, PTR_ERR(peer)); ++ return PTR_ERR(peer); ++ } ++ ++ ret = ovpn_nl_peer_modify(peer, info, attrs); ++ if (ret < 0) ++ goto peer_release; ++ ++ ret = ovpn_peer_add(ovpn, peer); ++ if (ret < 0) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "cannot add new peer (id=%u) to hashtable: %d\n", ++ peer->id, ret); ++ goto peer_release; ++ } ++ ++ return 0; ++ ++peer_release: ++ /* release right away because peer is not used in any context */ ++ ovpn_peer_release(peer); ++ ++ return ret; ++} ++ ++int ovpn_nl_peer_set_doit(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; ++ struct ovpn_struct *ovpn = info->user_ptr[0]; ++ struct ovpn_peer *peer; ++ u32 peer_id; ++ int ret; ++ ++ if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) ++ return -EINVAL; ++ ++ ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], ++ ovpn_peer_nl_policy, info->extack); ++ if (ret) ++ return ret; ++ ++ ret = ovpn_nl_peer_precheck(ovpn, info, attrs); ++ if (ret < 0) ++ return ret; ++ ++ peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); ++ peer = ovpn_peer_get_by_id(ovpn, peer_id); ++ if (!peer) ++ return -ENOENT; ++ ++ ret = ovpn_nl_peer_modify(peer, info, attrs); ++ if (ret < 0) { ++ ovpn_peer_put(peer); ++ return ret; ++ } ++ ++ /* ret == 1 means that VPN IPv4/6 has been modified and rehashing ++ * is required ++ */ ++ if (ret > 0) { ++ spin_lock_bh(&ovpn->peers->lock); ++ ovpn_peer_hash_vpn_ip(peer); ++ spin_unlock_bh(&ovpn->peers->lock); ++ } ++ ++ ovpn_peer_put(peer); ++ ++ return 0; ++} ++ ++static int ovpn_nl_send_peer(struct sk_buff *skb, const struct genl_info *info, ++ const struct ovpn_peer *peer, u32 portid, u32 seq, ++ int flags) ++{ ++ const struct ovpn_bind *bind; ++ struct nlattr *attr; ++ void *hdr; ++ ++ hdr = genlmsg_put(skb, portid, seq, &ovpn_nl_family, flags, ++ OVPN_CMD_PEER_GET); ++ if (!hdr) ++ return -ENOBUFS; ++ ++ attr = nla_nest_start(skb, OVPN_A_PEER); ++ if (!attr) ++ goto err; ++ ++ if (nla_put_u32(skb, OVPN_A_PEER_ID, peer->id)) ++ goto err; ++ ++ if (peer->vpn_addrs.ipv4.s_addr != htonl(INADDR_ANY)) ++ if (nla_put_in_addr(skb, OVPN_A_PEER_VPN_IPV4, ++ peer->vpn_addrs.ipv4.s_addr)) ++ goto err; ++ ++ if (!ipv6_addr_equal(&peer->vpn_addrs.ipv6, &in6addr_any)) ++ if (nla_put_in6_addr(skb, OVPN_A_PEER_VPN_IPV6, ++ &peer->vpn_addrs.ipv6)) ++ goto err; ++ ++ if (nla_put_u32(skb, OVPN_A_PEER_KEEPALIVE_INTERVAL, ++ peer->keepalive_interval) || ++ nla_put_u32(skb, OVPN_A_PEER_KEEPALIVE_TIMEOUT, ++ peer->keepalive_timeout)) ++ goto err; ++ ++ rcu_read_lock(); ++ bind = rcu_dereference(peer->bind); ++ if (bind) { ++ if (bind->remote.in4.sin_family == AF_INET) { ++ if (nla_put_in_addr(skb, 
OVPN_A_PEER_REMOTE_IPV4, ++ bind->remote.in4.sin_addr.s_addr) || ++ nla_put_net16(skb, OVPN_A_PEER_REMOTE_PORT, ++ bind->remote.in4.sin_port) || ++ nla_put_in_addr(skb, OVPN_A_PEER_LOCAL_IPV4, ++ bind->local.ipv4.s_addr)) ++ goto err_unlock; ++ } else if (bind->remote.in4.sin_family == AF_INET6) { ++ if (nla_put_in6_addr(skb, OVPN_A_PEER_REMOTE_IPV6, ++ &bind->remote.in6.sin6_addr) || ++ nla_put_u32(skb, OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, ++ bind->remote.in6.sin6_scope_id) || ++ nla_put_net16(skb, OVPN_A_PEER_REMOTE_PORT, ++ bind->remote.in6.sin6_port) || ++ nla_put_in6_addr(skb, OVPN_A_PEER_LOCAL_IPV6, ++ &bind->local.ipv6)) ++ goto err_unlock; ++ } ++ } ++ rcu_read_unlock(); ++ ++ if (nla_put_net16(skb, OVPN_A_PEER_LOCAL_PORT, ++ inet_sk(peer->sock->sock->sk)->inet_sport) || ++ /* VPN RX stats */ ++ nla_put_uint(skb, OVPN_A_PEER_VPN_RX_BYTES, ++ atomic64_read(&peer->vpn_stats.rx.bytes)) || ++ nla_put_uint(skb, OVPN_A_PEER_VPN_RX_PACKETS, ++ atomic64_read(&peer->vpn_stats.rx.packets)) || ++ /* VPN TX stats */ ++ nla_put_uint(skb, OVPN_A_PEER_VPN_TX_BYTES, ++ atomic64_read(&peer->vpn_stats.tx.bytes)) || ++ nla_put_uint(skb, OVPN_A_PEER_VPN_TX_PACKETS, ++ atomic64_read(&peer->vpn_stats.tx.packets)) || ++ /* link RX stats */ ++ nla_put_uint(skb, OVPN_A_PEER_LINK_RX_BYTES, ++ atomic64_read(&peer->link_stats.rx.bytes)) || ++ nla_put_uint(skb, OVPN_A_PEER_LINK_RX_PACKETS, ++ atomic64_read(&peer->link_stats.rx.packets)) || ++ /* link TX stats */ ++ nla_put_uint(skb, OVPN_A_PEER_LINK_TX_BYTES, ++ atomic64_read(&peer->link_stats.tx.bytes)) || ++ nla_put_uint(skb, OVPN_A_PEER_LINK_TX_PACKETS, ++ atomic64_read(&peer->link_stats.tx.packets))) ++ goto err; ++ ++ nla_nest_end(skb, attr); ++ genlmsg_end(skb, hdr); ++ ++ return 0; ++err_unlock: ++ rcu_read_unlock(); ++err: ++ genlmsg_cancel(skb, hdr); ++ return -EMSGSIZE; ++} ++ ++int ovpn_nl_peer_get_doit(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; ++ struct ovpn_struct *ovpn = info->user_ptr[0]; ++ struct ovpn_peer *peer; ++ struct sk_buff *msg; ++ u32 peer_id; ++ int ret; ++ ++ if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) ++ return -EINVAL; ++ ++ ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], ++ ovpn_peer_nl_policy, info->extack); ++ if (ret) ++ return ret; ++ ++ if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, ++ OVPN_A_PEER_ID)) ++ return -EINVAL; ++ ++ peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); ++ peer = ovpn_peer_get_by_id(ovpn, peer_id); ++ if (!peer) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "cannot find peer with id %u", peer_id); ++ return -ENOENT; ++ } ++ ++ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); ++ if (!msg) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = ovpn_nl_send_peer(msg, info, peer, info->snd_portid, ++ info->snd_seq, 0); ++ if (ret < 0) { ++ nlmsg_free(msg); ++ goto err; ++ } ++ ++ ret = genlmsg_reply(msg, info); ++err: ++ ovpn_peer_put(peer); ++ return ret; ++} ++ ++int ovpn_nl_peer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ const struct genl_info *info = genl_info_dump(cb); ++ int bkt, last_idx = cb->args[1], dumped = 0; ++ struct ovpn_struct *ovpn; ++ struct ovpn_peer *peer; ++ ++ ovpn = ovpn_get_dev_from_attrs(sock_net(cb->skb->sk), info); ++ if (IS_ERR(ovpn)) ++ return PTR_ERR(ovpn); ++ ++ if (ovpn->mode == OVPN_MODE_P2P) { ++ /* if we already dumped a peer it means we are done */ ++ if (last_idx) ++ goto out; ++ ++ rcu_read_lock(); ++ peer = rcu_dereference(ovpn->peer); ++ if (peer) { ++ if 
(ovpn_nl_send_peer(skb, info, peer, ++ NETLINK_CB(cb->skb).portid, ++ cb->nlh->nlmsg_seq, ++ NLM_F_MULTI) == 0) ++ dumped++; ++ } ++ rcu_read_unlock(); ++ } else { ++ rcu_read_lock(); ++ hash_for_each_rcu(ovpn->peers->by_id, bkt, peer, ++ hash_entry_id) { ++ /* skip already dumped peers that were dumped by ++ * previous invocations ++ */ ++ if (last_idx > 0) { ++ last_idx--; ++ continue; ++ } ++ ++ if (ovpn_nl_send_peer(skb, info, peer, ++ NETLINK_CB(cb->skb).portid, ++ cb->nlh->nlmsg_seq, ++ NLM_F_MULTI) < 0) ++ break; ++ ++ /* count peers being dumped during this invocation */ ++ dumped++; ++ } ++ rcu_read_unlock(); ++ } ++ ++out: ++ netdev_put(ovpn->dev, &ovpn->dev_tracker); ++ ++ /* sum up peers dumped in this message, so that at the next invocation ++ * we can continue from where we left ++ */ ++ cb->args[1] += dumped; ++ return skb->len; ++} ++ ++int ovpn_nl_peer_del_doit(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; ++ struct ovpn_struct *ovpn = info->user_ptr[0]; ++ struct ovpn_peer *peer; ++ u32 peer_id; ++ int ret; ++ ++ if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) ++ return -EINVAL; ++ ++ ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], ++ ovpn_peer_nl_policy, info->extack); ++ if (ret) ++ return ret; ++ ++ if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, ++ OVPN_A_PEER_ID)) ++ return -EINVAL; ++ ++ peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); ++ ++ peer = ovpn_peer_get_by_id(ovpn, peer_id); ++ if (!peer) ++ return -ENOENT; ++ ++ netdev_dbg(ovpn->dev, "%s: peer id=%u\n", __func__, peer->id); ++ ret = ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_USERSPACE); ++ ovpn_peer_put(peer); ++ ++ return ret; ++} ++ ++static int ovpn_nl_get_key_dir(struct genl_info *info, struct nlattr *key, ++ enum ovpn_cipher_alg cipher, ++ struct ovpn_key_direction *dir) ++{ ++ struct nlattr *attrs[OVPN_A_KEYDIR_MAX + 1]; ++ int ret; ++ ++ ret = nla_parse_nested(attrs, OVPN_A_KEYDIR_MAX, key, ++ ovpn_keydir_nl_policy, info->extack); ++ if (ret) ++ return ret; ++ ++ switch (cipher) { ++ case OVPN_CIPHER_ALG_AES_GCM: ++ case OVPN_CIPHER_ALG_CHACHA20_POLY1305: ++ if (NL_REQ_ATTR_CHECK(info->extack, key, attrs, ++ OVPN_A_KEYDIR_CIPHER_KEY) || ++ NL_REQ_ATTR_CHECK(info->extack, key, attrs, ++ OVPN_A_KEYDIR_NONCE_TAIL)) ++ return -EINVAL; ++ ++ dir->cipher_key = nla_data(attrs[OVPN_A_KEYDIR_CIPHER_KEY]); ++ dir->cipher_key_size = nla_len(attrs[OVPN_A_KEYDIR_CIPHER_KEY]); ++ ++ /* These algorithms require a 96bit nonce, ++ * Construct it by combining 4-bytes packet id and ++ * 8-bytes nonce-tail from userspace ++ */ ++ dir->nonce_tail = nla_data(attrs[OVPN_A_KEYDIR_NONCE_TAIL]); ++ dir->nonce_tail_size = nla_len(attrs[OVPN_A_KEYDIR_NONCE_TAIL]); ++ break; ++ default: ++ NL_SET_ERR_MSG_MOD(info->extack, "unsupported cipher"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/** ++ * ovpn_nl_key_new_doit - configure a new key for the specified peer ++ * @skb: incoming netlink message ++ * @info: genetlink metadata ++ * ++ * This function allows the user to install a new key in the peer crypto ++ * state. ++ * Each peer has two 'slots', namely 'primary' and 'secondary', where ++ * keys can be installed. The key in the 'primary' slot is used for ++ * encryption, while both keys can be used for decryption by matching the ++ * key ID carried in the incoming packet. ++ * ++ * The user is responsible for rotating keys when necessary. 
The user ++ * may fetch peer traffic statistics via netlink in order to better ++ * identify the right time to rotate keys. ++ * The renegotiation follows these steps: ++ * 1. a new key is computed by the user and is installed in the 'secondary' ++ * slot ++ * 2. at user discretion (usually after a predetermined time) 'primary' and ++ * 'secondary' contents are swapped and the new key starts being used for ++ * encryption, while the old key is kept around for decryption of late ++ * packets. ++ * ++ * Return: 0 on success or a negative error code otherwise. ++ */ ++int ovpn_nl_key_new_doit(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr *attrs[OVPN_A_KEYCONF_MAX + 1]; ++ struct ovpn_struct *ovpn = info->user_ptr[0]; ++ struct ovpn_peer_key_reset pkr; ++ struct ovpn_peer *peer; ++ u32 peer_id; ++ int ret; ++ ++ if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) ++ return -EINVAL; ++ ++ ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, ++ info->attrs[OVPN_A_KEYCONF], ++ ovpn_keyconf_nl_policy, info->extack); ++ if (ret) ++ return ret; ++ ++ if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, ++ OVPN_A_KEYCONF_PEER_ID)) ++ return -EINVAL; ++ ++ if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, ++ OVPN_A_KEYCONF_SLOT) || ++ NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, ++ OVPN_A_KEYCONF_KEY_ID) || ++ NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, ++ OVPN_A_KEYCONF_CIPHER_ALG) || ++ NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, ++ OVPN_A_KEYCONF_ENCRYPT_DIR) || ++ NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, ++ OVPN_A_KEYCONF_DECRYPT_DIR)) ++ return -EINVAL; ++ ++ peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); ++ pkr.slot = nla_get_u8(attrs[OVPN_A_KEYCONF_SLOT]); ++ pkr.key.key_id = nla_get_u16(attrs[OVPN_A_KEYCONF_KEY_ID]); ++ pkr.key.cipher_alg = nla_get_u16(attrs[OVPN_A_KEYCONF_CIPHER_ALG]); ++ ++ ret = ovpn_nl_get_key_dir(info, attrs[OVPN_A_KEYCONF_ENCRYPT_DIR], ++ pkr.key.cipher_alg, &pkr.key.encrypt); ++ if (ret < 0) ++ return ret; ++ ++ ret = ovpn_nl_get_key_dir(info, attrs[OVPN_A_KEYCONF_DECRYPT_DIR], ++ pkr.key.cipher_alg, &pkr.key.decrypt); ++ if (ret < 0) ++ return ret; ++ ++ peer = ovpn_peer_get_by_id(ovpn, peer_id); ++ if (!peer) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "no peer with id %u to set key for", ++ peer_id); ++ return -ENOENT; ++ } ++ ++ ret = ovpn_crypto_state_reset(&peer->crypto, &pkr); ++ if (ret < 0) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "cannot install new key for peer %u", ++ peer_id); ++ goto out; ++ } ++ ++ netdev_dbg(ovpn->dev, "%s: new key installed (id=%u) for peer %u\n", ++ __func__, pkr.key.key_id, peer_id); ++out: ++ ovpn_peer_put(peer); ++ return ret; ++} ++ ++static int ovpn_nl_send_key(struct sk_buff *skb, const struct genl_info *info, ++ u32 peer_id, enum ovpn_key_slot slot, ++ const struct ovpn_key_config *keyconf, u32 portid, ++ u32 seq, int flags) ++{ ++ struct nlattr *attr; ++ void *hdr; ++ ++ hdr = genlmsg_put(skb, portid, seq, &ovpn_nl_family, flags, ++ OVPN_CMD_KEY_GET); ++ if (!hdr) ++ return -ENOBUFS; ++ ++ attr = nla_nest_start(skb, OVPN_A_KEYCONF); ++ if (!attr) ++ goto err; ++ ++ if (nla_put_u32(skb, OVPN_A_KEYCONF_PEER_ID, peer_id)) ++ goto err; ++ ++ if (nla_put_u32(skb, OVPN_A_KEYCONF_SLOT, slot) || ++ nla_put_u32(skb, OVPN_A_KEYCONF_KEY_ID, keyconf->key_id) || ++ nla_put_u32(skb, OVPN_A_KEYCONF_CIPHER_ALG, keyconf->cipher_alg)) ++ goto err; ++ ++ nla_nest_end(skb, attr); ++ 
genlmsg_end(skb, hdr); ++ ++ return 0; ++err: ++ genlmsg_cancel(skb, hdr); ++ return -EMSGSIZE; ++} ++ ++int ovpn_nl_key_get_doit(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr *attrs[OVPN_A_KEYCONF_MAX + 1]; ++ struct ovpn_struct *ovpn = info->user_ptr[0]; ++ struct ovpn_key_config keyconf = { 0 }; ++ enum ovpn_key_slot slot; ++ struct ovpn_peer *peer; ++ struct sk_buff *msg; ++ u32 peer_id; ++ int ret; ++ ++ if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) ++ return -EINVAL; ++ ++ ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, ++ info->attrs[OVPN_A_KEYCONF], ++ ovpn_keyconf_nl_policy, info->extack); ++ if (ret) ++ return ret; ++ ++ if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, ++ OVPN_A_KEYCONF_PEER_ID)) ++ return -EINVAL; ++ ++ peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); ++ ++ peer = ovpn_peer_get_by_id(ovpn, peer_id); ++ if (!peer) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "cannot find peer with id %u", 0); ++ return -ENOENT; ++ } ++ ++ if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, ++ OVPN_A_KEYCONF_SLOT)) ++ return -EINVAL; ++ ++ slot = nla_get_u32(attrs[OVPN_A_KEYCONF_SLOT]); ++ ++ ret = ovpn_crypto_config_get(&peer->crypto, slot, &keyconf); ++ if (ret < 0) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "cannot extract key from slot %u for peer %u", ++ slot, peer_id); ++ goto err; ++ } ++ ++ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); ++ if (!msg) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = ovpn_nl_send_key(msg, info, peer->id, slot, &keyconf, ++ info->snd_portid, info->snd_seq, 0); ++ if (ret < 0) { ++ nlmsg_free(msg); ++ goto err; ++ } ++ ++ ret = genlmsg_reply(msg, info); ++err: ++ ovpn_peer_put(peer); ++ return ret; ++} ++ ++int ovpn_nl_key_swap_doit(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct ovpn_struct *ovpn = info->user_ptr[0]; ++ struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; ++ struct ovpn_peer *peer; ++ u32 peer_id; ++ int ret; ++ ++ if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) ++ return -EINVAL; ++ ++ ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, ++ info->attrs[OVPN_A_KEYCONF], ++ ovpn_keyconf_nl_policy, info->extack); ++ if (ret) ++ return ret; ++ ++ if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, ++ OVPN_A_KEYCONF_PEER_ID)) ++ return -EINVAL; ++ ++ peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); ++ ++ peer = ovpn_peer_get_by_id(ovpn, peer_id); ++ if (!peer) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "no peer with id %u to swap keys for", ++ peer_id); ++ return -ENOENT; ++ } ++ ++ ovpn_crypto_key_slots_swap(&peer->crypto); ++ ovpn_peer_put(peer); ++ ++ return 0; ++} ++ ++int ovpn_nl_key_del_doit(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr *attrs[OVPN_A_KEYCONF_MAX + 1]; ++ struct ovpn_struct *ovpn = info->user_ptr[0]; ++ enum ovpn_key_slot slot; ++ struct ovpn_peer *peer; ++ u32 peer_id; ++ int ret; ++ ++ if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) ++ return -EINVAL; ++ ++ ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, ++ info->attrs[OVPN_A_KEYCONF], ++ ovpn_keyconf_nl_policy, info->extack); ++ if (ret) ++ return ret; ++ ++ if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, ++ OVPN_A_KEYCONF_PEER_ID)) ++ return -EINVAL; ++ ++ if (ret) ++ return ret; ++ ++ if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, ++ OVPN_A_KEYCONF_SLOT)) ++ return -EINVAL; ++ ++ peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); ++ slot = nla_get_u8(attrs[OVPN_A_KEYCONF_SLOT]); ++ ++ 
peer = ovpn_peer_get_by_id(ovpn, peer_id); ++ if (!peer) { ++ NL_SET_ERR_MSG_FMT_MOD(info->extack, ++ "no peer with id %u to delete key for", ++ peer_id); ++ return -ENOENT; ++ } ++ ++ ovpn_crypto_key_slot_delete(&peer->crypto, slot); ++ ovpn_peer_put(peer); ++ ++ return 0; ++} ++ ++/** ++ * ovpn_nl_peer_del_notify - notify userspace about peer being deleted ++ * @peer: the peer being deleted ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++int ovpn_nl_peer_del_notify(struct ovpn_peer *peer) ++{ ++ struct sk_buff *msg; ++ struct nlattr *attr; ++ int ret = -EMSGSIZE; ++ void *hdr; ++ ++ netdev_info(peer->ovpn->dev, "deleting peer with id %u, reason %d\n", ++ peer->id, peer->delete_reason); ++ ++ msg = nlmsg_new(100, GFP_ATOMIC); ++ if (!msg) ++ return -ENOMEM; ++ ++ hdr = genlmsg_put(msg, 0, 0, &ovpn_nl_family, 0, OVPN_CMD_PEER_DEL_NTF); ++ if (!hdr) { ++ ret = -ENOBUFS; ++ goto err_free_msg; ++ } ++ ++ if (nla_put_u32(msg, OVPN_A_IFINDEX, peer->ovpn->dev->ifindex)) ++ goto err_cancel_msg; ++ ++ attr = nla_nest_start(msg, OVPN_A_PEER); ++ if (!attr) ++ goto err_cancel_msg; ++ ++ if (nla_put_u8(msg, OVPN_A_PEER_DEL_REASON, peer->delete_reason)) ++ goto err_cancel_msg; ++ ++ if (nla_put_u32(msg, OVPN_A_PEER_ID, peer->id)) ++ goto err_cancel_msg; ++ ++ nla_nest_end(msg, attr); ++ ++ genlmsg_end(msg, hdr); ++ ++ genlmsg_multicast_netns(&ovpn_nl_family, dev_net(peer->ovpn->dev), msg, ++ 0, OVPN_NLGRP_PEERS, GFP_ATOMIC); ++ ++ return 0; ++ ++err_cancel_msg: ++ genlmsg_cancel(msg, hdr); ++err_free_msg: ++ nlmsg_free(msg); ++ return ret; ++} ++ ++/** ++ * ovpn_nl_key_swap_notify - notify userspace peer's key must be renewed ++ * @peer: the peer whose key needs to be renewed ++ * @key_id: the ID of the key that needs to be renewed ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++int ovpn_nl_key_swap_notify(struct ovpn_peer *peer, u8 key_id) ++{ ++ struct nlattr *k_attr; ++ struct sk_buff *msg; ++ int ret = -EMSGSIZE; ++ void *hdr; ++ ++ netdev_info(peer->ovpn->dev, "peer with id %u must rekey - primary key unusable.\n", ++ peer->id); ++ ++ msg = nlmsg_new(100, GFP_ATOMIC); ++ if (!msg) ++ return -ENOMEM; ++ ++ hdr = genlmsg_put(msg, 0, 0, &ovpn_nl_family, 0, OVPN_CMD_KEY_SWAP_NTF); ++ if (!hdr) { ++ ret = -ENOBUFS; ++ goto err_free_msg; ++ } ++ ++ if (nla_put_u32(msg, OVPN_A_IFINDEX, peer->ovpn->dev->ifindex)) ++ goto err_cancel_msg; ++ ++ k_attr = nla_nest_start(msg, OVPN_A_KEYCONF); ++ if (!k_attr) ++ goto err_cancel_msg; ++ ++ if (nla_put_u32(msg, OVPN_A_KEYCONF_PEER_ID, peer->id)) ++ goto err_cancel_msg; ++ ++ if (nla_put_u16(msg, OVPN_A_KEYCONF_KEY_ID, key_id)) ++ goto err_cancel_msg; ++ ++ nla_nest_end(msg, k_attr); ++ genlmsg_end(msg, hdr); ++ ++ genlmsg_multicast_netns(&ovpn_nl_family, dev_net(peer->ovpn->dev), msg, ++ 0, OVPN_NLGRP_PEERS, GFP_ATOMIC); ++ ++ return 0; ++ ++err_cancel_msg: ++ genlmsg_cancel(msg, hdr); ++err_free_msg: ++ nlmsg_free(msg); ++ return ret; ++} ++ ++/** ++ * ovpn_nl_register - perform any needed registration in the NL subsustem ++ * ++ * Return: 0 on success, a negative error code otherwise ++ */ ++int __init ovpn_nl_register(void) ++{ ++ int ret = genl_register_family(&ovpn_nl_family); ++ ++ if (ret) { ++ pr_err("ovpn: genl_register_family failed: %d\n", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/** ++ * ovpn_nl_unregister - undo any module wide netlink registration ++ */ ++void ovpn_nl_unregister(void) ++{ ++ genl_unregister_family(&ovpn_nl_family); ++} +diff --git a/drivers/net/ovpn/netlink.h 
b/drivers/net/ovpn/netlink.h +new file mode 100644 +index 000000000000..4ab3abcf23db +--- /dev/null ++++ b/drivers/net/ovpn/netlink.h +@@ -0,0 +1,18 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. ++ * ++ * Author: Antonio Quartulli ++ */ ++ ++#ifndef _NET_OVPN_NETLINK_H_ ++#define _NET_OVPN_NETLINK_H_ ++ ++int ovpn_nl_register(void); ++void ovpn_nl_unregister(void); ++ ++int ovpn_nl_peer_del_notify(struct ovpn_peer *peer); ++int ovpn_nl_key_swap_notify(struct ovpn_peer *peer, u8 key_id); ++ ++#endif /* _NET_OVPN_NETLINK_H_ */ +diff --git a/drivers/net/ovpn/ovpnstruct.h b/drivers/net/ovpn/ovpnstruct.h +new file mode 100644 +index 000000000000..4ac00d550ecb +--- /dev/null ++++ b/drivers/net/ovpn/ovpnstruct.h +@@ -0,0 +1,61 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2019-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#ifndef _NET_OVPN_OVPNSTRUCT_H_ ++#define _NET_OVPN_OVPNSTRUCT_H_ ++ ++#include ++#include ++#include ++#include ++ ++/** ++ * struct ovpn_peer_collection - container of peers for MultiPeer mode ++ * @by_id: table of peers index by ID ++ * @by_vpn_addr: table of peers indexed by VPN IP address (items can be ++ * rehashed on the fly due to peer IP change) ++ * @by_transp_addr: table of peers indexed by transport address (items can be ++ * rehashed on the fly due to peer IP change) ++ * @lock: protects writes to peer tables ++ */ ++struct ovpn_peer_collection { ++ DECLARE_HASHTABLE(by_id, 12); ++ struct hlist_nulls_head by_vpn_addr[1 << 12]; ++ struct hlist_nulls_head by_transp_addr[1 << 12]; ++ ++ spinlock_t lock; /* protects writes to peer tables */ ++}; ++ ++/** ++ * struct ovpn_struct - per ovpn interface state ++ * @dev: the actual netdev representing the tunnel ++ * @dev_tracker: reference tracker for associated dev ++ * @registered: whether dev is still registered with netdev or not ++ * @mode: device operation mode (i.e. p2p, mp, ..) ++ * @lock: protect this object ++ * @peers: data structures holding multi-peer references ++ * @peer: in P2P mode, this is the only remote peer ++ * @dev_list: entry for the module wide device list ++ * @gro_cells: pointer to the Generic Receive Offload cell ++ * @keepalive_work: struct used to schedule keepalive periodic job ++ */ ++struct ovpn_struct { ++ struct net_device *dev; ++ netdevice_tracker dev_tracker; ++ bool registered; ++ enum ovpn_mode mode; ++ spinlock_t lock; /* protect writing to the ovpn_struct object */ ++ struct ovpn_peer_collection *peers; ++ struct ovpn_peer __rcu *peer; ++ struct list_head dev_list; ++ struct gro_cells gro_cells; ++ struct delayed_work keepalive_work; ++}; ++ ++#endif /* _NET_OVPN_OVPNSTRUCT_H_ */ +diff --git a/drivers/net/ovpn/packet.h b/drivers/net/ovpn/packet.h +new file mode 100644 +index 000000000000..e14c9bf464f7 +--- /dev/null ++++ b/drivers/net/ovpn/packet.h +@@ -0,0 +1,40 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. 
++ * ++ * Author: Antonio Quartulli ++ * James Yonan ++ */ ++ ++#ifndef _NET_OVPN_PACKET_H_ ++#define _NET_OVPN_PACKET_H_ ++ ++/* When the OpenVPN protocol is run in AEAD mode, use ++ * the OpenVPN packet ID as the AEAD nonce: ++ * ++ * 00000005 521c3b01 4308c041 ++ * [seq # ] [ nonce_tail ] ++ * [ 12-byte full IV ] -> NONCE_SIZE ++ * [4-bytes -> NONCE_WIRE_SIZE ++ * on wire] ++ */ ++ ++/* OpenVPN nonce size */ ++#define NONCE_SIZE 12 ++ ++/* OpenVPN nonce size reduced by 8-byte nonce tail -- this is the ++ * size of the AEAD Associated Data (AD) sent over the wire ++ * and is normally the head of the IV ++ */ ++#define NONCE_WIRE_SIZE (NONCE_SIZE - sizeof(struct ovpn_nonce_tail)) ++ ++/* Last 8 bytes of AEAD nonce ++ * Provided by userspace and usually derived from ++ * key material generated during TLS handshake ++ */ ++struct ovpn_nonce_tail { ++ u8 u8[OVPN_NONCE_TAIL_SIZE]; ++}; ++ ++#endif /* _NET_OVPN_PACKET_H_ */ +diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c +new file mode 100644 +index 000000000000..91c608f1ffa1 +--- /dev/null ++++ b/drivers/net/ovpn/peer.c +@@ -0,0 +1,1201 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "ovpnstruct.h" ++#include "bind.h" ++#include "pktid.h" ++#include "crypto.h" ++#include "io.h" ++#include "main.h" ++#include "netlink.h" ++#include "peer.h" ++#include "socket.h" ++ ++/** ++ * ovpn_peer_keepalive_set - configure keepalive values for peer ++ * @peer: the peer to configure ++ * @interval: outgoing keepalive interval ++ * @timeout: incoming keepalive timeout ++ */ ++void ovpn_peer_keepalive_set(struct ovpn_peer *peer, u32 interval, u32 timeout) ++{ ++ time64_t now = ktime_get_real_seconds(); ++ ++ netdev_dbg(peer->ovpn->dev, ++ "%s: scheduling keepalive for peer %u: interval=%u timeout=%u\n", ++ __func__, peer->id, interval, timeout); ++ ++ peer->keepalive_interval = interval; ++ peer->last_sent = now; ++ peer->keepalive_xmit_exp = now + interval; ++ ++ peer->keepalive_timeout = timeout; ++ peer->last_recv = now; ++ peer->keepalive_recv_exp = now + timeout; ++ ++ /* now that interval and timeout have been changed, kick ++ * off the worker so that the next delay can be recomputed ++ */ ++ mod_delayed_work(system_wq, &peer->ovpn->keepalive_work, 0); ++} ++ ++/** ++ * ovpn_peer_new - allocate and initialize a new peer object ++ * @ovpn: the openvpn instance inside which the peer should be created ++ * @id: the ID assigned to this peer ++ * ++ * Return: a pointer to the new peer on success or an error code otherwise ++ */ ++struct ovpn_peer *ovpn_peer_new(struct ovpn_struct *ovpn, u32 id) ++{ ++ struct ovpn_peer *peer; ++ int ret; ++ ++ /* alloc and init peer object */ ++ peer = kzalloc(sizeof(*peer), GFP_KERNEL); ++ if (!peer) ++ return ERR_PTR(-ENOMEM); ++ ++ peer->id = id; ++ peer->halt = false; ++ peer->ovpn = ovpn; ++ ++ peer->vpn_addrs.ipv4.s_addr = htonl(INADDR_ANY); ++ peer->vpn_addrs.ipv6 = in6addr_any; ++ ++ RCU_INIT_POINTER(peer->bind, NULL); ++ ovpn_crypto_state_init(&peer->crypto); ++ spin_lock_init(&peer->lock); ++ kref_init(&peer->refcount); ++ ovpn_peer_stats_init(&peer->vpn_stats); ++ ovpn_peer_stats_init(&peer->link_stats); ++ ++ ret = dst_cache_init(&peer->dst_cache, GFP_KERNEL); ++ if (ret < 0) { ++ netdev_err(ovpn->dev, "%s: cannot initialize dst cache\n", ++ __func__); ++ kfree(peer); ++ return ERR_PTR(ret); ++ } ++ ++ 
netdev_hold(ovpn->dev, &ovpn->dev_tracker, GFP_KERNEL); ++ ++ return peer; ++} ++ ++/** ++ * ovpn_peer_reset_sockaddr - recreate binding for peer ++ * @peer: peer to recreate the binding for ++ * @ss: sockaddr to use as remote endpoint for the binding ++ * @local_ip: local IP for the binding ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++int ovpn_peer_reset_sockaddr(struct ovpn_peer *peer, ++ const struct sockaddr_storage *ss, ++ const u8 *local_ip) ++ __must_hold(&peer->lock) ++{ ++ struct ovpn_bind *bind; ++ size_t ip_len; ++ ++ /* create new ovpn_bind object */ ++ bind = ovpn_bind_from_sockaddr(ss); ++ if (IS_ERR(bind)) ++ return PTR_ERR(bind); ++ ++ if (local_ip) { ++ if (ss->ss_family == AF_INET) { ++ ip_len = sizeof(struct in_addr); ++ } else if (ss->ss_family == AF_INET6) { ++ ip_len = sizeof(struct in6_addr); ++ } else { ++ netdev_dbg(peer->ovpn->dev, "%s: invalid family for remote endpoint\n", ++ __func__); ++ kfree(bind); ++ return -EINVAL; ++ } ++ ++ memcpy(&bind->local, local_ip, ip_len); ++ } ++ ++ /* set binding */ ++ ovpn_bind_reset(peer, bind); ++ ++ return 0; ++} ++ ++#define ovpn_get_hash_head(_tbl, _key, _key_len) ({ \ ++ typeof(_tbl) *__tbl = &(_tbl); \ ++ (&(*__tbl)[jhash(_key, _key_len, 0) % HASH_SIZE(*__tbl)]); }) \ ++ ++/** ++ * ovpn_peer_float - update remote endpoint for peer ++ * @peer: peer to update the remote endpoint for ++ * @skb: incoming packet to retrieve the source address (remote) from ++ */ ++void ovpn_peer_float(struct ovpn_peer *peer, struct sk_buff *skb) ++{ ++ struct hlist_nulls_head *nhead; ++ struct sockaddr_storage ss; ++ const u8 *local_ip = NULL; ++ struct sockaddr_in6 *sa6; ++ struct sockaddr_in *sa; ++ struct ovpn_bind *bind; ++ sa_family_t family; ++ size_t salen; ++ ++ rcu_read_lock(); ++ bind = rcu_dereference(peer->bind); ++ if (unlikely(!bind)) { ++ rcu_read_unlock(); ++ return; ++ } ++ ++ spin_lock_bh(&peer->lock); ++ if (likely(ovpn_bind_skb_src_match(bind, skb))) ++ goto unlock; ++ ++ family = skb_protocol_to_family(skb); ++ ++ if (bind->remote.in4.sin_family == family) ++ local_ip = (u8 *)&bind->local; ++ ++ switch (family) { ++ case AF_INET: ++ sa = (struct sockaddr_in *)&ss; ++ sa->sin_family = AF_INET; ++ sa->sin_addr.s_addr = ip_hdr(skb)->saddr; ++ sa->sin_port = udp_hdr(skb)->source; ++ salen = sizeof(*sa); ++ break; ++ case AF_INET6: ++ sa6 = (struct sockaddr_in6 *)&ss; ++ sa6->sin6_family = AF_INET6; ++ sa6->sin6_addr = ipv6_hdr(skb)->saddr; ++ sa6->sin6_port = udp_hdr(skb)->source; ++ sa6->sin6_scope_id = ipv6_iface_scope_id(&ipv6_hdr(skb)->saddr, ++ skb->skb_iif); ++ salen = sizeof(*sa6); ++ break; ++ default: ++ goto unlock; ++ } ++ ++ netdev_dbg(peer->ovpn->dev, "%s: peer %d floated to %pIScp", __func__, ++ peer->id, &ss); ++ ovpn_peer_reset_sockaddr(peer, (struct sockaddr_storage *)&ss, ++ local_ip); ++ spin_unlock_bh(&peer->lock); ++ rcu_read_unlock(); ++ ++ /* rehashing is required only in MP mode as P2P has one peer ++ * only and thus there is no hashtable ++ */ ++ if (peer->ovpn->mode == OVPN_MODE_MP) { ++ spin_lock_bh(&peer->ovpn->peers->lock); ++ /* remove old hashing */ ++ hlist_nulls_del_init_rcu(&peer->hash_entry_transp_addr); ++ /* re-add with new transport address */ ++ nhead = ovpn_get_hash_head(peer->ovpn->peers->by_transp_addr, ++ &ss, salen); ++ hlist_nulls_add_head_rcu(&peer->hash_entry_transp_addr, nhead); ++ spin_unlock_bh(&peer->ovpn->peers->lock); ++ } ++ return; ++unlock: ++ spin_unlock_bh(&peer->lock); ++ rcu_read_unlock(); ++} ++ ++void ovpn_peer_release(struct ovpn_peer 
*peer) ++{ ++ if (peer->sock) ++ ovpn_socket_put(peer->sock); ++ ++ ovpn_crypto_state_release(&peer->crypto); ++ spin_lock_bh(&peer->lock); ++ ovpn_bind_reset(peer, NULL); ++ spin_unlock_bh(&peer->lock); ++ ++ dst_cache_destroy(&peer->dst_cache); ++ netdev_put(peer->ovpn->dev, &peer->ovpn->dev_tracker); ++ kfree_rcu(peer, rcu); ++} ++ ++/** ++ * ovpn_peer_release_kref - callback for kref_put ++ * @kref: the kref object belonging to the peer ++ */ ++void ovpn_peer_release_kref(struct kref *kref) ++{ ++ struct ovpn_peer *peer = container_of(kref, struct ovpn_peer, refcount); ++ ++ ovpn_nl_peer_del_notify(peer); ++ ovpn_peer_release(peer); ++} ++ ++/** ++ * ovpn_peer_skb_to_sockaddr - fill sockaddr with skb source address ++ * @skb: the packet to extract data from ++ * @ss: the sockaddr to fill ++ * ++ * Return: true on success or false otherwise ++ */ ++static bool ovpn_peer_skb_to_sockaddr(struct sk_buff *skb, ++ struct sockaddr_storage *ss) ++{ ++ struct sockaddr_in6 *sa6; ++ struct sockaddr_in *sa4; ++ ++ ss->ss_family = skb_protocol_to_family(skb); ++ switch (ss->ss_family) { ++ case AF_INET: ++ sa4 = (struct sockaddr_in *)ss; ++ sa4->sin_family = AF_INET; ++ sa4->sin_addr.s_addr = ip_hdr(skb)->saddr; ++ sa4->sin_port = udp_hdr(skb)->source; ++ break; ++ case AF_INET6: ++ sa6 = (struct sockaddr_in6 *)ss; ++ sa6->sin6_family = AF_INET6; ++ sa6->sin6_addr = ipv6_hdr(skb)->saddr; ++ sa6->sin6_port = udp_hdr(skb)->source; ++ break; ++ default: ++ return false; ++ } ++ ++ return true; ++} ++ ++/** ++ * ovpn_nexthop_from_skb4 - retrieve IPv4 nexthop for outgoing skb ++ * @skb: the outgoing packet ++ * ++ * Return: the IPv4 of the nexthop ++ */ ++static __be32 ovpn_nexthop_from_skb4(struct sk_buff *skb) ++{ ++ const struct rtable *rt = skb_rtable(skb); ++ ++ if (rt && rt->rt_uses_gateway) ++ return rt->rt_gw4; ++ ++ return ip_hdr(skb)->daddr; ++} ++ ++/** ++ * ovpn_nexthop_from_skb6 - retrieve IPv6 nexthop for outgoing skb ++ * @skb: the outgoing packet ++ * ++ * Return: the IPv6 of the nexthop ++ */ ++static struct in6_addr ovpn_nexthop_from_skb6(struct sk_buff *skb) ++{ ++ const struct rt6_info *rt = skb_rt6_info(skb); ++ ++ if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) ++ return ipv6_hdr(skb)->daddr; ++ ++ return rt->rt6i_gateway; ++} ++ ++/** ++ * ovpn_peer_get_by_vpn_addr4 - retrieve peer by its VPN IPv4 address ++ * @ovpn: the openvpn instance to search ++ * @addr: VPN IPv4 to use as search key ++ * ++ * Refcounter is not increased for the returned peer. ++ * ++ * Return: the peer if found or NULL otherwise ++ */ ++static struct ovpn_peer *ovpn_peer_get_by_vpn_addr4(struct ovpn_struct *ovpn, ++ __be32 addr) ++{ ++ struct hlist_nulls_head *nhead; ++ struct hlist_nulls_node *ntmp; ++ struct ovpn_peer *tmp; ++ ++ nhead = ovpn_get_hash_head(ovpn->peers->by_vpn_addr, &addr, ++ sizeof(addr)); ++ ++ hlist_nulls_for_each_entry_rcu(tmp, ntmp, nhead, hash_entry_addr4) ++ if (addr == tmp->vpn_addrs.ipv4.s_addr) ++ return tmp; ++ ++ return NULL; ++} ++ ++/** ++ * ovpn_peer_get_by_vpn_addr6 - retrieve peer by its VPN IPv6 address ++ * @ovpn: the openvpn instance to search ++ * @addr: VPN IPv6 to use as search key ++ * ++ * Refcounter is not increased for the returned peer. 
++ * ++ * Return: the peer if found or NULL otherwise ++ */ ++static struct ovpn_peer *ovpn_peer_get_by_vpn_addr6(struct ovpn_struct *ovpn, ++ struct in6_addr *addr) ++{ ++ struct hlist_nulls_head *nhead; ++ struct hlist_nulls_node *ntmp; ++ struct ovpn_peer *tmp; ++ ++ nhead = ovpn_get_hash_head(ovpn->peers->by_vpn_addr, addr, ++ sizeof(*addr)); ++ ++ hlist_nulls_for_each_entry_rcu(tmp, ntmp, nhead, hash_entry_addr6) ++ if (ipv6_addr_equal(addr, &tmp->vpn_addrs.ipv6)) ++ return tmp; ++ ++ return NULL; ++} ++ ++/** ++ * ovpn_peer_transp_match - check if sockaddr and peer binding match ++ * @peer: the peer to get the binding from ++ * @ss: the sockaddr to match ++ * ++ * Return: true if sockaddr and binding match or false otherwise ++ */ ++static bool ovpn_peer_transp_match(const struct ovpn_peer *peer, ++ const struct sockaddr_storage *ss) ++{ ++ struct ovpn_bind *bind = rcu_dereference(peer->bind); ++ struct sockaddr_in6 *sa6; ++ struct sockaddr_in *sa4; ++ ++ if (unlikely(!bind)) ++ return false; ++ ++ if (ss->ss_family != bind->remote.in4.sin_family) ++ return false; ++ ++ switch (ss->ss_family) { ++ case AF_INET: ++ sa4 = (struct sockaddr_in *)ss; ++ if (sa4->sin_addr.s_addr != bind->remote.in4.sin_addr.s_addr) ++ return false; ++ if (sa4->sin_port != bind->remote.in4.sin_port) ++ return false; ++ break; ++ case AF_INET6: ++ sa6 = (struct sockaddr_in6 *)ss; ++ if (!ipv6_addr_equal(&sa6->sin6_addr, ++ &bind->remote.in6.sin6_addr)) ++ return false; ++ if (sa6->sin6_port != bind->remote.in6.sin6_port) ++ return false; ++ break; ++ default: ++ return false; ++ } ++ ++ return true; ++} ++ ++/** ++ * ovpn_peer_get_by_transp_addr_p2p - get peer by transport address in a P2P ++ * instance ++ * @ovpn: the openvpn instance to search ++ * @ss: the transport socket address ++ * ++ * Return: the peer if found or NULL otherwise ++ */ ++static struct ovpn_peer * ++ovpn_peer_get_by_transp_addr_p2p(struct ovpn_struct *ovpn, ++ struct sockaddr_storage *ss) ++{ ++ struct ovpn_peer *tmp, *peer = NULL; ++ ++ rcu_read_lock(); ++ tmp = rcu_dereference(ovpn->peer); ++ if (likely(tmp && ovpn_peer_transp_match(tmp, ss) && ++ ovpn_peer_hold(tmp))) ++ peer = tmp; ++ rcu_read_unlock(); ++ ++ return peer; ++} ++ ++/** ++ * ovpn_peer_get_by_transp_addr - retrieve peer by transport address ++ * @ovpn: the openvpn instance to search ++ * @skb: the skb to retrieve the source transport address from ++ * ++ * Return: a pointer to the peer if found or NULL otherwise ++ */ ++struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_struct *ovpn, ++ struct sk_buff *skb) ++{ ++ struct ovpn_peer *tmp, *peer = NULL; ++ struct sockaddr_storage ss = { 0 }; ++ struct hlist_nulls_head *nhead; ++ struct hlist_nulls_node *ntmp; ++ size_t sa_len; ++ ++ if (unlikely(!ovpn_peer_skb_to_sockaddr(skb, &ss))) ++ return NULL; ++ ++ if (ovpn->mode == OVPN_MODE_P2P) ++ return ovpn_peer_get_by_transp_addr_p2p(ovpn, &ss); ++ ++ switch (ss.ss_family) { ++ case AF_INET: ++ sa_len = sizeof(struct sockaddr_in); ++ break; ++ case AF_INET6: ++ sa_len = sizeof(struct sockaddr_in6); ++ break; ++ default: ++ return NULL; ++ } ++ ++ nhead = ovpn_get_hash_head(ovpn->peers->by_transp_addr, &ss, sa_len); ++ ++ rcu_read_lock(); ++ hlist_nulls_for_each_entry_rcu(tmp, ntmp, nhead, ++ hash_entry_transp_addr) { ++ if (!ovpn_peer_transp_match(tmp, &ss)) ++ continue; ++ ++ if (!ovpn_peer_hold(tmp)) ++ continue; ++ ++ peer = tmp; ++ break; ++ } ++ rcu_read_unlock(); ++ ++ return peer; ++} ++ ++/** ++ * ovpn_peer_get_by_id_p2p - get peer by ID in a P2P instance ++ 
* @ovpn: the openvpn instance to search ++ * @peer_id: the ID of the peer to find ++ * ++ * Return: the peer if found or NULL otherwise ++ */ ++static struct ovpn_peer *ovpn_peer_get_by_id_p2p(struct ovpn_struct *ovpn, ++ u32 peer_id) ++{ ++ struct ovpn_peer *tmp, *peer = NULL; ++ ++ rcu_read_lock(); ++ tmp = rcu_dereference(ovpn->peer); ++ if (likely(tmp && tmp->id == peer_id && ovpn_peer_hold(tmp))) ++ peer = tmp; ++ rcu_read_unlock(); ++ ++ return peer; ++} ++ ++/** ++ * ovpn_peer_get_by_id - retrieve peer by ID ++ * @ovpn: the openvpn instance to search ++ * @peer_id: the unique peer identifier to match ++ * ++ * Return: a pointer to the peer if found or NULL otherwise ++ */ ++struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_struct *ovpn, u32 peer_id) ++{ ++ struct ovpn_peer *tmp, *peer = NULL; ++ struct hlist_head *head; ++ ++ if (ovpn->mode == OVPN_MODE_P2P) ++ return ovpn_peer_get_by_id_p2p(ovpn, peer_id); ++ ++ head = ovpn_get_hash_head(ovpn->peers->by_id, &peer_id, ++ sizeof(peer_id)); ++ ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(tmp, head, hash_entry_id) { ++ if (tmp->id != peer_id) ++ continue; ++ ++ if (!ovpn_peer_hold(tmp)) ++ continue; ++ ++ peer = tmp; ++ break; ++ } ++ rcu_read_unlock(); ++ ++ return peer; ++} ++ ++/** ++ * ovpn_peer_update_local_endpoint - update local endpoint for peer ++ * @peer: peer to update the endpoint for ++ * @skb: incoming packet to retrieve the destination address (local) from ++ */ ++void ovpn_peer_update_local_endpoint(struct ovpn_peer *peer, ++ struct sk_buff *skb) ++{ ++ struct ovpn_bind *bind; ++ ++ rcu_read_lock(); ++ bind = rcu_dereference(peer->bind); ++ if (unlikely(!bind)) ++ goto unlock; ++ ++ spin_lock_bh(&peer->lock); ++ switch (skb_protocol_to_family(skb)) { ++ case AF_INET: ++ if (unlikely(bind->local.ipv4.s_addr != ip_hdr(skb)->daddr)) { ++ netdev_dbg(peer->ovpn->dev, ++ "%s: learning local IPv4 for peer %d (%pI4 -> %pI4)\n", ++ __func__, peer->id, &bind->local.ipv4.s_addr, ++ &ip_hdr(skb)->daddr); ++ bind->local.ipv4.s_addr = ip_hdr(skb)->daddr; ++ } ++ break; ++ case AF_INET6: ++ if (unlikely(!ipv6_addr_equal(&bind->local.ipv6, ++ &ipv6_hdr(skb)->daddr))) { ++ netdev_dbg(peer->ovpn->dev, ++ "%s: learning local IPv6 for peer %d (%pI6c -> %pI6c\n", ++ __func__, peer->id, &bind->local.ipv6, ++ &ipv6_hdr(skb)->daddr); ++ bind->local.ipv6 = ipv6_hdr(skb)->daddr; ++ } ++ break; ++ default: ++ break; ++ } ++ spin_unlock_bh(&peer->lock); ++ ++unlock: ++ rcu_read_unlock(); ++} ++ ++/** ++ * ovpn_peer_get_by_dst - Lookup peer to send skb to ++ * @ovpn: the private data representing the current VPN session ++ * @skb: the skb to extract the destination address from ++ * ++ * This function takes a tunnel packet and looks up the peer to send it to ++ * after encapsulation. The skb is expected to be the in-tunnel packet, without ++ * any OpenVPN related header. ++ * ++ * Assume that the IP header is accessible in the skb data. ++ * ++ * Return: the peer if found or NULL otherwise. 
++ */ ++struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_struct *ovpn, ++ struct sk_buff *skb) ++{ ++ struct ovpn_peer *peer = NULL; ++ struct in6_addr addr6; ++ __be32 addr4; ++ ++ /* in P2P mode, no matter the destination, packets are always sent to ++ * the single peer listening on the other side ++ */ ++ if (ovpn->mode == OVPN_MODE_P2P) { ++ rcu_read_lock(); ++ peer = rcu_dereference(ovpn->peer); ++ if (unlikely(peer && !ovpn_peer_hold(peer))) ++ peer = NULL; ++ rcu_read_unlock(); ++ return peer; ++ } ++ ++ rcu_read_lock(); ++ switch (skb_protocol_to_family(skb)) { ++ case AF_INET: ++ addr4 = ovpn_nexthop_from_skb4(skb); ++ peer = ovpn_peer_get_by_vpn_addr4(ovpn, addr4); ++ break; ++ case AF_INET6: ++ addr6 = ovpn_nexthop_from_skb6(skb); ++ peer = ovpn_peer_get_by_vpn_addr6(ovpn, &addr6); ++ break; ++ } ++ ++ if (unlikely(peer && !ovpn_peer_hold(peer))) ++ peer = NULL; ++ rcu_read_unlock(); ++ ++ return peer; ++} ++ ++/** ++ * ovpn_nexthop_from_rt4 - look up the IPv4 nexthop for the given destination ++ * @ovpn: the private data representing the current VPN session ++ * @dest: the destination to be looked up ++ * ++ * Looks up in the IPv4 system routing table the IP of the nexthop to be used ++ * to reach the destination passed as argument. If no nexthop can be found, the ++ * destination itself is returned as it probably has to be used as nexthop. ++ * ++ * Return: the IP of the next hop if found or dest itself otherwise ++ */ ++static __be32 ovpn_nexthop_from_rt4(struct ovpn_struct *ovpn, __be32 dest) ++{ ++ struct rtable *rt; ++ struct flowi4 fl = { ++ .daddr = dest ++ }; ++ ++ rt = ip_route_output_flow(dev_net(ovpn->dev), &fl, NULL); ++ if (IS_ERR(rt)) { ++ net_dbg_ratelimited("%s: no route to host %pI4\n", __func__, ++ &dest); ++ /* if we end up here this packet is probably going to be ++ * thrown away later ++ */ ++ return dest; ++ } ++ ++ if (!rt->rt_uses_gateway) ++ goto out; ++ ++ dest = rt->rt_gw4; ++out: ++ ip_rt_put(rt); ++ return dest; ++} ++ ++/** ++ * ovpn_nexthop_from_rt6 - look up the IPv6 nexthop for the given destination ++ * @ovpn: the private data representing the current VPN session ++ * @dest: the destination to be looked up ++ * ++ * Looks up in the IPv6 system routing table the IP of the nexthop to be used ++ * to reach the destination passed as argument. If no nexthop can be found, the ++ * destination itself is returned as it probably has to be used as nexthop. 
++ * ++ * Return: the IP of the next hop if found or dest itself otherwise ++ */ ++static struct in6_addr ovpn_nexthop_from_rt6(struct ovpn_struct *ovpn, ++ struct in6_addr dest) ++{ ++#if IS_ENABLED(CONFIG_IPV6) ++ struct dst_entry *entry; ++ struct rt6_info *rt; ++ struct flowi6 fl = { ++ .daddr = dest, ++ }; ++ ++ entry = ipv6_stub->ipv6_dst_lookup_flow(dev_net(ovpn->dev), NULL, &fl, ++ NULL); ++ if (IS_ERR(entry)) { ++ net_dbg_ratelimited("%s: no route to host %pI6c\n", __func__, ++ &dest); ++ /* if we end up here this packet is probably going to be ++ * thrown away later ++ */ ++ return dest; ++ } ++ ++ rt = dst_rt6_info(entry); ++ ++ if (!(rt->rt6i_flags & RTF_GATEWAY)) ++ goto out; ++ ++ dest = rt->rt6i_gateway; ++out: ++ dst_release((struct dst_entry *)rt); ++#endif ++ return dest; ++} ++ ++/** ++ * ovpn_peer_check_by_src - check that skb source is routed via peer ++ * @ovpn: the openvpn instance to search ++ * @skb: the packet to extract source address from ++ * @peer: the peer to check against the source address ++ * ++ * Return: true if the peer is matching or false otherwise ++ */ ++bool ovpn_peer_check_by_src(struct ovpn_struct *ovpn, struct sk_buff *skb, ++ struct ovpn_peer *peer) ++{ ++ bool match = false; ++ struct in6_addr addr6; ++ __be32 addr4; ++ ++ if (ovpn->mode == OVPN_MODE_P2P) { ++ /* in P2P mode, no matter the destination, packets are always ++ * sent to the single peer listening on the other side ++ */ ++ rcu_read_lock(); ++ match = (peer == rcu_dereference(ovpn->peer)); ++ rcu_read_unlock(); ++ return match; ++ } ++ ++ /* This function performs a reverse path check, therefore we now ++ * lookup the nexthop we would use if we wanted to route a packet ++ * to the source IP. If the nexthop matches the sender we know the ++ * latter is valid and we allow the packet to come in ++ */ ++ ++ switch (skb_protocol_to_family(skb)) { ++ case AF_INET: ++ addr4 = ovpn_nexthop_from_rt4(ovpn, ip_hdr(skb)->saddr); ++ rcu_read_lock(); ++ match = (peer == ovpn_peer_get_by_vpn_addr4(ovpn, addr4)); ++ rcu_read_unlock(); ++ break; ++ case AF_INET6: ++ addr6 = ovpn_nexthop_from_rt6(ovpn, ipv6_hdr(skb)->saddr); ++ rcu_read_lock(); ++ match = (peer == ovpn_peer_get_by_vpn_addr6(ovpn, &addr6)); ++ rcu_read_unlock(); ++ break; ++ } ++ ++ return match; ++} ++ ++void ovpn_peer_hash_vpn_ip(struct ovpn_peer *peer) ++ __must_hold(&peer->ovpn->peers->lock) ++{ ++ struct hlist_nulls_head *nhead; ++ ++ if (peer->vpn_addrs.ipv4.s_addr != htonl(INADDR_ANY)) { ++ /* remove potential old hashing */ ++ hlist_nulls_del_init_rcu(&peer->hash_entry_transp_addr); ++ ++ nhead = ovpn_get_hash_head(peer->ovpn->peers->by_vpn_addr, ++ &peer->vpn_addrs.ipv4, ++ sizeof(peer->vpn_addrs.ipv4)); ++ hlist_nulls_add_head_rcu(&peer->hash_entry_addr4, nhead); ++ } ++ ++ if (!ipv6_addr_any(&peer->vpn_addrs.ipv6)) { ++ /* remove potential old hashing */ ++ hlist_nulls_del_init_rcu(&peer->hash_entry_transp_addr); ++ ++ nhead = ovpn_get_hash_head(peer->ovpn->peers->by_vpn_addr, ++ &peer->vpn_addrs.ipv6, ++ sizeof(peer->vpn_addrs.ipv6)); ++ hlist_nulls_add_head_rcu(&peer->hash_entry_addr6, nhead); ++ } ++} ++ ++/** ++ * ovpn_peer_add_mp - add peer to related tables in a MP instance ++ * @ovpn: the instance to add the peer to ++ * @peer: the peer to add ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++static int ovpn_peer_add_mp(struct ovpn_struct *ovpn, struct ovpn_peer *peer) ++{ ++ struct sockaddr_storage sa = { 0 }; ++ struct hlist_nulls_head *nhead; ++ struct sockaddr_in6 *sa6; ++ struct 
sockaddr_in *sa4; ++ struct ovpn_bind *bind; ++ struct ovpn_peer *tmp; ++ size_t salen; ++ int ret = 0; ++ ++ spin_lock_bh(&ovpn->peers->lock); ++ /* do not add duplicates */ ++ tmp = ovpn_peer_get_by_id(ovpn, peer->id); ++ if (tmp) { ++ ovpn_peer_put(tmp); ++ ret = -EEXIST; ++ goto out; ++ } ++ ++ bind = rcu_dereference_protected(peer->bind, true); ++ /* peers connected via TCP have bind == NULL */ ++ if (bind) { ++ switch (bind->remote.in4.sin_family) { ++ case AF_INET: ++ sa4 = (struct sockaddr_in *)&sa; ++ ++ sa4->sin_family = AF_INET; ++ sa4->sin_addr.s_addr = bind->remote.in4.sin_addr.s_addr; ++ sa4->sin_port = bind->remote.in4.sin_port; ++ salen = sizeof(*sa4); ++ break; ++ case AF_INET6: ++ sa6 = (struct sockaddr_in6 *)&sa; ++ ++ sa6->sin6_family = AF_INET6; ++ sa6->sin6_addr = bind->remote.in6.sin6_addr; ++ sa6->sin6_port = bind->remote.in6.sin6_port; ++ salen = sizeof(*sa6); ++ break; ++ default: ++ ret = -EPROTONOSUPPORT; ++ goto out; ++ } ++ ++ nhead = ovpn_get_hash_head(ovpn->peers->by_transp_addr, &sa, ++ salen); ++ hlist_nulls_add_head_rcu(&peer->hash_entry_transp_addr, nhead); ++ } ++ ++ hlist_add_head_rcu(&peer->hash_entry_id, ++ ovpn_get_hash_head(ovpn->peers->by_id, &peer->id, ++ sizeof(peer->id))); ++ ++ ovpn_peer_hash_vpn_ip(peer); ++out: ++ spin_unlock_bh(&ovpn->peers->lock); ++ return ret; ++} ++ ++/** ++ * ovpn_peer_add_p2p - add peer to related tables in a P2P instance ++ * @ovpn: the instance to add the peer to ++ * @peer: the peer to add ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++static int ovpn_peer_add_p2p(struct ovpn_struct *ovpn, struct ovpn_peer *peer) ++{ ++ struct ovpn_peer *tmp; ++ ++ spin_lock_bh(&ovpn->lock); ++ /* in p2p mode it is possible to have a single peer only, therefore the ++ * old one is released and substituted by the new one ++ */ ++ tmp = rcu_dereference_protected(ovpn->peer, ++ lockdep_is_held(&ovpn->lock)); ++ if (tmp) { ++ tmp->delete_reason = OVPN_DEL_PEER_REASON_TEARDOWN; ++ ovpn_peer_put(tmp); ++ } ++ ++ rcu_assign_pointer(ovpn->peer, peer); ++ spin_unlock_bh(&ovpn->lock); ++ ++ return 0; ++} ++ ++/** ++ * ovpn_peer_add - add peer to the related tables ++ * @ovpn: the openvpn instance the peer belongs to ++ * @peer: the peer object to add ++ * ++ * Assume refcounter was increased by caller ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++int ovpn_peer_add(struct ovpn_struct *ovpn, struct ovpn_peer *peer) ++{ ++ switch (ovpn->mode) { ++ case OVPN_MODE_MP: ++ return ovpn_peer_add_mp(ovpn, peer); ++ case OVPN_MODE_P2P: ++ return ovpn_peer_add_p2p(ovpn, peer); ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++/** ++ * ovpn_peer_unhash - remove peer reference from all hashtables ++ * @peer: the peer to remove ++ * @reason: the delete reason to attach to the peer ++ */ ++static void ovpn_peer_unhash(struct ovpn_peer *peer, ++ enum ovpn_del_peer_reason reason) ++ __must_hold(&ovpn->peers->lock) ++{ ++ hlist_del_init_rcu(&peer->hash_entry_id); ++ ++ hlist_nulls_del_init_rcu(&peer->hash_entry_addr4); ++ hlist_nulls_del_init_rcu(&peer->hash_entry_addr6); ++ hlist_nulls_del_init_rcu(&peer->hash_entry_transp_addr); ++ ++ ovpn_peer_put(peer); ++ peer->delete_reason = reason; ++} ++ ++/** ++ * ovpn_peer_del_mp - delete peer from related tables in a MP instance ++ * @peer: the peer to delete ++ * @reason: reason why the peer was deleted (sent to userspace) ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++static int ovpn_peer_del_mp(struct ovpn_peer *peer, ++ enum 
ovpn_del_peer_reason reason) ++ __must_hold(&peer->ovpn->peers->lock) ++{ ++ struct ovpn_peer *tmp; ++ int ret = -ENOENT; ++ ++ tmp = ovpn_peer_get_by_id(peer->ovpn, peer->id); ++ if (tmp == peer) { ++ ovpn_peer_unhash(peer, reason); ++ ret = 0; ++ } ++ ++ if (tmp) ++ ovpn_peer_put(tmp); ++ ++ return ret; ++} ++ ++/** ++ * ovpn_peer_del_p2p - delete peer from related tables in a P2P instance ++ * @peer: the peer to delete ++ * @reason: reason why the peer was deleted (sent to userspace) ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++static int ovpn_peer_del_p2p(struct ovpn_peer *peer, ++ enum ovpn_del_peer_reason reason) ++ __must_hold(&peer->ovpn->lock) ++{ ++ struct ovpn_peer *tmp; ++ ++ tmp = rcu_dereference_protected(peer->ovpn->peer, ++ lockdep_is_held(&peer->ovpn->lock)); ++ if (tmp != peer) { ++ DEBUG_NET_WARN_ON_ONCE(1); ++ if (tmp) ++ ovpn_peer_put(tmp); ++ ++ return -ENOENT; ++ } ++ ++ tmp->delete_reason = reason; ++ RCU_INIT_POINTER(peer->ovpn->peer, NULL); ++ ovpn_peer_put(tmp); ++ ++ return 0; ++} ++ ++/** ++ * ovpn_peer_release_p2p - release peer upon P2P device teardown ++ * @ovpn: the instance being torn down ++ */ ++void ovpn_peer_release_p2p(struct ovpn_struct *ovpn) ++{ ++ struct ovpn_peer *tmp; ++ ++ spin_lock_bh(&ovpn->lock); ++ tmp = rcu_dereference_protected(ovpn->peer, ++ lockdep_is_held(&ovpn->lock)); ++ if (tmp) ++ ovpn_peer_del_p2p(tmp, OVPN_DEL_PEER_REASON_TEARDOWN); ++ spin_unlock_bh(&ovpn->lock); ++} ++ ++/** ++ * ovpn_peer_del - delete peer from related tables ++ * @peer: the peer object to delete ++ * @reason: reason for deleting peer (will be sent to userspace) ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason) ++{ ++ int ret; ++ ++ switch (peer->ovpn->mode) { ++ case OVPN_MODE_MP: ++ spin_lock_bh(&peer->ovpn->peers->lock); ++ ret = ovpn_peer_del_mp(peer, reason); ++ spin_unlock_bh(&peer->ovpn->peers->lock); ++ return ret; ++ case OVPN_MODE_P2P: ++ spin_lock_bh(&peer->ovpn->lock); ++ ret = ovpn_peer_del_p2p(peer, reason); ++ spin_unlock_bh(&peer->ovpn->lock); ++ return ret; ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static int ovpn_peer_del_nolock(struct ovpn_peer *peer, ++ enum ovpn_del_peer_reason reason) ++{ ++ switch (peer->ovpn->mode) { ++ case OVPN_MODE_MP: ++ return ovpn_peer_del_mp(peer, reason); ++ case OVPN_MODE_P2P: ++ return ovpn_peer_del_p2p(peer, reason); ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++/** ++ * ovpn_peers_free - free all peers in the instance ++ * @ovpn: the instance whose peers should be released ++ */ ++void ovpn_peers_free(struct ovpn_struct *ovpn) ++{ ++ struct hlist_node *tmp; ++ struct ovpn_peer *peer; ++ int bkt; ++ ++ spin_lock_bh(&ovpn->peers->lock); ++ hash_for_each_safe(ovpn->peers->by_id, bkt, tmp, peer, hash_entry_id) ++ ovpn_peer_unhash(peer, OVPN_DEL_PEER_REASON_TEARDOWN); ++ spin_unlock_bh(&ovpn->peers->lock); ++} ++ ++static time64_t ovpn_peer_keepalive_work_single(struct ovpn_peer *peer, ++ time64_t now) ++{ ++ time64_t next_run1, next_run2, delta; ++ unsigned long timeout, interval; ++ bool expired; ++ ++ spin_lock_bh(&peer->lock); ++ /* we expect both timers to be configured at the same time, ++ * therefore bail out if either is not set ++ */ ++ if (!peer->keepalive_timeout || !peer->keepalive_interval) { ++ spin_unlock_bh(&peer->lock); ++ return 0; ++ } ++ ++ /* check for peer timeout */ ++ expired = false; ++ timeout = peer->keepalive_timeout; ++ delta = now - 
peer->last_recv; ++ if (delta < timeout) { ++ peer->keepalive_recv_exp = now + timeout - delta; ++ next_run1 = peer->keepalive_recv_exp; ++ } else if (peer->keepalive_recv_exp > now) { ++ next_run1 = peer->keepalive_recv_exp; ++ } else { ++ expired = true; ++ } ++ ++ if (expired) { ++ /* peer is dead -> kill it and move on */ ++ spin_unlock_bh(&peer->lock); ++ netdev_dbg(peer->ovpn->dev, "peer %u expired\n", ++ peer->id); ++ ovpn_peer_del_nolock(peer, OVPN_DEL_PEER_REASON_EXPIRED); ++ return 0; ++ } ++ ++ /* check for peer keepalive */ ++ expired = false; ++ interval = peer->keepalive_interval; ++ delta = now - peer->last_sent; ++ if (delta < interval) { ++ peer->keepalive_xmit_exp = now + interval - delta; ++ next_run2 = peer->keepalive_xmit_exp; ++ } else if (peer->keepalive_xmit_exp > now) { ++ next_run2 = peer->keepalive_xmit_exp; ++ } else { ++ expired = true; ++ next_run2 = now + interval; ++ } ++ spin_unlock_bh(&peer->lock); ++ ++ if (expired) { ++ /* a keepalive packet is required */ ++ netdev_dbg(peer->ovpn->dev, ++ "sending keepalive to peer %u\n", ++ peer->id); ++ ovpn_xmit_special(peer, ovpn_keepalive_message, ++ sizeof(ovpn_keepalive_message)); ++ } ++ ++ if (next_run1 < next_run2) ++ return next_run1; ++ ++ return next_run2; ++} ++ ++static time64_t ovpn_peer_keepalive_work_mp(struct ovpn_struct *ovpn, ++ time64_t now) ++{ ++ time64_t tmp_next_run, next_run = 0; ++ struct hlist_node *tmp; ++ struct ovpn_peer *peer; ++ int bkt; ++ ++ spin_lock_bh(&ovpn->peers->lock); ++ hash_for_each_safe(ovpn->peers->by_id, bkt, tmp, peer, hash_entry_id) { ++ tmp_next_run = ovpn_peer_keepalive_work_single(peer, now); ++ if (!tmp_next_run) ++ continue; ++ ++ /* the next worker run will be scheduled based on the shortest ++ * required interval across all peers ++ */ ++ if (!next_run || tmp_next_run < next_run) ++ next_run = tmp_next_run; ++ } ++ spin_unlock_bh(&ovpn->peers->lock); ++ ++ return next_run; ++} ++ ++static time64_t ovpn_peer_keepalive_work_p2p(struct ovpn_struct *ovpn, ++ time64_t now) ++{ ++ struct ovpn_peer *peer; ++ time64_t next_run = 0; ++ ++ spin_lock_bh(&ovpn->lock); ++ peer = rcu_dereference_protected(ovpn->peer, ++ lockdep_is_held(&ovpn->lock)); ++ if (peer) ++ next_run = ovpn_peer_keepalive_work_single(peer, now); ++ spin_unlock_bh(&ovpn->lock); ++ ++ return next_run; ++} ++ ++/** ++ * ovpn_peer_keepalive_work - run keepalive logic on each known peer ++ * @work: pointer to the work member of the related ovpn object ++ * ++ * Each peer has two timers (if configured): ++ * 1. peer timeout: when no data is received for a certain interval, ++ * the peer is considered dead and it gets killed. ++ * 2. peer keepalive: when no data is sent to a certain peer for a ++ * certain interval, a special 'keepalive' packet is explicitly sent. ++ * ++ * This function iterates across the whole peer collection while ++ * checking the timers described above. 
++ */
++void ovpn_peer_keepalive_work(struct work_struct *work)
++{
++ struct ovpn_struct *ovpn = container_of(work, struct ovpn_struct,
++ keepalive_work.work);
++ time64_t next_run = 0, now = ktime_get_real_seconds();
++
++ switch (ovpn->mode) {
++ case OVPN_MODE_MP:
++ next_run = ovpn_peer_keepalive_work_mp(ovpn, now);
++ break;
++ case OVPN_MODE_P2P:
++ next_run = ovpn_peer_keepalive_work_p2p(ovpn, now);
++ break;
++ }
++
++ /* prevent rearming if the interface is being destroyed */
++ if (next_run > 0 && ovpn->registered) {
++ netdev_dbg(ovpn->dev,
++ "scheduling keepalive work: now=%llu next_run=%llu delta=%llu\n",
++ next_run, now, next_run - now);
++ schedule_delayed_work(&ovpn->keepalive_work,
++ (next_run - now) * HZ);
++ }
++}
+diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h
+new file mode 100644
+index 000000000000..1adecd0f79f8
+--- /dev/null
++++ b/drivers/net/ovpn/peer.h
+@@ -0,0 +1,165 @@
++/* SPDX-License-Identifier: GPL-2.0-only */
++/* OpenVPN data channel offload
++ *
++ * Copyright (C) 2020-2024 OpenVPN, Inc.
++ *
++ * Author: James Yonan
++ * Antonio Quartulli
++ */
++
++#ifndef _NET_OVPN_OVPNPEER_H_
++#define _NET_OVPN_OVPNPEER_H_
++
++#include
++#include
++
++#include "crypto.h"
++#include "stats.h"
++
++/**
++ * struct ovpn_peer - the main remote peer object
++ * @ovpn: main openvpn instance this peer belongs to
++ * @id: unique identifier
++ * @vpn_addrs: IP addresses assigned over the tunnel
++ * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel
++ * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel
++ * @hash_entry_id: entry in the peer ID hashtable
++ * @hash_entry_addr4: entry in the peer IPv4 hashtable
++ * @hash_entry_addr6: entry in the peer IPv6 hashtable
++ * @hash_entry_transp_addr: entry in the peer transport address hashtable
++ * @sock: the socket being used to talk to this peer
++ * @tcp: keeps track of TCP specific state
++ * @tcp.strp: stream parser context (TCP only)
++ * @tcp.tx_work: work for deferring outgoing packet processing (TCP only)
++ * @tcp.user_queue: received packets that have to go to userspace (TCP only)
++ * @tcp.tx_in_progress: true if TX is already ongoing (TCP only)
++ * @tcp.out_msg.skb: packet scheduled for sending (TCP only)
++ * @tcp.out_msg.offset: offset where next send should start (TCP only)
++ * @tcp.out_msg.len: remaining data to send within packet (TCP only)
++ * @tcp.sk_cb.sk_data_ready: pointer to original cb (TCP only)
++ * @tcp.sk_cb.sk_write_space: pointer to original cb (TCP only)
++ * @tcp.sk_cb.prot: pointer to original prot object (TCP only)
++ * @tcp.sk_cb.ops: pointer to the original prot_ops object (TCP only)
++ * @crypto: the crypto configuration (ciphers, keys, etc..)
++ * @dst_cache: cache for dst_entry used to send to peer
++ * @bind: remote peer binding
++ * @keepalive_interval: seconds after which a new keepalive should be sent
++ * @keepalive_xmit_exp: future timestamp when next keepalive should be sent
++ * @last_sent: timestamp of the last successfully sent packet
++ * @keepalive_timeout: seconds after which an inactive peer is considered dead
++ * @keepalive_recv_exp: future timestamp when the peer should expire
++ * @last_recv: timestamp of the last authenticated received packet
++ * @halt: true if ovpn_peer_mark_delete was called
++ * @vpn_stats: per-peer in-VPN TX/RX stats
++ * @link_stats: per-peer link/transport TX/RX stats
++ * @delete_reason: why peer was deleted (i.e. timeout, transport error, ..)
++ * @lock: protects binding to peer (bind) ++ * @refcount: reference counter ++ * @rcu: used to free peer in an RCU safe way ++ * @delete_work: deferred cleanup work, used to notify userspace ++ */ ++struct ovpn_peer { ++ struct ovpn_struct *ovpn; ++ u32 id; ++ struct { ++ struct in_addr ipv4; ++ struct in6_addr ipv6; ++ } vpn_addrs; ++ struct hlist_node hash_entry_id; ++ struct hlist_nulls_node hash_entry_addr4; ++ struct hlist_nulls_node hash_entry_addr6; ++ struct hlist_nulls_node hash_entry_transp_addr; ++ struct ovpn_socket *sock; ++ ++ /* state of the TCP reading. Needed to keep track of how much of a ++ * single packet has already been read from the stream and how much is ++ * missing ++ */ ++ struct { ++ struct strparser strp; ++ struct work_struct tx_work; ++ struct sk_buff_head user_queue; ++ bool tx_in_progress; ++ ++ struct { ++ struct sk_buff *skb; ++ int offset; ++ int len; ++ } out_msg; ++ ++ struct { ++ void (*sk_data_ready)(struct sock *sk); ++ void (*sk_write_space)(struct sock *sk); ++ struct proto *prot; ++ const struct proto_ops *ops; ++ } sk_cb; ++ } tcp; ++ struct ovpn_crypto_state crypto; ++ struct dst_cache dst_cache; ++ struct ovpn_bind __rcu *bind; ++ unsigned long keepalive_interval; ++ unsigned long keepalive_xmit_exp; ++ time64_t last_sent; ++ unsigned long keepalive_timeout; ++ unsigned long keepalive_recv_exp; ++ time64_t last_recv; ++ bool halt; ++ struct ovpn_peer_stats vpn_stats; ++ struct ovpn_peer_stats link_stats; ++ enum ovpn_del_peer_reason delete_reason; ++ spinlock_t lock; /* protects bind */ ++ struct kref refcount; ++ struct rcu_head rcu; ++ struct work_struct delete_work; ++}; ++ ++/** ++ * ovpn_peer_hold - increase reference counter ++ * @peer: the peer whose counter should be increased ++ * ++ * Return: true if the counter was increased or false if it was zero already ++ */ ++static inline bool ovpn_peer_hold(struct ovpn_peer *peer) ++{ ++ return kref_get_unless_zero(&peer->refcount); ++} ++ ++void ovpn_peer_release(struct ovpn_peer *peer); ++void ovpn_peer_release_kref(struct kref *kref); ++ ++/** ++ * ovpn_peer_put - decrease reference counter ++ * @peer: the peer whose counter should be decreased ++ */ ++static inline void ovpn_peer_put(struct ovpn_peer *peer) ++{ ++ kref_put(&peer->refcount, ovpn_peer_release_kref); ++} ++ ++struct ovpn_peer *ovpn_peer_new(struct ovpn_struct *ovpn, u32 id); ++int ovpn_peer_add(struct ovpn_struct *ovpn, struct ovpn_peer *peer); ++int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason); ++void ovpn_peer_release_p2p(struct ovpn_struct *ovpn); ++void ovpn_peers_free(struct ovpn_struct *ovpn); ++ ++struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_struct *ovpn, ++ struct sk_buff *skb); ++struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_struct *ovpn, u32 peer_id); ++struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_struct *ovpn, ++ struct sk_buff *skb); ++void ovpn_peer_hash_vpn_ip(struct ovpn_peer *peer); ++bool ovpn_peer_check_by_src(struct ovpn_struct *ovpn, struct sk_buff *skb, ++ struct ovpn_peer *peer); ++ ++void ovpn_peer_keepalive_set(struct ovpn_peer *peer, u32 interval, u32 timeout); ++void ovpn_peer_keepalive_work(struct work_struct *work); ++ ++void ovpn_peer_update_local_endpoint(struct ovpn_peer *peer, ++ struct sk_buff *skb); ++ ++void ovpn_peer_float(struct ovpn_peer *peer, struct sk_buff *skb); ++int ovpn_peer_reset_sockaddr(struct ovpn_peer *peer, ++ const struct sockaddr_storage *ss, ++ const u8 *local_ip); ++ ++#endif /* _NET_OVPN_OVPNPEER_H_ */ +diff --git 
a/drivers/net/ovpn/pktid.c b/drivers/net/ovpn/pktid.c +new file mode 100644 +index 000000000000..96dc87635670 +--- /dev/null ++++ b/drivers/net/ovpn/pktid.c +@@ -0,0 +1,130 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. ++ * ++ * Author: Antonio Quartulli ++ * James Yonan ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "ovpnstruct.h" ++#include "main.h" ++#include "packet.h" ++#include "pktid.h" ++ ++void ovpn_pktid_xmit_init(struct ovpn_pktid_xmit *pid) ++{ ++ atomic64_set(&pid->seq_num, 1); ++} ++ ++void ovpn_pktid_recv_init(struct ovpn_pktid_recv *pr) ++{ ++ memset(pr, 0, sizeof(*pr)); ++ spin_lock_init(&pr->lock); ++} ++ ++/* Packet replay detection. ++ * Allows ID backtrack of up to REPLAY_WINDOW_SIZE - 1. ++ */ ++int ovpn_pktid_recv(struct ovpn_pktid_recv *pr, u32 pkt_id, u32 pkt_time) ++{ ++ const unsigned long now = jiffies; ++ int ret; ++ ++ /* ID must not be zero */ ++ if (unlikely(pkt_id == 0)) ++ return -EINVAL; ++ ++ spin_lock_bh(&pr->lock); ++ ++ /* expire backtracks at or below pr->id after PKTID_RECV_EXPIRE time */ ++ if (unlikely(time_after_eq(now, pr->expire))) ++ pr->id_floor = pr->id; ++ ++ /* time changed? */ ++ if (unlikely(pkt_time != pr->time)) { ++ if (pkt_time > pr->time) { ++ /* time moved forward, accept */ ++ pr->base = 0; ++ pr->extent = 0; ++ pr->id = 0; ++ pr->time = pkt_time; ++ pr->id_floor = 0; ++ } else { ++ /* time moved backward, reject */ ++ ret = -ETIME; ++ goto out; ++ } ++ } ++ ++ if (likely(pkt_id == pr->id + 1)) { ++ /* well-formed ID sequence (incremented by 1) */ ++ pr->base = REPLAY_INDEX(pr->base, -1); ++ pr->history[pr->base / 8] |= (1 << (pr->base % 8)); ++ if (pr->extent < REPLAY_WINDOW_SIZE) ++ ++pr->extent; ++ pr->id = pkt_id; ++ } else if (pkt_id > pr->id) { ++ /* ID jumped forward by more than one */ ++ const unsigned int delta = pkt_id - pr->id; ++ ++ if (delta < REPLAY_WINDOW_SIZE) { ++ unsigned int i; ++ ++ pr->base = REPLAY_INDEX(pr->base, -delta); ++ pr->history[pr->base / 8] |= (1 << (pr->base % 8)); ++ pr->extent += delta; ++ if (pr->extent > REPLAY_WINDOW_SIZE) ++ pr->extent = REPLAY_WINDOW_SIZE; ++ for (i = 1; i < delta; ++i) { ++ unsigned int newb = REPLAY_INDEX(pr->base, i); ++ ++ pr->history[newb / 8] &= ~BIT(newb % 8); ++ } ++ } else { ++ pr->base = 0; ++ pr->extent = REPLAY_WINDOW_SIZE; ++ memset(pr->history, 0, sizeof(pr->history)); ++ pr->history[0] = 1; ++ } ++ pr->id = pkt_id; ++ } else { ++ /* ID backtrack */ ++ const unsigned int delta = pr->id - pkt_id; ++ ++ if (delta > pr->max_backtrack) ++ pr->max_backtrack = delta; ++ if (delta < pr->extent) { ++ if (pkt_id > pr->id_floor) { ++ const unsigned int ri = REPLAY_INDEX(pr->base, ++ delta); ++ u8 *p = &pr->history[ri / 8]; ++ const u8 mask = (1 << (ri % 8)); ++ ++ if (*p & mask) { ++ ret = -EINVAL; ++ goto out; ++ } ++ *p |= mask; ++ } else { ++ ret = -EINVAL; ++ goto out; ++ } ++ } else { ++ ret = -EINVAL; ++ goto out; ++ } ++ } ++ ++ pr->expire = now + PKTID_RECV_EXPIRE; ++ ret = 0; ++out: ++ spin_unlock_bh(&pr->lock); ++ return ret; ++} +diff --git a/drivers/net/ovpn/pktid.h b/drivers/net/ovpn/pktid.h +new file mode 100644 +index 000000000000..fe02f0667e1a +--- /dev/null ++++ b/drivers/net/ovpn/pktid.h +@@ -0,0 +1,87 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. 
++ *
++ * Author: Antonio Quartulli
++ * James Yonan
++ */
++
++#ifndef _NET_OVPN_OVPNPKTID_H_
++#define _NET_OVPN_OVPNPKTID_H_
++
++#include "packet.h"
++
++/* If no packets received for this length of time, set a backtrack floor
++ * at highest received packet ID thus far.
++ */
++#define PKTID_RECV_EXPIRE (30 * HZ)
++
++/* Packet-ID state for transmitter */
++struct ovpn_pktid_xmit {
++ atomic64_t seq_num;
++};
++
++/* replay window sizing in bytes = 2^REPLAY_WINDOW_ORDER */
++#define REPLAY_WINDOW_ORDER 8
++
++#define REPLAY_WINDOW_BYTES BIT(REPLAY_WINDOW_ORDER)
++#define REPLAY_WINDOW_SIZE (REPLAY_WINDOW_BYTES * 8)
++#define REPLAY_INDEX(base, i) (((base) + (i)) & (REPLAY_WINDOW_SIZE - 1))
++
++/* Packet-ID state for receiver.
++ * Other than lock member, can be zeroed to initialize.
++ */
++struct ovpn_pktid_recv {
++ /* "sliding window" bitmask of recent packet IDs received */
++ u8 history[REPLAY_WINDOW_BYTES];
++ /* bit position of deque base in history */
++ unsigned int base;
++ /* extent (in bits) of deque in history */
++ unsigned int extent;
++ /* expiration of history in jiffies */
++ unsigned long expire;
++ /* highest sequence number received */
++ u32 id;
++ /* highest time stamp received */
++ u32 time;
++ /* we will only accept backtrack IDs > id_floor */
++ u32 id_floor;
++ unsigned int max_backtrack;
++ /* protects entire packet ID state */
++ spinlock_t lock;
++};
++
++/* Get the next packet ID for xmit */
++static inline int ovpn_pktid_xmit_next(struct ovpn_pktid_xmit *pid, u32 *pktid)
++{
++ const s64 seq_num = atomic64_fetch_add_unless(&pid->seq_num, 1,
++ 0x100000000LL);
++ /* when the 32bit space is over, we return an error because the packet
++ * ID is used to create the cipher IV and we do not want to reuse the
++ * same value more than once
++ */
++ if (unlikely(seq_num == 0x100000000LL))
++ return -ERANGE;
++
++ *pktid = (u32)seq_num;
++
++ return 0;
++}
++
++/* Write 12-byte AEAD IV to dest */
++static inline void ovpn_pktid_aead_write(const u32 pktid,
++ const struct ovpn_nonce_tail *nt,
++ unsigned char *dest)
++{
++ *(__force __be32 *)(dest) = htonl(pktid);
++ BUILD_BUG_ON(4 + sizeof(struct ovpn_nonce_tail) != NONCE_SIZE);
++ memcpy(dest + 4, nt->u8, sizeof(struct ovpn_nonce_tail));
++}
++
++void ovpn_pktid_xmit_init(struct ovpn_pktid_xmit *pid);
++void ovpn_pktid_recv_init(struct ovpn_pktid_recv *pr);
++
++int ovpn_pktid_recv(struct ovpn_pktid_recv *pr, u32 pkt_id, u32 pkt_time);
++
++#endif /* _NET_OVPN_OVPNPKTID_H_ */
+diff --git a/drivers/net/ovpn/proto.h b/drivers/net/ovpn/proto.h
+new file mode 100644
+index 000000000000..0de8bafadc89
+--- /dev/null
++++ b/drivers/net/ovpn/proto.h
+@@ -0,0 +1,104 @@
++/* SPDX-License-Identifier: GPL-2.0-only */
++/* OpenVPN data channel offload
++ *
++ * Copyright (C) 2020-2024 OpenVPN, Inc.
++ */ ++ ++/* packet opcode (high 5 bits) and key-id (low 3 bits) are combined in ++ * one byte ++ */ ++#define OVPN_KEY_ID_MASK 0x07 ++#define OVPN_OPCODE_SHIFT 3 ++#define OVPN_OPCODE_MASK 0x1F ++/* upper bounds on opcode and key ID */ ++#define OVPN_KEY_ID_MAX (OVPN_KEY_ID_MASK + 1) ++#define OVPN_OPCODE_MAX (OVPN_OPCODE_MASK + 1) ++/* packet opcodes of interest to us */ ++#define OVPN_DATA_V1 6 /* data channel V1 packet */ ++#define OVPN_DATA_V2 9 /* data channel V2 packet */ ++/* size of initial packet opcode */ ++#define OVPN_OP_SIZE_V1 1 ++#define OVPN_OP_SIZE_V2 4 ++#define OVPN_PEER_ID_MASK 0x00FFFFFF ++#define OVPN_PEER_ID_UNDEF 0x00FFFFFF ++/* first byte of exit message */ ++#define OVPN_EXPLICIT_EXIT_NOTIFY_FIRST_BYTE 0x28 ++ ++/** ++ * ovpn_opcode_from_skb - extract OP code from skb at specified offset ++ * @skb: the packet to extract the OP code from ++ * @offset: the offset in the data buffer where the OP code is located ++ * ++ * Note: this function assumes that the skb head was pulled enough ++ * to access the first byte. ++ * ++ * Return: the OP code ++ */ ++static inline u8 ovpn_opcode_from_skb(const struct sk_buff *skb, u16 offset) ++{ ++ u8 byte = *(skb->data + offset); ++ ++ return byte >> OVPN_OPCODE_SHIFT; ++} ++ ++/** ++ * ovpn_peer_id_from_skb - extract peer ID from skb at specified offset ++ * @skb: the packet to extract the OP code from ++ * @offset: the offset in the data buffer where the OP code is located ++ * ++ * Note: this function assumes that the skb head was pulled enough ++ * to access the first 4 bytes. ++ * ++ * Return: the peer ID. ++ */ ++static inline u32 ovpn_peer_id_from_skb(const struct sk_buff *skb, u16 offset) ++{ ++ return ntohl(*(__be32 *)(skb->data + offset)) & OVPN_PEER_ID_MASK; ++} ++ ++/** ++ * ovpn_key_id_from_skb - extract key ID from the skb head ++ * @skb: the packet to extract the key ID code from ++ * ++ * Note: this function assumes that the skb head was pulled enough ++ * to access the first byte. ++ * ++ * Return: the key ID ++ */ ++static inline u8 ovpn_key_id_from_skb(const struct sk_buff *skb) ++{ ++ return *skb->data & OVPN_KEY_ID_MASK; ++} ++ ++/** ++ * ovpn_opcode_compose - combine OP code, key ID and peer ID to wire format ++ * @opcode: the OP code ++ * @key_id: the key ID ++ * @peer_id: the peer ID ++ * ++ * Return: a 4 bytes integer obtained combining all input values following the ++ * OpenVPN wire format. This integer can then be written to the packet header. ++ */ ++static inline u32 ovpn_opcode_compose(u8 opcode, u8 key_id, u32 peer_id) ++{ ++ const u8 op = (opcode << OVPN_OPCODE_SHIFT) | ++ (key_id & OVPN_KEY_ID_MASK); ++ ++ return (op << 24) | (peer_id & OVPN_PEER_ID_MASK); ++} ++ ++#endif /* _NET_OVPN_OVPNPROTO_H_ */ +diff --git a/drivers/net/ovpn/skb.h b/drivers/net/ovpn/skb.h +new file mode 100644 +index 000000000000..96afa01466ab +--- /dev/null ++++ b/drivers/net/ovpn/skb.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. 
++ * ++ * Author: Antonio Quartulli ++ * James Yonan ++ */ ++ ++#ifndef _NET_OVPN_SKB_H_ ++#define _NET_OVPN_SKB_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct ovpn_cb { ++ struct ovpn_peer *peer; ++ struct ovpn_crypto_key_slot *ks; ++ struct aead_request *req; ++ struct scatterlist *sg; ++ unsigned int orig_len; ++ unsigned int payload_offset; ++}; ++ ++static inline struct ovpn_cb *ovpn_skb_cb(struct sk_buff *skb) ++{ ++ BUILD_BUG_ON(sizeof(struct ovpn_cb) > sizeof(skb->cb)); ++ return (struct ovpn_cb *)skb->cb; ++} ++ ++/* Return IP protocol version from skb header. ++ * Return 0 if protocol is not IPv4/IPv6 or cannot be read. ++ */ ++static inline __be16 ovpn_ip_check_protocol(struct sk_buff *skb) ++{ ++ __be16 proto = 0; ++ ++ /* skb could be non-linear, ++ * make sure IP header is in non-fragmented part ++ */ ++ if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) ++ return 0; ++ ++ if (ip_hdr(skb)->version == 4) ++ proto = htons(ETH_P_IP); ++ else if (ip_hdr(skb)->version == 6) ++ proto = htons(ETH_P_IPV6); ++ ++ return proto; ++} ++ ++#endif /* _NET_OVPN_SKB_H_ */ +diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c +new file mode 100644 +index 000000000000..a0c2a02ff205 +--- /dev/null ++++ b/drivers/net/ovpn/socket.c +@@ -0,0 +1,178 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#include ++#include ++ ++#include "ovpnstruct.h" ++#include "main.h" ++#include "io.h" ++#include "peer.h" ++#include "socket.h" ++#include "tcp.h" ++#include "udp.h" ++ ++static void ovpn_socket_detach(struct socket *sock) ++{ ++ if (!sock) ++ return; ++ ++ if (sock->sk->sk_protocol == IPPROTO_UDP) ++ ovpn_udp_socket_detach(sock); ++ else if (sock->sk->sk_protocol == IPPROTO_TCP) ++ ovpn_tcp_socket_detach(sock); ++ ++ sockfd_put(sock); ++} ++ ++static void ovpn_socket_release_work(struct work_struct *work) ++{ ++ struct ovpn_socket *sock = container_of(work, struct ovpn_socket, work); ++ ++ ovpn_socket_detach(sock->sock); ++ kfree_rcu(sock, rcu); ++} ++ ++static void ovpn_socket_schedule_release(struct ovpn_socket *sock) ++{ ++ INIT_WORK(&sock->work, ovpn_socket_release_work); ++ schedule_work(&sock->work); ++} ++ ++/** ++ * ovpn_socket_release_kref - kref_put callback ++ * @kref: the kref object ++ */ ++void ovpn_socket_release_kref(struct kref *kref) ++{ ++ struct ovpn_socket *sock = container_of(kref, struct ovpn_socket, ++ refcount); ++ ++ ovpn_socket_schedule_release(sock); ++} ++ ++static bool ovpn_socket_hold(struct ovpn_socket *sock) ++{ ++ return kref_get_unless_zero(&sock->refcount); ++} ++ ++static struct ovpn_socket *ovpn_socket_get(struct socket *sock) ++{ ++ struct ovpn_socket *ovpn_sock; ++ ++ rcu_read_lock(); ++ ovpn_sock = rcu_dereference_sk_user_data(sock->sk); ++ if (!ovpn_socket_hold(ovpn_sock)) { ++ pr_warn("%s: found ovpn_socket with ref = 0\n", __func__); ++ ovpn_sock = NULL; ++ } ++ rcu_read_unlock(); ++ ++ return ovpn_sock; ++} ++ ++static int ovpn_socket_attach(struct socket *sock, struct ovpn_peer *peer) ++{ ++ int ret = -EOPNOTSUPP; ++ ++ if (!sock || !peer) ++ return -EINVAL; ++ ++ if (sock->sk->sk_protocol == IPPROTO_UDP) ++ ret = ovpn_udp_socket_attach(sock, peer->ovpn); ++ else if (sock->sk->sk_protocol == IPPROTO_TCP) ++ ret = ovpn_tcp_socket_attach(sock, peer); ++ ++ return ret; ++} ++ ++/* Retrieve the corresponding ovpn object from a UDP socket ++ * rcu_read_lock must be held on 
entry ++ */ ++struct ovpn_struct *ovpn_from_udp_sock(struct sock *sk) ++{ ++ struct ovpn_socket *ovpn_sock; ++ ++ if (unlikely(READ_ONCE(udp_sk(sk)->encap_type) != UDP_ENCAP_OVPNINUDP)) ++ return NULL; ++ ++ ovpn_sock = rcu_dereference_sk_user_data(sk); ++ if (unlikely(!ovpn_sock)) ++ return NULL; ++ ++ /* make sure that sk matches our stored transport socket */ ++ if (unlikely(!ovpn_sock->sock || sk != ovpn_sock->sock->sk)) ++ return NULL; ++ ++ return ovpn_sock->ovpn; ++} ++ ++/** ++ * ovpn_socket_new - create a new socket and initialize it ++ * @sock: the kernel socket to embed ++ * @peer: the peer reachable via this socket ++ * ++ * Return: an openvpn socket on success or a negative error code otherwise ++ */ ++struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer) ++{ ++ struct ovpn_socket *ovpn_sock; ++ int ret; ++ ++ ret = ovpn_socket_attach(sock, peer); ++ if (ret < 0 && ret != -EALREADY) ++ return ERR_PTR(ret); ++ ++ /* if this socket is already owned by this interface, just increase the ++ * refcounter and use it as expected. ++ * ++ * Since UDP sockets can be used to talk to multiple remote endpoints, ++ * openvpn normally instantiates only one socket and shares it among all ++ * its peers. For this reason, when we find out that a socket is already ++ * used for some other peer in *this* instance, we can happily increase ++ * its refcounter and use it normally. ++ */ ++ if (ret == -EALREADY) { ++ /* caller is expected to increase the sock refcounter before ++ * passing it to this function. For this reason we drop it if ++ * not needed, like when this socket is already owned. ++ */ ++ ovpn_sock = ovpn_socket_get(sock); ++ sockfd_put(sock); ++ return ovpn_sock; ++ } ++ ++ ovpn_sock = kzalloc(sizeof(*ovpn_sock), GFP_KERNEL); ++ if (!ovpn_sock) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ovpn_sock->sock = sock; ++ kref_init(&ovpn_sock->refcount); ++ ++ /* TCP sockets are per-peer, therefore they are linked to their unique ++ * peer ++ */ ++ if (sock->sk->sk_protocol == IPPROTO_TCP) { ++ ovpn_sock->peer = peer; ++ } else { ++ /* in UDP we only link the ovpn instance since the socket is ++ * shared among multiple peers ++ */ ++ ovpn_sock->ovpn = peer->ovpn; ++ } ++ ++ rcu_assign_sk_user_data(sock->sk, ovpn_sock); ++ ++ return ovpn_sock; ++err: ++ ovpn_socket_detach(sock); ++ return ERR_PTR(ret); ++} +diff --git a/drivers/net/ovpn/socket.h b/drivers/net/ovpn/socket.h +new file mode 100644 +index 000000000000..bc22fff453ad +--- /dev/null ++++ b/drivers/net/ovpn/socket.h +@@ -0,0 +1,55 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. 
++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#ifndef _NET_OVPN_SOCK_H_ ++#define _NET_OVPN_SOCK_H_ ++ ++#include ++#include ++#include ++ ++struct ovpn_struct; ++struct ovpn_peer; ++ ++/** ++ * struct ovpn_socket - a kernel socket referenced in the ovpn code ++ * @ovpn: ovpn instance owning this socket (UDP only) ++ * @peer: unique peer transmitting over this socket (TCP only) ++ * @sock: the low level sock object ++ * @refcount: amount of contexts currently referencing this object ++ * @work: member used to schedule release routine (it may block) ++ * @rcu: member used to schedule RCU destructor callback ++ */ ++struct ovpn_socket { ++ union { ++ struct ovpn_struct *ovpn; ++ struct ovpn_peer *peer; ++ }; ++ ++ struct socket *sock; ++ struct kref refcount; ++ struct work_struct work; ++ struct rcu_head rcu; ++}; ++ ++void ovpn_socket_release_kref(struct kref *kref); ++ ++/** ++ * ovpn_socket_put - decrease reference counter ++ * @sock: the socket whose reference counter should be decreased ++ */ ++static inline void ovpn_socket_put(struct ovpn_socket *sock) ++{ ++ kref_put(&sock->refcount, ovpn_socket_release_kref); ++} ++ ++struct ovpn_socket *ovpn_socket_new(struct socket *sock, ++ struct ovpn_peer *peer); ++ ++#endif /* _NET_OVPN_SOCK_H_ */ +diff --git a/drivers/net/ovpn/stats.c b/drivers/net/ovpn/stats.c +new file mode 100644 +index 000000000000..a383842c3449 +--- /dev/null ++++ b/drivers/net/ovpn/stats.c +@@ -0,0 +1,21 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ */ ++ ++#include ++ ++#include "stats.h" ++ ++void ovpn_peer_stats_init(struct ovpn_peer_stats *ps) ++{ ++ atomic64_set(&ps->rx.bytes, 0); ++ atomic64_set(&ps->rx.packets, 0); ++ ++ atomic64_set(&ps->tx.bytes, 0); ++ atomic64_set(&ps->tx.packets, 0); ++} +diff --git a/drivers/net/ovpn/stats.h b/drivers/net/ovpn/stats.h +new file mode 100644 +index 000000000000..868f49d25eaa +--- /dev/null ++++ b/drivers/net/ovpn/stats.h +@@ -0,0 +1,47 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. ++ * ++ * Author: James Yonan ++ * Antonio Quartulli ++ * Lev Stipakov ++ */ ++ ++#ifndef _NET_OVPN_OVPNSTATS_H_ ++#define _NET_OVPN_OVPNSTATS_H_ ++ ++/* one stat */ ++struct ovpn_peer_stat { ++ atomic64_t bytes; ++ atomic64_t packets; ++}; ++ ++/* rx and tx stats combined */ ++struct ovpn_peer_stats { ++ struct ovpn_peer_stat rx; ++ struct ovpn_peer_stat tx; ++}; ++ ++void ovpn_peer_stats_init(struct ovpn_peer_stats *ps); ++ ++static inline void ovpn_peer_stats_increment(struct ovpn_peer_stat *stat, ++ const unsigned int n) ++{ ++ atomic64_add(n, &stat->bytes); ++ atomic64_inc(&stat->packets); ++} ++ ++static inline void ovpn_peer_stats_increment_rx(struct ovpn_peer_stats *stats, ++ const unsigned int n) ++{ ++ ovpn_peer_stats_increment(&stats->rx, n); ++} ++ ++static inline void ovpn_peer_stats_increment_tx(struct ovpn_peer_stats *stats, ++ const unsigned int n) ++{ ++ ovpn_peer_stats_increment(&stats->tx, n); ++} ++ ++#endif /* _NET_OVPN_OVPNSTATS_H_ */ +diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c +new file mode 100644 +index 000000000000..d6f377a116ef +--- /dev/null ++++ b/drivers/net/ovpn/tcp.c +@@ -0,0 +1,506 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2019-2024 OpenVPN, Inc. 
++ * ++ * Author: Antonio Quartulli ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "ovpnstruct.h" ++#include "main.h" ++#include "io.h" ++#include "packet.h" ++#include "peer.h" ++#include "proto.h" ++#include "skb.h" ++#include "tcp.h" ++ ++static struct proto ovpn_tcp_prot __ro_after_init; ++static struct proto_ops ovpn_tcp_ops __ro_after_init; ++static struct proto ovpn_tcp6_prot; ++static struct proto_ops ovpn_tcp6_ops; ++static DEFINE_MUTEX(tcp6_prot_mutex); ++ ++static int ovpn_tcp_parse(struct strparser *strp, struct sk_buff *skb) ++{ ++ struct strp_msg *rxm = strp_msg(skb); ++ __be16 blen; ++ u16 len; ++ int err; ++ ++ /* when packets are written to the TCP stream, they are prepended with ++ * two bytes indicating the actual packet size. ++ * Here we read those two bytes and move the skb data pointer to the ++ * beginning of the packet ++ */ ++ ++ if (skb->len < rxm->offset + 2) ++ return 0; ++ ++ err = skb_copy_bits(skb, rxm->offset, &blen, sizeof(blen)); ++ if (err < 0) ++ return err; ++ ++ len = be16_to_cpu(blen); ++ if (len < 2) ++ return -EINVAL; ++ ++ return len + 2; ++} ++ ++/* queue skb for sending to userspace via recvmsg on the socket */ ++static void ovpn_tcp_to_userspace(struct ovpn_peer *peer, struct sock *sk, ++ struct sk_buff *skb) ++{ ++ skb_set_owner_r(skb, sk); ++ memset(skb->cb, 0, sizeof(skb->cb)); ++ skb_queue_tail(&peer->tcp.user_queue, skb); ++ peer->tcp.sk_cb.sk_data_ready(sk); ++} ++ ++static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb) ++{ ++ struct ovpn_peer *peer = container_of(strp, struct ovpn_peer, tcp.strp); ++ struct strp_msg *msg = strp_msg(skb); ++ size_t pkt_len = msg->full_len - 2; ++ size_t off = msg->offset + 2; ++ ++ /* ensure skb->data points to the beginning of the openvpn packet */ ++ if (!pskb_pull(skb, off)) { ++ net_warn_ratelimited("%s: packet too small\n", ++ peer->ovpn->dev->name); ++ goto err; ++ } ++ ++ /* strparser does not trim the skb for us, therefore we do it now */ ++ if (pskb_trim(skb, pkt_len) != 0) { ++ net_warn_ratelimited("%s: trimming skb failed\n", ++ peer->ovpn->dev->name); ++ goto err; ++ } ++ ++ /* we need the first byte of data to be accessible ++ * to extract the opcode and the key ID later on ++ */ ++ if (!pskb_may_pull(skb, 1)) { ++ net_warn_ratelimited("%s: packet too small to fetch opcode\n", ++ peer->ovpn->dev->name); ++ goto err; ++ } ++ ++ /* DATA_V2 packets are handled in kernel, the rest goes to user space */ ++ if (likely(ovpn_opcode_from_skb(skb, 0) == OVPN_DATA_V2)) { ++ /* hold reference to peer as required by ovpn_recv(). 
++ * ++ * NOTE: in this context we should already be holding a ++ * reference to this peer, therefore ovpn_peer_hold() is ++ * not expected to fail ++ */ ++ if (WARN_ON(!ovpn_peer_hold(peer))) ++ goto err; ++ ++ ovpn_recv(peer, skb); ++ } else { ++ /* The packet size header must be there when sending the packet ++ * to userspace, therefore we put it back ++ */ ++ skb_push(skb, 2); ++ ovpn_tcp_to_userspace(peer, strp->sk, skb); ++ } ++ ++ return; ++err: ++ netdev_err(peer->ovpn->dev, ++ "cannot process incoming TCP data for peer %u\n", peer->id); ++ dev_core_stats_rx_dropped_inc(peer->ovpn->dev); ++ kfree_skb(skb); ++ ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR); ++} ++ ++static int ovpn_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ++ int flags, int *addr_len) ++{ ++ int err = 0, off, copied = 0, ret; ++ struct ovpn_socket *sock; ++ struct ovpn_peer *peer; ++ struct sk_buff *skb; ++ ++ rcu_read_lock(); ++ sock = rcu_dereference_sk_user_data(sk); ++ if (!sock || !sock->peer) { ++ rcu_read_unlock(); ++ return -EBADF; ++ } ++ /* we take a reference to the peer linked to this TCP socket, because ++ * in turn the peer holds a reference to the socket itself. ++ * By doing so we also ensure that the peer stays alive along with ++ * the socket while executing this function ++ */ ++ ovpn_peer_hold(sock->peer); ++ peer = sock->peer; ++ rcu_read_unlock(); ++ ++ skb = __skb_recv_datagram(sk, &peer->tcp.user_queue, flags, &off, &err); ++ if (!skb) { ++ if (err == -EAGAIN && sk->sk_shutdown & RCV_SHUTDOWN) { ++ ret = 0; ++ goto out; ++ } ++ ret = err; ++ goto out; ++ } ++ ++ copied = len; ++ if (copied > skb->len) ++ copied = skb->len; ++ else if (copied < skb->len) ++ msg->msg_flags |= MSG_TRUNC; ++ ++ err = skb_copy_datagram_msg(skb, 0, msg, copied); ++ if (unlikely(err)) { ++ kfree_skb(skb); ++ ret = err; ++ goto out; ++ } ++ ++ if (flags & MSG_TRUNC) ++ copied = skb->len; ++ kfree_skb(skb); ++ ret = copied; ++out: ++ ovpn_peer_put(peer); ++ return ret; ++} ++ ++void ovpn_tcp_socket_detach(struct socket *sock) ++{ ++ struct ovpn_socket *ovpn_sock; ++ struct ovpn_peer *peer; ++ ++ if (!sock) ++ return; ++ ++ rcu_read_lock(); ++ ovpn_sock = rcu_dereference_sk_user_data(sock->sk); ++ ++ if (!ovpn_sock->peer) { ++ rcu_read_unlock(); ++ return; ++ } ++ ++ peer = ovpn_sock->peer; ++ strp_stop(&peer->tcp.strp); ++ ++ skb_queue_purge(&peer->tcp.user_queue); ++ ++ /* restore CBs that were saved in ovpn_sock_set_tcp_cb() */ ++ sock->sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready; ++ sock->sk->sk_write_space = peer->tcp.sk_cb.sk_write_space; ++ sock->sk->sk_prot = peer->tcp.sk_cb.prot; ++ sock->sk->sk_socket->ops = peer->tcp.sk_cb.ops; ++ rcu_assign_sk_user_data(sock->sk, NULL); ++ ++ rcu_read_unlock(); ++ ++ /* cancel any ongoing work. 
Done after removing the CBs so that these ++ * workers cannot be re-armed ++ */ ++ cancel_work_sync(&peer->tcp.tx_work); ++ strp_done(&peer->tcp.strp); ++} ++ ++static void ovpn_tcp_send_sock(struct ovpn_peer *peer) ++{ ++ struct sk_buff *skb = peer->tcp.out_msg.skb; ++ ++ if (!skb) ++ return; ++ ++ if (peer->tcp.tx_in_progress) ++ return; ++ ++ peer->tcp.tx_in_progress = true; ++ ++ do { ++ int ret = skb_send_sock_locked(peer->sock->sock->sk, skb, ++ peer->tcp.out_msg.offset, ++ peer->tcp.out_msg.len); ++ if (unlikely(ret < 0)) { ++ if (ret == -EAGAIN) ++ goto out; ++ ++ net_warn_ratelimited("%s: TCP error to peer %u: %d\n", ++ peer->ovpn->dev->name, peer->id, ++ ret); ++ ++ /* in case of TCP error we can't recover the VPN ++ * stream therefore we abort the connection ++ */ ++ ovpn_peer_del(peer, ++ OVPN_DEL_PEER_REASON_TRANSPORT_ERROR); ++ break; ++ } ++ ++ peer->tcp.out_msg.len -= ret; ++ peer->tcp.out_msg.offset += ret; ++ } while (peer->tcp.out_msg.len > 0); ++ ++ if (!peer->tcp.out_msg.len) ++ dev_sw_netstats_tx_add(peer->ovpn->dev, 1, skb->len); ++ ++ kfree_skb(peer->tcp.out_msg.skb); ++ peer->tcp.out_msg.skb = NULL; ++ peer->tcp.out_msg.len = 0; ++ peer->tcp.out_msg.offset = 0; ++ ++out: ++ peer->tcp.tx_in_progress = false; ++} ++ ++static void ovpn_tcp_tx_work(struct work_struct *work) ++{ ++ struct ovpn_peer *peer; ++ ++ peer = container_of(work, struct ovpn_peer, tcp.tx_work); ++ ++ lock_sock(peer->sock->sock->sk); ++ ovpn_tcp_send_sock(peer); ++ release_sock(peer->sock->sock->sk); ++} ++ ++void ovpn_tcp_send_sock_skb(struct ovpn_peer *peer, struct sk_buff *skb) ++{ ++ if (peer->tcp.out_msg.skb) ++ return; ++ ++ peer->tcp.out_msg.skb = skb; ++ peer->tcp.out_msg.len = skb->len; ++ peer->tcp.out_msg.offset = 0; ++ ++ ovpn_tcp_send_sock(peer); ++} ++ ++static int ovpn_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) ++{ ++ struct ovpn_socket *sock; ++ int ret, linear = PAGE_SIZE; ++ struct ovpn_peer *peer; ++ struct sk_buff *skb; ++ ++ rcu_read_lock(); ++ sock = rcu_dereference_sk_user_data(sk); ++ peer = sock->peer; ++ if (unlikely(!ovpn_peer_hold(peer))) { ++ rcu_read_unlock(); ++ return -EIO; ++ } ++ rcu_read_unlock(); ++ ++ if (msg->msg_flags & ~MSG_DONTWAIT) { ++ ret = -EOPNOTSUPP; ++ goto peer_free; ++ } ++ ++ lock_sock(sk); ++ ++ if (peer->tcp.out_msg.skb) { ++ ret = -EAGAIN; ++ goto unlock; ++ } ++ ++ if (size < linear) ++ linear = size; ++ ++ skb = sock_alloc_send_pskb(sk, linear, size - linear, ++ msg->msg_flags & MSG_DONTWAIT, &ret, 0); ++ if (!skb) { ++ net_err_ratelimited("%s: skb alloc failed: %d\n", ++ sock->peer->ovpn->dev->name, ret); ++ goto unlock; ++ } ++ ++ skb_put(skb, linear); ++ skb->len = size; ++ skb->data_len = size - linear; ++ ++ ret = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); ++ if (ret) { ++ kfree_skb(skb); ++ net_err_ratelimited("%s: skb copy from iter failed: %d\n", ++ sock->peer->ovpn->dev->name, ret); ++ goto unlock; ++ } ++ ++ ovpn_tcp_send_sock_skb(sock->peer, skb); ++ ret = size; ++unlock: ++ release_sock(sk); ++peer_free: ++ ovpn_peer_put(peer); ++ return ret; ++} ++ ++static void ovpn_tcp_data_ready(struct sock *sk) ++{ ++ struct ovpn_socket *sock; ++ ++ trace_sk_data_ready(sk); ++ ++ rcu_read_lock(); ++ sock = rcu_dereference_sk_user_data(sk); ++ strp_data_ready(&sock->peer->tcp.strp); ++ rcu_read_unlock(); ++} ++ ++static void ovpn_tcp_write_space(struct sock *sk) ++{ ++ struct ovpn_socket *sock; ++ ++ rcu_read_lock(); ++ sock = rcu_dereference_sk_user_data(sk); ++ schedule_work(&sock->peer->tcp.tx_work); ++ 
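++	/* chain to the write_space callback that was saved in
++	 * ovpn_tcp_socket_attach(), so the original socket user keeps
++	 * receiving write-space notifications
++	 */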
sock->peer->tcp.sk_cb.sk_write_space(sk); ++ rcu_read_unlock(); ++} ++ ++static void ovpn_tcp_build_protos(struct proto *new_prot, ++ struct proto_ops *new_ops, ++ const struct proto *orig_prot, ++ const struct proto_ops *orig_ops); ++ ++/* Set TCP encapsulation callbacks */ ++int ovpn_tcp_socket_attach(struct socket *sock, struct ovpn_peer *peer) ++{ ++ struct strp_callbacks cb = { ++ .rcv_msg = ovpn_tcp_rcv, ++ .parse_msg = ovpn_tcp_parse, ++ }; ++ int ret; ++ ++ /* make sure no pre-existing encapsulation handler exists */ ++ if (sock->sk->sk_user_data) ++ return -EBUSY; ++ ++ /* sanity check */ ++ if (sock->sk->sk_protocol != IPPROTO_TCP) { ++ netdev_err(peer->ovpn->dev, ++ "provided socket is not TCP as expected\n"); ++ return -EINVAL; ++ } ++ ++ /* only a fully connected socket are expected. Connection should be ++ * handled in userspace ++ */ ++ if (sock->sk->sk_state != TCP_ESTABLISHED) { ++ netdev_err(peer->ovpn->dev, ++ "provided TCP socket is not in ESTABLISHED state: %d\n", ++ sock->sk->sk_state); ++ return -EINVAL; ++ } ++ ++ lock_sock(sock->sk); ++ ++ ret = strp_init(&peer->tcp.strp, sock->sk, &cb); ++ if (ret < 0) { ++ DEBUG_NET_WARN_ON_ONCE(1); ++ release_sock(sock->sk); ++ return ret; ++ } ++ ++ INIT_WORK(&peer->tcp.tx_work, ovpn_tcp_tx_work); ++ __sk_dst_reset(sock->sk); ++ skb_queue_head_init(&peer->tcp.user_queue); ++ ++ /* save current CBs so that they can be restored upon socket release */ ++ peer->tcp.sk_cb.sk_data_ready = sock->sk->sk_data_ready; ++ peer->tcp.sk_cb.sk_write_space = sock->sk->sk_write_space; ++ peer->tcp.sk_cb.prot = sock->sk->sk_prot; ++ peer->tcp.sk_cb.ops = sock->sk->sk_socket->ops; ++ ++ /* assign our static CBs and prot/ops */ ++ sock->sk->sk_data_ready = ovpn_tcp_data_ready; ++ sock->sk->sk_write_space = ovpn_tcp_write_space; ++ ++ if (sock->sk->sk_family == AF_INET) { ++ sock->sk->sk_prot = &ovpn_tcp_prot; ++ sock->sk->sk_socket->ops = &ovpn_tcp_ops; ++ } else { ++ mutex_lock(&tcp6_prot_mutex); ++ if (!ovpn_tcp6_prot.recvmsg) ++ ovpn_tcp_build_protos(&ovpn_tcp6_prot, &ovpn_tcp6_ops, ++ sock->sk->sk_prot, ++ sock->sk->sk_socket->ops); ++ mutex_unlock(&tcp6_prot_mutex); ++ ++ sock->sk->sk_prot = &ovpn_tcp6_prot; ++ sock->sk->sk_socket->ops = &ovpn_tcp6_ops; ++ } ++ ++ /* avoid using task_frag */ ++ sock->sk->sk_allocation = GFP_ATOMIC; ++ sock->sk->sk_use_task_frag = false; ++ ++ /* enqueue the RX worker */ ++ strp_check_rcv(&peer->tcp.strp); ++ ++ release_sock(sock->sk); ++ return 0; ++} ++ ++static void ovpn_tcp_close(struct sock *sk, long timeout) ++{ ++ struct ovpn_socket *sock; ++ ++ rcu_read_lock(); ++ sock = rcu_dereference_sk_user_data(sk); ++ ++ strp_stop(&sock->peer->tcp.strp); ++ barrier(); ++ ++ tcp_close(sk, timeout); ++ ++ ovpn_peer_del(sock->peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR); ++ rcu_read_unlock(); ++} ++ ++static __poll_t ovpn_tcp_poll(struct file *file, struct socket *sock, ++ poll_table *wait) ++{ ++ __poll_t mask = datagram_poll(file, sock, wait); ++ struct ovpn_socket *ovpn_sock; ++ ++ rcu_read_lock(); ++ ovpn_sock = rcu_dereference_sk_user_data(sock->sk); ++ if (!skb_queue_empty(&ovpn_sock->peer->tcp.user_queue)) ++ mask |= EPOLLIN | EPOLLRDNORM; ++ rcu_read_unlock(); ++ ++ return mask; ++} ++ ++static void ovpn_tcp_build_protos(struct proto *new_prot, ++ struct proto_ops *new_ops, ++ const struct proto *orig_prot, ++ const struct proto_ops *orig_ops) ++{ ++ memcpy(new_prot, orig_prot, sizeof(*new_prot)); ++ memcpy(new_ops, orig_ops, sizeof(*new_ops)); ++ new_prot->recvmsg = ovpn_tcp_recvmsg; ++ new_prot->sendmsg = 
ovpn_tcp_sendmsg; ++ new_prot->close = ovpn_tcp_close; ++ new_ops->poll = ovpn_tcp_poll; ++} ++ ++/* Initialize TCP static objects */ ++void __init ovpn_tcp_init(void) ++{ ++ ovpn_tcp_build_protos(&ovpn_tcp_prot, &ovpn_tcp_ops, &tcp_prot, ++ &inet_stream_ops); ++} +diff --git a/drivers/net/ovpn/tcp.h b/drivers/net/ovpn/tcp.h +new file mode 100644 +index 000000000000..fb2cd0b606b4 +--- /dev/null ++++ b/drivers/net/ovpn/tcp.h +@@ -0,0 +1,44 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2019-2024 OpenVPN, Inc. ++ * ++ * Author: Antonio Quartulli ++ */ ++ ++#ifndef _NET_OVPN_TCP_H_ ++#define _NET_OVPN_TCP_H_ ++ ++#include ++#include ++#include ++ ++#include "peer.h" ++#include "skb.h" ++#include "socket.h" ++ ++void __init ovpn_tcp_init(void); ++ ++int ovpn_tcp_socket_attach(struct socket *sock, struct ovpn_peer *peer); ++void ovpn_tcp_socket_detach(struct socket *sock); ++void ovpn_tcp_send_sock_skb(struct ovpn_peer *peer, struct sk_buff *skb); ++ ++/* Prepare skb and enqueue it for sending to peer. ++ * ++ * Preparation consist in prepending the skb payload with its size. ++ * Required by the OpenVPN protocol in order to extract packets from ++ * the TCP stream on the receiver side. ++ */ ++static inline void ovpn_tcp_send_skb(struct ovpn_peer *peer, ++ struct sk_buff *skb) ++{ ++ u16 len = skb->len; ++ ++ *(__be16 *)__skb_push(skb, sizeof(u16)) = htons(len); ++ ++ bh_lock_sock(peer->sock->sock->sk); ++ ovpn_tcp_send_sock_skb(peer, skb); ++ bh_unlock_sock(peer->sock->sock->sk); ++} ++ ++#endif /* _NET_OVPN_TCP_H_ */ +diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c +new file mode 100644 +index 000000000000..d1e88ae83843 +--- /dev/null ++++ b/drivers/net/ovpn/udp.c +@@ -0,0 +1,406 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2019-2024 OpenVPN, Inc. ++ * ++ * Author: Antonio Quartulli ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "ovpnstruct.h" ++#include "main.h" ++#include "bind.h" ++#include "io.h" ++#include "peer.h" ++#include "proto.h" ++#include "socket.h" ++#include "udp.h" ++ ++/** ++ * ovpn_udp_encap_recv - Start processing a received UDP packet. ++ * @sk: socket over which the packet was received ++ * @skb: the received packet ++ * ++ * If the first byte of the payload is DATA_V2, the packet is further processed, ++ * otherwise it is forwarded to the UDP stack for delivery to user space. ++ * ++ * Return: ++ * 0 if skb was consumed or dropped ++ * >0 if skb should be passed up to userspace as UDP (packet not consumed) ++ * <0 if skb should be resubmitted as proto -N (packet not consumed) ++ */ ++static int ovpn_udp_encap_recv(struct sock *sk, struct sk_buff *skb) ++{ ++ struct ovpn_peer *peer = NULL; ++ struct ovpn_struct *ovpn; ++ u32 peer_id; ++ u8 opcode; ++ ++ ovpn = ovpn_from_udp_sock(sk); ++ if (unlikely(!ovpn)) { ++ net_err_ratelimited("%s: cannot obtain ovpn object from UDP socket\n", ++ __func__); ++ goto drop_noovpn; ++ } ++ ++ /* Make sure the first 4 bytes of the skb data buffer after the UDP ++ * header are accessible. ++ * They are required to fetch the OP code, the key ID and the peer ID. 
++ */ ++ if (unlikely(!pskb_may_pull(skb, sizeof(struct udphdr) + ++ OVPN_OP_SIZE_V2))) { ++ net_dbg_ratelimited("%s: packet too small\n", __func__); ++ goto drop; ++ } ++ ++ opcode = ovpn_opcode_from_skb(skb, sizeof(struct udphdr)); ++ if (unlikely(opcode != OVPN_DATA_V2)) { ++ /* DATA_V1 is not supported */ ++ if (opcode == OVPN_DATA_V1) ++ goto drop; ++ ++ /* unknown or control packet: let it bubble up to userspace */ ++ return 1; ++ } ++ ++ peer_id = ovpn_peer_id_from_skb(skb, sizeof(struct udphdr)); ++ /* some OpenVPN server implementations send data packets with the ++ * peer-id set to undef. In this case we skip the peer lookup by peer-id ++ * and we try with the transport address ++ */ ++ if (peer_id != OVPN_PEER_ID_UNDEF) { ++ peer = ovpn_peer_get_by_id(ovpn, peer_id); ++ if (!peer) { ++ net_err_ratelimited("%s: received data from unknown peer (id: %d)\n", ++ __func__, peer_id); ++ goto drop; ++ } ++ } ++ ++ if (!peer) { ++ /* data packet with undef peer-id */ ++ peer = ovpn_peer_get_by_transp_addr(ovpn, skb); ++ if (unlikely(!peer)) { ++ net_dbg_ratelimited("%s: received data with undef peer-id from unknown source\n", ++ __func__); ++ goto drop; ++ } ++ } ++ ++ /* pop off outer UDP header */ ++ __skb_pull(skb, sizeof(struct udphdr)); ++ ovpn_recv(peer, skb); ++ return 0; ++ ++drop: ++ if (peer) ++ ovpn_peer_put(peer); ++ dev_core_stats_rx_dropped_inc(ovpn->dev); ++drop_noovpn: ++ kfree_skb(skb); ++ return 0; ++} ++ ++/** ++ * ovpn_udp4_output - send IPv4 packet over udp socket ++ * @ovpn: the openvpn instance ++ * @bind: the binding related to the destination peer ++ * @cache: dst cache ++ * @sk: the socket to send the packet over ++ * @skb: the packet to send ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++static int ovpn_udp4_output(struct ovpn_struct *ovpn, struct ovpn_bind *bind, ++ struct dst_cache *cache, struct sock *sk, ++ struct sk_buff *skb) ++{ ++ struct rtable *rt; ++ struct flowi4 fl = { ++ .saddr = bind->local.ipv4.s_addr, ++ .daddr = bind->remote.in4.sin_addr.s_addr, ++ .fl4_sport = inet_sk(sk)->inet_sport, ++ .fl4_dport = bind->remote.in4.sin_port, ++ .flowi4_proto = sk->sk_protocol, ++ .flowi4_mark = sk->sk_mark, ++ }; ++ int ret; ++ ++ local_bh_disable(); ++ rt = dst_cache_get_ip4(cache, &fl.saddr); ++ if (rt) ++ goto transmit; ++ ++ if (unlikely(!inet_confirm_addr(sock_net(sk), NULL, 0, fl.saddr, ++ RT_SCOPE_HOST))) { ++ /* we may end up here when the cached address is not usable ++ * anymore. 
In this case we reset address/cache and perform a ++ * new look up ++ */ ++ fl.saddr = 0; ++ bind->local.ipv4.s_addr = 0; ++ dst_cache_reset(cache); ++ } ++ ++ rt = ip_route_output_flow(sock_net(sk), &fl, sk); ++ if (IS_ERR(rt) && PTR_ERR(rt) == -EINVAL) { ++ fl.saddr = 0; ++ bind->local.ipv4.s_addr = 0; ++ dst_cache_reset(cache); ++ ++ rt = ip_route_output_flow(sock_net(sk), &fl, sk); ++ } ++ ++ if (IS_ERR(rt)) { ++ ret = PTR_ERR(rt); ++ net_dbg_ratelimited("%s: no route to host %pISpc: %d\n", ++ ovpn->dev->name, &bind->remote.in4, ret); ++ goto err; ++ } ++ dst_cache_set_ip4(cache, &rt->dst, fl.saddr); ++ ++transmit: ++ udp_tunnel_xmit_skb(rt, sk, skb, fl.saddr, fl.daddr, 0, ++ ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, ++ fl.fl4_dport, false, sk->sk_no_check_tx); ++ ret = 0; ++err: ++ local_bh_enable(); ++ return ret; ++} ++ ++#if IS_ENABLED(CONFIG_IPV6) ++/** ++ * ovpn_udp6_output - send IPv6 packet over udp socket ++ * @ovpn: the openvpn instance ++ * @bind: the binding related to the destination peer ++ * @cache: dst cache ++ * @sk: the socket to send the packet over ++ * @skb: the packet to send ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++static int ovpn_udp6_output(struct ovpn_struct *ovpn, struct ovpn_bind *bind, ++ struct dst_cache *cache, struct sock *sk, ++ struct sk_buff *skb) ++{ ++ struct dst_entry *dst; ++ int ret; ++ ++ struct flowi6 fl = { ++ .saddr = bind->local.ipv6, ++ .daddr = bind->remote.in6.sin6_addr, ++ .fl6_sport = inet_sk(sk)->inet_sport, ++ .fl6_dport = bind->remote.in6.sin6_port, ++ .flowi6_proto = sk->sk_protocol, ++ .flowi6_mark = sk->sk_mark, ++ .flowi6_oif = bind->remote.in6.sin6_scope_id, ++ }; ++ ++ local_bh_disable(); ++ dst = dst_cache_get_ip6(cache, &fl.saddr); ++ if (dst) ++ goto transmit; ++ ++ if (unlikely(!ipv6_chk_addr(sock_net(sk), &fl.saddr, NULL, 0))) { ++ /* we may end up here when the cached address is not usable ++ * anymore. In this case we reset address/cache and perform a ++ * new look up ++ */ ++ fl.saddr = in6addr_any; ++ bind->local.ipv6 = in6addr_any; ++ dst_cache_reset(cache); ++ } ++ ++ dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sk), sk, &fl, NULL); ++ if (IS_ERR(dst)) { ++ ret = PTR_ERR(dst); ++ net_dbg_ratelimited("%s: no route to host %pISpc: %d\n", ++ ovpn->dev->name, &bind->remote.in6, ret); ++ goto err; ++ } ++ dst_cache_set_ip6(cache, dst, &fl.saddr); ++ ++transmit: ++ udp_tunnel6_xmit_skb(dst, sk, skb, skb->dev, &fl.saddr, &fl.daddr, 0, ++ ip6_dst_hoplimit(dst), 0, fl.fl6_sport, ++ fl.fl6_dport, udp_get_no_check6_tx(sk)); ++ ret = 0; ++err: ++ local_bh_enable(); ++ return ret; ++} ++#endif ++ ++/** ++ * ovpn_udp_output - transmit skb using udp-tunnel ++ * @ovpn: the openvpn instance ++ * @bind: the binding related to the destination peer ++ * @cache: dst cache ++ * @sk: the socket to send the packet over ++ * @skb: the packet to send ++ * ++ * rcu_read_lock should be held on entry. ++ * On return, the skb is consumed. 
++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++static int ovpn_udp_output(struct ovpn_struct *ovpn, struct ovpn_bind *bind, ++ struct dst_cache *cache, struct sock *sk, ++ struct sk_buff *skb) ++{ ++ int ret; ++ ++ /* set sk to null if skb is already orphaned */ ++ if (!skb->destructor) ++ skb->sk = NULL; ++ ++ /* always permit openvpn-created packets to be (outside) fragmented */ ++ skb->ignore_df = 1; ++ ++ switch (bind->remote.in4.sin_family) { ++ case AF_INET: ++ ret = ovpn_udp4_output(ovpn, bind, cache, sk, skb); ++ break; ++#if IS_ENABLED(CONFIG_IPV6) ++ case AF_INET6: ++ ret = ovpn_udp6_output(ovpn, bind, cache, sk, skb); ++ break; ++#endif ++ default: ++ ret = -EAFNOSUPPORT; ++ break; ++ } ++ ++ return ret; ++} ++ ++/** ++ * ovpn_udp_send_skb - prepare skb and send it over via UDP ++ * @ovpn: the openvpn instance ++ * @peer: the destination peer ++ * @skb: the packet to send ++ */ ++void ovpn_udp_send_skb(struct ovpn_struct *ovpn, struct ovpn_peer *peer, ++ struct sk_buff *skb) ++{ ++ struct ovpn_bind *bind; ++ unsigned int pkt_len; ++ struct socket *sock; ++ int ret = -1; ++ ++ skb->dev = ovpn->dev; ++ /* no checksum performed at this layer */ ++ skb->ip_summed = CHECKSUM_NONE; ++ ++ /* get socket info */ ++ sock = peer->sock->sock; ++ if (unlikely(!sock)) { ++ net_warn_ratelimited("%s: no sock for remote peer\n", __func__); ++ goto out; ++ } ++ ++ rcu_read_lock(); ++ /* get binding */ ++ bind = rcu_dereference(peer->bind); ++ if (unlikely(!bind)) { ++ net_warn_ratelimited("%s: no bind for remote peer\n", __func__); ++ goto out_unlock; ++ } ++ ++ /* crypto layer -> transport (UDP) */ ++ pkt_len = skb->len; ++ ret = ovpn_udp_output(ovpn, bind, &peer->dst_cache, sock->sk, skb); ++ ++out_unlock: ++ rcu_read_unlock(); ++out: ++ if (unlikely(ret < 0)) { ++ dev_core_stats_tx_dropped_inc(ovpn->dev); ++ kfree_skb(skb); ++ return; ++ } ++ ++ dev_sw_netstats_tx_add(ovpn->dev, 1, pkt_len); ++} ++ ++/** ++ * ovpn_udp_socket_attach - set udp-tunnel CBs on socket and link it to ovpn ++ * @sock: socket to configure ++ * @ovpn: the openvp instance to link ++ * ++ * After invoking this function, the sock will be controlled by ovpn so that ++ * any incoming packet may be processed by ovpn first. ++ * ++ * Return: 0 on success or a negative error code otherwise ++ */ ++int ovpn_udp_socket_attach(struct socket *sock, struct ovpn_struct *ovpn) ++{ ++ struct udp_tunnel_sock_cfg cfg = { ++ .encap_type = UDP_ENCAP_OVPNINUDP, ++ .encap_rcv = ovpn_udp_encap_recv, ++ }; ++ struct ovpn_socket *old_data; ++ int ret; ++ ++ /* sanity check */ ++ if (sock->sk->sk_protocol != IPPROTO_UDP) { ++ DEBUG_NET_WARN_ON_ONCE(1); ++ return -EINVAL; ++ } ++ ++ /* make sure no pre-existing encapsulation handler exists */ ++ rcu_read_lock(); ++ old_data = rcu_dereference_sk_user_data(sock->sk); ++ if (!old_data) { ++ /* socket is currently unused - we can take it */ ++ rcu_read_unlock(); ++ setup_udp_tunnel_sock(sock_net(sock->sk), sock, &cfg); ++ return 0; ++ } ++ ++ /* socket is in use. We need to understand if it's owned by this ovpn ++ * instance or by something else. ++ * In the former case, we can increase the refcounter and happily ++ * use it, because the same UDP socket is expected to be shared among ++ * different peers. 
++ * ++ * Unlikely TCP, a single UDP socket can be used to talk to many remote ++ * hosts and therefore openvpn instantiates one only for all its peers ++ */ ++ if ((READ_ONCE(udp_sk(sock->sk)->encap_type) == UDP_ENCAP_OVPNINUDP) && ++ old_data->ovpn == ovpn) { ++ netdev_dbg(ovpn->dev, ++ "%s: provided socket already owned by this interface\n", ++ __func__); ++ ret = -EALREADY; ++ } else { ++ netdev_err(ovpn->dev, ++ "%s: provided socket already taken by other user\n", ++ __func__); ++ ret = -EBUSY; ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++/** ++ * ovpn_udp_socket_detach - clean udp-tunnel status for this socket ++ * @sock: the socket to clean ++ */ ++void ovpn_udp_socket_detach(struct socket *sock) ++{ ++ struct udp_tunnel_sock_cfg cfg = { }; ++ ++ setup_udp_tunnel_sock(sock_net(sock->sk), sock, &cfg); ++} +diff --git a/drivers/net/ovpn/udp.h b/drivers/net/ovpn/udp.h +new file mode 100644 +index 000000000000..fecb68464896 +--- /dev/null ++++ b/drivers/net/ovpn/udp.h +@@ -0,0 +1,26 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* OpenVPN data channel offload ++ * ++ * Copyright (C) 2019-2024 OpenVPN, Inc. ++ * ++ * Author: Antonio Quartulli ++ */ ++ ++#ifndef _NET_OVPN_UDP_H_ ++#define _NET_OVPN_UDP_H_ ++ ++#include ++#include ++ ++struct ovpn_peer; ++struct ovpn_struct; ++struct sk_buff; ++struct socket; ++ ++int ovpn_udp_socket_attach(struct socket *sock, struct ovpn_struct *ovpn); ++void ovpn_udp_socket_detach(struct socket *sock); ++void ovpn_udp_send_skb(struct ovpn_struct *ovpn, struct ovpn_peer *peer, ++ struct sk_buff *skb); ++struct ovpn_struct *ovpn_from_udp_sock(struct sock *sk); ++ ++#endif /* _NET_OVPN_UDP_H_ */ +diff --git a/include/net/netlink.h b/include/net/netlink.h +index db6af207287c..2dc671c977ff 100644 +--- a/include/net/netlink.h ++++ b/include/net/netlink.h +@@ -469,6 +469,7 @@ struct nla_policy { + .max = _len \ + } + #define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) ++#define NLA_POLICY_MAX_LEN(_len) NLA_POLICY_MAX(NLA_BINARY, _len) + + /** + * struct nl_info - netlink source information +diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h +index 6dc258993b17..9a5419d60100 100644 +--- a/include/uapi/linux/if_link.h ++++ b/include/uapi/linux/if_link.h +@@ -1959,4 +1959,19 @@ enum { + + #define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1) + ++/* OVPN section */ ++ ++enum ovpn_mode { ++ OVPN_MODE_P2P, ++ OVPN_MODE_MP, ++}; ++ ++enum { ++ IFLA_OVPN_UNSPEC, ++ IFLA_OVPN_MODE, ++ __IFLA_OVPN_MAX, ++}; ++ ++#define IFLA_OVPN_MAX (__IFLA_OVPN_MAX - 1) ++ + #endif /* _UAPI_LINUX_IF_LINK_H */ +diff --git a/include/uapi/linux/ovpn.h b/include/uapi/linux/ovpn.h +new file mode 100644 +index 000000000000..7bac0803cd9f +--- /dev/null ++++ b/include/uapi/linux/ovpn.h +@@ -0,0 +1,109 @@ ++/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ ++/* Do not edit directly, auto-generated from: */ ++/* Documentation/netlink/specs/ovpn.yaml */ ++/* YNL-GEN uapi header */ ++ ++#ifndef _UAPI_LINUX_OVPN_H ++#define _UAPI_LINUX_OVPN_H ++ ++#define OVPN_FAMILY_NAME "ovpn" ++#define OVPN_FAMILY_VERSION 1 ++ ++#define OVPN_NONCE_TAIL_SIZE 8 ++ ++enum ovpn_cipher_alg { ++ OVPN_CIPHER_ALG_NONE, ++ OVPN_CIPHER_ALG_AES_GCM, ++ OVPN_CIPHER_ALG_CHACHA20_POLY1305, ++}; ++ ++enum ovpn_del_peer_reason { ++ OVPN_DEL_PEER_REASON_TEARDOWN, ++ OVPN_DEL_PEER_REASON_USERSPACE, ++ OVPN_DEL_PEER_REASON_EXPIRED, ++ OVPN_DEL_PEER_REASON_TRANSPORT_ERROR, ++ OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT, ++}; ++ ++enum ovpn_key_slot { ++ 
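++	/* a second slot allows a new key to be installed while the current
++	 * one is still in use; OVPN_CMD_KEY_SWAP exchanges the two slots
++	 */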
OVPN_KEY_SLOT_PRIMARY, ++ OVPN_KEY_SLOT_SECONDARY, ++}; ++ ++enum { ++ OVPN_A_PEER_ID = 1, ++ OVPN_A_PEER_REMOTE_IPV4, ++ OVPN_A_PEER_REMOTE_IPV6, ++ OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, ++ OVPN_A_PEER_REMOTE_PORT, ++ OVPN_A_PEER_SOCKET, ++ OVPN_A_PEER_VPN_IPV4, ++ OVPN_A_PEER_VPN_IPV6, ++ OVPN_A_PEER_LOCAL_IPV4, ++ OVPN_A_PEER_LOCAL_IPV6, ++ OVPN_A_PEER_LOCAL_PORT, ++ OVPN_A_PEER_KEEPALIVE_INTERVAL, ++ OVPN_A_PEER_KEEPALIVE_TIMEOUT, ++ OVPN_A_PEER_DEL_REASON, ++ OVPN_A_PEER_VPN_RX_BYTES, ++ OVPN_A_PEER_VPN_TX_BYTES, ++ OVPN_A_PEER_VPN_RX_PACKETS, ++ OVPN_A_PEER_VPN_TX_PACKETS, ++ OVPN_A_PEER_LINK_RX_BYTES, ++ OVPN_A_PEER_LINK_TX_BYTES, ++ OVPN_A_PEER_LINK_RX_PACKETS, ++ OVPN_A_PEER_LINK_TX_PACKETS, ++ ++ __OVPN_A_PEER_MAX, ++ OVPN_A_PEER_MAX = (__OVPN_A_PEER_MAX - 1) ++}; ++ ++enum { ++ OVPN_A_KEYCONF_PEER_ID = 1, ++ OVPN_A_KEYCONF_SLOT, ++ OVPN_A_KEYCONF_KEY_ID, ++ OVPN_A_KEYCONF_CIPHER_ALG, ++ OVPN_A_KEYCONF_ENCRYPT_DIR, ++ OVPN_A_KEYCONF_DECRYPT_DIR, ++ ++ __OVPN_A_KEYCONF_MAX, ++ OVPN_A_KEYCONF_MAX = (__OVPN_A_KEYCONF_MAX - 1) ++}; ++ ++enum { ++ OVPN_A_KEYDIR_CIPHER_KEY = 1, ++ OVPN_A_KEYDIR_NONCE_TAIL, ++ ++ __OVPN_A_KEYDIR_MAX, ++ OVPN_A_KEYDIR_MAX = (__OVPN_A_KEYDIR_MAX - 1) ++}; ++ ++enum { ++ OVPN_A_IFINDEX = 1, ++ OVPN_A_IFNAME, ++ OVPN_A_PEER, ++ OVPN_A_KEYCONF, ++ ++ __OVPN_A_MAX, ++ OVPN_A_MAX = (__OVPN_A_MAX - 1) ++}; ++ ++enum { ++ OVPN_CMD_PEER_NEW = 1, ++ OVPN_CMD_PEER_SET, ++ OVPN_CMD_PEER_GET, ++ OVPN_CMD_PEER_DEL, ++ OVPN_CMD_PEER_DEL_NTF, ++ OVPN_CMD_KEY_NEW, ++ OVPN_CMD_KEY_GET, ++ OVPN_CMD_KEY_SWAP, ++ OVPN_CMD_KEY_SWAP_NTF, ++ OVPN_CMD_KEY_DEL, ++ ++ __OVPN_CMD_MAX, ++ OVPN_CMD_MAX = (__OVPN_CMD_MAX - 1) ++}; ++ ++#define OVPN_MCGRP_PEERS "peers" ++ ++#endif /* _UAPI_LINUX_OVPN_H */ +diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h +index 1a0fe8b151fb..f9f8ffddfd0c 100644 +--- a/include/uapi/linux/udp.h ++++ b/include/uapi/linux/udp.h +@@ -43,5 +43,6 @@ struct udphdr { + #define UDP_ENCAP_GTP1U 5 /* 3GPP TS 29.060 */ + #define UDP_ENCAP_RXRPC 6 + #define TCP_ENCAP_ESPINTCP 7 /* Yikes, this is really xfrm encap types. 
*/ ++#define UDP_ENCAP_OVPNINUDP 8 /* OpenVPN traffic */ + + #endif /* _UAPI_LINUX_UDP_H */ +diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py +index 717530bc9c52..3ccbb301be87 100755 +--- a/tools/net/ynl/ynl-gen-c.py ++++ b/tools/net/ynl/ynl-gen-c.py +@@ -466,6 +466,8 @@ class TypeBinary(Type): + def _attr_policy(self, policy): + if 'exact-len' in self.checks: + mem = 'NLA_POLICY_EXACT_LEN(' + str(self.get_limit('exact-len')) + ')' ++ elif 'max-len' in self.checks: ++ mem = 'NLA_POLICY_MAX_LEN(' + str(self.get_limit('max-len')) + ')' + else: + mem = '{ ' + if len(self.checks) == 1 and 'min-len' in self.checks: +diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile +index ff18c0361e38..e4b4494b0765 100644 +--- a/tools/testing/selftests/Makefile ++++ b/tools/testing/selftests/Makefile +@@ -69,6 +69,7 @@ TARGETS += net/hsr + TARGETS += net/mptcp + TARGETS += net/netfilter + TARGETS += net/openvswitch ++TARGETS += net/ovpn + TARGETS += net/packetdrill + TARGETS += net/rds + TARGETS += net/tcp_ao +diff --git a/tools/testing/selftests/net/ovpn/.gitignore b/tools/testing/selftests/net/ovpn/.gitignore +new file mode 100644 +index 000000000000..ee44c081ca7c +--- /dev/null ++++ b/tools/testing/selftests/net/ovpn/.gitignore +@@ -0,0 +1,2 @@ ++# SPDX-License-Identifier: GPL-2.0+ ++ovpn-cli +diff --git a/tools/testing/selftests/net/ovpn/Makefile b/tools/testing/selftests/net/ovpn/Makefile +new file mode 100644 +index 000000000000..c76d8fd953c5 +--- /dev/null ++++ b/tools/testing/selftests/net/ovpn/Makefile +@@ -0,0 +1,17 @@ ++# SPDX-License-Identifier: GPL-2.0 ++# Copyright (C) 2020-2024 OpenVPN, Inc. ++# ++CFLAGS = -pedantic -Wextra -Wall -Wl,--no-as-needed -g -O0 -ggdb $(KHDR_INCLUDES) ++CFLAGS += $(shell pkg-config --cflags libnl-3.0 libnl-genl-3.0) ++ ++LDFLAGS = -lmbedtls -lmbedcrypto ++LDFLAGS += $(shell pkg-config --libs libnl-3.0 libnl-genl-3.0) ++ ++TEST_PROGS = test.sh \ ++ test-chachapoly.sh \ ++ test-tcp.sh \ ++ test-float.sh ++ ++TEST_GEN_FILES = ovpn-cli ++ ++include ../../lib.mk +diff --git a/tools/testing/selftests/net/ovpn/config b/tools/testing/selftests/net/ovpn/config +new file mode 100644 +index 000000000000..71946ba9fa17 +--- /dev/null ++++ b/tools/testing/selftests/net/ovpn/config +@@ -0,0 +1,10 @@ ++CONFIG_NET=y ++CONFIG_INET=y ++CONFIG_STREAM_PARSER=y ++CONFIG_NET_UDP_TUNNEL=y ++CONFIG_DST_CACHE=y ++CONFIG_CRYPTO=y ++CONFIG_CRYPTO_AES=y ++CONFIG_CRYPTO_GCM=y ++CONFIG_CRYPTO_CHACHA20POLY1305=y ++CONFIG_OVPN=m +diff --git a/tools/testing/selftests/net/ovpn/data64.key b/tools/testing/selftests/net/ovpn/data64.key +new file mode 100644 +index 000000000000..a99e88c4e290 +--- /dev/null ++++ b/tools/testing/selftests/net/ovpn/data64.key +@@ -0,0 +1,5 @@ ++jRqMACN7d7/aFQNT8S7jkrBD8uwrgHbG5OQZP2eu4R1Y7tfpS2bf5RHv06Vi163CGoaIiTX99R3B ++ia9ycAH8Wz1+9PWv51dnBLur9jbShlgZ2QHLtUc4a/gfT7zZwULXuuxdLnvR21DDeMBaTbkgbai9 ++uvAa7ne1liIgGFzbv+Bas4HDVrygxIxuAnP5Qgc3648IJkZ0QEXPF+O9f0n5+QIvGCxkAUVx+5K6 ++KIs+SoeWXnAopELmoGSjUpFtJbagXK82HfdqpuUxT2Tnuef0/14SzVE/vNleBNu2ZbyrSAaah8tE ++BofkPJUBFY+YQcfZNM5Dgrw3i+Bpmpq/gpdg5w== +diff --git a/tools/testing/selftests/net/ovpn/ovpn-cli.c b/tools/testing/selftests/net/ovpn/ovpn-cli.c +new file mode 100644 +index 000000000000..046dd069aaaf +--- /dev/null ++++ b/tools/testing/selftests/net/ovpn/ovpn-cli.c +@@ -0,0 +1,2370 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* OpenVPN data channel accelerator ++ * ++ * Copyright (C) 2020-2024 OpenVPN, Inc. 
++ * ++ * Author: Antonio Quartulli ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++ ++/* defines to make checkpatch happy */ ++#define strscpy strncpy ++#define __always_unused __attribute__((__unused__)) ++ ++/* libnl < 3.5.0 does not set the NLA_F_NESTED on its own, therefore we ++ * have to explicitly do it to prevent the kernel from failing upon ++ * parsing of the message ++ */ ++#define nla_nest_start(_msg, _type) \ ++ nla_nest_start(_msg, (_type) | NLA_F_NESTED) ++ ++uint64_t nla_get_uint(struct nlattr *attr) ++{ ++ if (nla_len(attr) == sizeof(uint32_t)) ++ return nla_get_u32(attr); ++ else ++ return nla_get_u64(attr); ++} ++ ++typedef int (*ovpn_nl_cb)(struct nl_msg *msg, void *arg); ++ ++enum ovpn_key_direction { ++ KEY_DIR_IN = 0, ++ KEY_DIR_OUT, ++}; ++ ++#define KEY_LEN (256 / 8) ++#define NONCE_LEN 8 ++ ++#define PEER_ID_UNDEF 0x00FFFFFF ++ ++struct nl_ctx { ++ struct nl_sock *nl_sock; ++ struct nl_msg *nl_msg; ++ struct nl_cb *nl_cb; ++ ++ int ovpn_dco_id; ++}; ++ ++enum ovpn_cmd { ++ CMD_INVALID, ++ CMD_NEW_IFACE, ++ CMD_DEL_IFACE, ++ CMD_LISTEN, ++ CMD_CONNECT, ++ CMD_NEW_PEER, ++ CMD_NEW_MULTI_PEER, ++ CMD_SET_PEER, ++ CMD_DEL_PEER, ++ CMD_GET_PEER, ++ CMD_NEW_KEY, ++ CMD_DEL_KEY, ++ CMD_GET_KEY, ++ CMD_SWAP_KEYS, ++ CMD_LISTEN_MCAST, ++}; ++ ++struct ovpn_ctx { ++ enum ovpn_cmd cmd; ++ ++ __u8 key_enc[KEY_LEN]; ++ __u8 key_dec[KEY_LEN]; ++ __u8 nonce[NONCE_LEN]; ++ ++ enum ovpn_cipher_alg cipher; ++ ++ sa_family_t sa_family; ++ ++ unsigned long peer_id; ++ unsigned long lport; ++ ++ union { ++ struct sockaddr_in in4; ++ struct sockaddr_in6 in6; ++ } remote; ++ ++ union { ++ struct sockaddr_in in4; ++ struct sockaddr_in6 in6; ++ } peer_ip; ++ ++ bool peer_ip_set; ++ ++ unsigned int ifindex; ++ char ifname[IFNAMSIZ]; ++ enum ovpn_mode mode; ++ bool mode_set; ++ ++ int socket; ++ int cli_socket; ++ ++ __u32 keepalive_interval; ++ __u32 keepalive_timeout; ++ ++ enum ovpn_key_direction key_dir; ++ enum ovpn_key_slot key_slot; ++ int key_id; ++ ++ const char *peers_file; ++}; ++ ++static int ovpn_nl_recvmsgs(struct nl_ctx *ctx) ++{ ++ int ret; ++ ++ ret = nl_recvmsgs(ctx->nl_sock, ctx->nl_cb); ++ ++ switch (ret) { ++ case -NLE_INTR: ++ fprintf(stderr, ++ "netlink received interrupt due to signal - ignoring\n"); ++ break; ++ case -NLE_NOMEM: ++ fprintf(stderr, "netlink out of memory error\n"); ++ break; ++ case -NLE_AGAIN: ++ fprintf(stderr, ++ "netlink reports blocking read - aborting wait\n"); ++ break; ++ default: ++ if (ret) ++ fprintf(stderr, "netlink reports error (%d): %s\n", ++ ret, nl_geterror(-ret)); ++ break; ++ } ++ ++ return ret; ++} ++ ++static struct nl_ctx *nl_ctx_alloc_flags(struct ovpn_ctx *ovpn, int cmd, ++ int flags) ++{ ++ struct nl_ctx *ctx; ++ int err, ret; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) ++ return NULL; ++ ++ ctx->nl_sock = nl_socket_alloc(); ++ if (!ctx->nl_sock) { ++ fprintf(stderr, "cannot allocate netlink socket\n"); ++ goto err_free; ++ } ++ ++ nl_socket_set_buffer_size(ctx->nl_sock, 8192, 8192); ++ ++ ret = genl_connect(ctx->nl_sock); ++ if (ret) { ++ fprintf(stderr, "cannot connect to generic netlink: %s\n", ++ nl_geterror(ret)); ++ goto err_sock; ++ } ++ ++ /* enable Extended ACK for detailed error reporting */ ++ err = 1; ++ setsockopt(nl_socket_get_fd(ctx->nl_sock), SOL_NETLINK, NETLINK_EXT_ACK, ++ &err, sizeof(err)); ++ ++ 
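++	/* resolve the numeric id of the ovpn generic netlink family; it is
++	 * needed by genlmsg_put() when building the request below
++	 */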
ctx->ovpn_dco_id = genl_ctrl_resolve(ctx->nl_sock, OVPN_FAMILY_NAME); ++ if (ctx->ovpn_dco_id < 0) { ++ fprintf(stderr, "cannot find ovpn_dco netlink component: %d\n", ++ ctx->ovpn_dco_id); ++ goto err_free; ++ } ++ ++ ctx->nl_msg = nlmsg_alloc(); ++ if (!ctx->nl_msg) { ++ fprintf(stderr, "cannot allocate netlink message\n"); ++ goto err_sock; ++ } ++ ++ ctx->nl_cb = nl_cb_alloc(NL_CB_DEFAULT); ++ if (!ctx->nl_cb) { ++ fprintf(stderr, "failed to allocate netlink callback\n"); ++ goto err_msg; ++ } ++ ++ nl_socket_set_cb(ctx->nl_sock, ctx->nl_cb); ++ ++ genlmsg_put(ctx->nl_msg, 0, 0, ctx->ovpn_dco_id, 0, flags, cmd, 0); ++ ++ if (ovpn->ifindex > 0) ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_IFINDEX, ovpn->ifindex); ++ ++ return ctx; ++nla_put_failure: ++err_msg: ++ nlmsg_free(ctx->nl_msg); ++err_sock: ++ nl_socket_free(ctx->nl_sock); ++err_free: ++ free(ctx); ++ return NULL; ++} ++ ++static struct nl_ctx *nl_ctx_alloc(struct ovpn_ctx *ovpn, int cmd) ++{ ++ return nl_ctx_alloc_flags(ovpn, cmd, 0); ++} ++ ++static void nl_ctx_free(struct nl_ctx *ctx) ++{ ++ if (!ctx) ++ return; ++ ++ nl_socket_free(ctx->nl_sock); ++ nlmsg_free(ctx->nl_msg); ++ nl_cb_put(ctx->nl_cb); ++ free(ctx); ++} ++ ++static int ovpn_nl_cb_error(struct sockaddr_nl (*nla)__always_unused, ++ struct nlmsgerr *err, void *arg) ++{ ++ struct nlmsghdr *nlh = (struct nlmsghdr *)err - 1; ++ struct nlattr *tb_msg[NLMSGERR_ATTR_MAX + 1]; ++ int len = nlh->nlmsg_len; ++ struct nlattr *attrs; ++ int *ret = arg; ++ int ack_len = sizeof(*nlh) + sizeof(int) + sizeof(*nlh); ++ ++ *ret = err->error; ++ ++ if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS)) ++ return NL_STOP; ++ ++ if (!(nlh->nlmsg_flags & NLM_F_CAPPED)) ++ ack_len += err->msg.nlmsg_len - sizeof(*nlh); ++ ++ if (len <= ack_len) ++ return NL_STOP; ++ ++ attrs = (void *)((uint8_t *)nlh + ack_len); ++ len -= ack_len; ++ ++ nla_parse(tb_msg, NLMSGERR_ATTR_MAX, attrs, len, NULL); ++ if (tb_msg[NLMSGERR_ATTR_MSG]) { ++ len = strnlen((char *)nla_data(tb_msg[NLMSGERR_ATTR_MSG]), ++ nla_len(tb_msg[NLMSGERR_ATTR_MSG])); ++ fprintf(stderr, "kernel error: %*s\n", len, ++ (char *)nla_data(tb_msg[NLMSGERR_ATTR_MSG])); ++ } ++ ++ if (tb_msg[NLMSGERR_ATTR_MISS_NEST]) { ++ fprintf(stderr, "missing required nesting type %u\n", ++ nla_get_u32(tb_msg[NLMSGERR_ATTR_MISS_NEST])); ++ } ++ ++ if (tb_msg[NLMSGERR_ATTR_MISS_TYPE]) { ++ fprintf(stderr, "missing required attribute type %u\n", ++ nla_get_u32(tb_msg[NLMSGERR_ATTR_MISS_TYPE])); ++ } ++ ++ return NL_STOP; ++} ++ ++static int ovpn_nl_cb_finish(struct nl_msg (*msg)__always_unused, ++ void *arg) ++{ ++ int *status = arg; ++ ++ *status = 0; ++ return NL_SKIP; ++} ++ ++static int ovpn_nl_cb_ack(struct nl_msg (*msg)__always_unused, ++ void *arg) ++{ ++ int *status = arg; ++ ++ *status = 0; ++ return NL_STOP; ++} ++ ++static int ovpn_nl_msg_send(struct nl_ctx *ctx, ovpn_nl_cb cb) ++{ ++ int status = 1; ++ ++ nl_cb_err(ctx->nl_cb, NL_CB_CUSTOM, ovpn_nl_cb_error, &status); ++ nl_cb_set(ctx->nl_cb, NL_CB_FINISH, NL_CB_CUSTOM, ovpn_nl_cb_finish, ++ &status); ++ nl_cb_set(ctx->nl_cb, NL_CB_ACK, NL_CB_CUSTOM, ovpn_nl_cb_ack, &status); ++ ++ if (cb) ++ nl_cb_set(ctx->nl_cb, NL_CB_VALID, NL_CB_CUSTOM, cb, ctx); ++ ++ nl_send_auto_complete(ctx->nl_sock, ctx->nl_msg); ++ ++ while (status == 1) ++ ovpn_nl_recvmsgs(ctx); ++ ++ if (status < 0) ++ fprintf(stderr, "failed to send netlink message: %s (%d)\n", ++ strerror(-status), status); ++ ++ return status; ++} ++ ++static int ovpn_parse_key(const char *file, struct ovpn_ctx *ctx) ++{ ++ int idx_enc, idx_dec, ret = -1; ++ 
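++	/* ckey holds the base64 text read from the key file, bkey its decoded
++	 * binary form (two cipher keys followed by the nonce tail)
++	 */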
unsigned char *ckey = NULL; ++ __u8 *bkey = NULL; ++ size_t olen = 0; ++ long ckey_len; ++ FILE *fp; ++ ++ fp = fopen(file, "r"); ++ if (!fp) { ++ fprintf(stderr, "cannot open: %s\n", file); ++ return -1; ++ } ++ ++ /* get file size */ ++ fseek(fp, 0L, SEEK_END); ++ ckey_len = ftell(fp); ++ rewind(fp); ++ ++ /* if the file is longer, let's just read a portion */ ++ if (ckey_len > 256) ++ ckey_len = 256; ++ ++ ckey = malloc(ckey_len); ++ if (!ckey) ++ goto err; ++ ++ ret = fread(ckey, 1, ckey_len, fp); ++ if (ret != ckey_len) { ++ fprintf(stderr, ++ "couldn't read enough data from key file: %dbytes read\n", ++ ret); ++ goto err; ++ } ++ ++ olen = 0; ++ ret = mbedtls_base64_decode(NULL, 0, &olen, ckey, ckey_len); ++ if (ret != MBEDTLS_ERR_BASE64_BUFFER_TOO_SMALL) { ++ char buf[256]; ++ ++ mbedtls_strerror(ret, buf, sizeof(buf)); ++ fprintf(stderr, "unexpected base64 error1: %s (%d)\n", buf, ++ ret); ++ ++ goto err; ++ } ++ ++ bkey = malloc(olen); ++ if (!bkey) { ++ fprintf(stderr, "cannot allocate binary key buffer\n"); ++ goto err; ++ } ++ ++ ret = mbedtls_base64_decode(bkey, olen, &olen, ckey, ckey_len); ++ if (ret) { ++ char buf[256]; ++ ++ mbedtls_strerror(ret, buf, sizeof(buf)); ++ fprintf(stderr, "unexpected base64 error2: %s (%d)\n", buf, ++ ret); ++ ++ goto err; ++ } ++ ++ if (olen < 2 * KEY_LEN + NONCE_LEN) { ++ fprintf(stderr, ++ "not enough data in key file, found %zdB but needs %dB\n", ++ olen, 2 * KEY_LEN + NONCE_LEN); ++ goto err; ++ } ++ ++ switch (ctx->key_dir) { ++ case KEY_DIR_IN: ++ idx_enc = 0; ++ idx_dec = 1; ++ break; ++ case KEY_DIR_OUT: ++ idx_enc = 1; ++ idx_dec = 0; ++ break; ++ default: ++ goto err; ++ } ++ ++ memcpy(ctx->key_enc, bkey + KEY_LEN * idx_enc, KEY_LEN); ++ memcpy(ctx->key_dec, bkey + KEY_LEN * idx_dec, KEY_LEN); ++ memcpy(ctx->nonce, bkey + 2 * KEY_LEN, NONCE_LEN); ++ ++ ret = 0; ++ ++err: ++ fclose(fp); ++ free(bkey); ++ free(ckey); ++ ++ return ret; ++} ++ ++static int ovpn_parse_cipher(const char *cipher, struct ovpn_ctx *ctx) ++{ ++ if (strcmp(cipher, "aes") == 0) ++ ctx->cipher = OVPN_CIPHER_ALG_AES_GCM; ++ else if (strcmp(cipher, "chachapoly") == 0) ++ ctx->cipher = OVPN_CIPHER_ALG_CHACHA20_POLY1305; ++ else if (strcmp(cipher, "none") == 0) ++ ctx->cipher = OVPN_CIPHER_ALG_NONE; ++ else ++ return -ENOTSUP; ++ ++ return 0; ++} ++ ++static int ovpn_parse_key_direction(const char *dir, struct ovpn_ctx *ctx) ++{ ++ int in_dir; ++ ++ in_dir = strtoll(dir, NULL, 10); ++ switch (in_dir) { ++ case KEY_DIR_IN: ++ case KEY_DIR_OUT: ++ ctx->key_dir = in_dir; ++ break; ++ default: ++ fprintf(stderr, ++ "invalid key direction provided. 
Can be 0 or 1 only\n"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int ovpn_socket(struct ovpn_ctx *ctx, sa_family_t family, int proto) ++{ ++ struct sockaddr_storage local_sock = { 0 }; ++ struct sockaddr_in6 *in6; ++ struct sockaddr_in *in; ++ int ret, s, sock_type; ++ size_t sock_len; ++ ++ if (proto == IPPROTO_UDP) ++ sock_type = SOCK_DGRAM; ++ else if (proto == IPPROTO_TCP) ++ sock_type = SOCK_STREAM; ++ else ++ return -EINVAL; ++ ++ s = socket(family, sock_type, 0); ++ if (s < 0) { ++ perror("cannot create socket"); ++ return -1; ++ } ++ ++ switch (family) { ++ case AF_INET: ++ in = (struct sockaddr_in *)&local_sock; ++ in->sin_family = family; ++ in->sin_port = htons(ctx->lport); ++ in->sin_addr.s_addr = htonl(INADDR_ANY); ++ sock_len = sizeof(*in); ++ break; ++ case AF_INET6: ++ in6 = (struct sockaddr_in6 *)&local_sock; ++ in6->sin6_family = family; ++ in6->sin6_port = htons(ctx->lport); ++ in6->sin6_addr = in6addr_any; ++ sock_len = sizeof(*in6); ++ break; ++ default: ++ return -1; ++ } ++ ++ int opt = 1; ++ ++ ret = setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); ++ ++ if (ret < 0) { ++ perror("setsockopt for SO_REUSEADDR"); ++ return ret; ++ } ++ ++ ret = setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)); ++ if (ret < 0) { ++ perror("setsockopt for SO_REUSEPORT"); ++ return ret; ++ } ++ ++ if (family == AF_INET6) { ++ opt = 0; ++ if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &opt, ++ sizeof(opt))) { ++ perror("failed to set IPV6_V6ONLY"); ++ return -1; ++ } ++ } ++ ++ ret = bind(s, (struct sockaddr *)&local_sock, sock_len); ++ if (ret < 0) { ++ perror("cannot bind socket"); ++ goto err_socket; ++ } ++ ++ ctx->socket = s; ++ ctx->sa_family = family; ++ return 0; ++ ++err_socket: ++ close(s); ++ return -1; ++} ++ ++static int ovpn_udp_socket(struct ovpn_ctx *ctx, sa_family_t family) ++{ ++ return ovpn_socket(ctx, family, IPPROTO_UDP); ++} ++ ++static int ovpn_listen(struct ovpn_ctx *ctx, sa_family_t family) ++{ ++ int ret; ++ ++ ret = ovpn_socket(ctx, family, IPPROTO_TCP); ++ if (ret < 0) ++ return ret; ++ ++ ret = listen(ctx->socket, 10); ++ if (ret < 0) { ++ perror("listen"); ++ close(ctx->socket); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int ovpn_accept(struct ovpn_ctx *ctx) ++{ ++ socklen_t socklen; ++ int ret; ++ ++ socklen = sizeof(ctx->remote); ++ ret = accept(ctx->socket, (struct sockaddr *)&ctx->remote, &socklen); ++ if (ret < 0) { ++ perror("accept"); ++ goto err; ++ } ++ ++ fprintf(stderr, "Connection received!\n"); ++ ++ switch (socklen) { ++ case sizeof(struct sockaddr_in): ++ case sizeof(struct sockaddr_in6): ++ break; ++ default: ++ fprintf(stderr, "error: expecting IPv4 or IPv6 connection\n"); ++ close(ret); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ return ret; ++err: ++ close(ctx->socket); ++ return ret; ++} ++ ++static int ovpn_connect(struct ovpn_ctx *ovpn) ++{ ++ socklen_t socklen; ++ int s, ret; ++ ++ s = socket(ovpn->remote.in4.sin_family, SOCK_STREAM, 0); ++ if (s < 0) { ++ perror("cannot create socket"); ++ return -1; ++ } ++ ++ switch (ovpn->remote.in4.sin_family) { ++ case AF_INET: ++ socklen = sizeof(struct sockaddr_in); ++ break; ++ case AF_INET6: ++ socklen = sizeof(struct sockaddr_in6); ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ ret = connect(s, (struct sockaddr *)&ovpn->remote, socklen); ++ if (ret < 0) { ++ perror("connect"); ++ goto err; ++ } ++ ++ fprintf(stderr, "connected\n"); ++ ++ ovpn->socket = s; ++ ++ return 0; ++err: ++ close(s); ++ return ret; ++} ++ ++static int ovpn_new_peer(struct 
ovpn_ctx *ovpn, bool is_tcp) ++{ ++ struct nlattr *attr; ++ struct nl_ctx *ctx; ++ int ret = -1; ++ ++ ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_NEW); ++ if (!ctx) ++ return -ENOMEM; ++ ++ attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_SOCKET, ovpn->socket); ++ ++ if (!is_tcp) { ++ switch (ovpn->remote.in4.sin_family) { ++ case AF_INET: ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_REMOTE_IPV4, ++ ovpn->remote.in4.sin_addr.s_addr); ++ NLA_PUT_U16(ctx->nl_msg, OVPN_A_PEER_REMOTE_PORT, ++ ovpn->remote.in4.sin_port); ++ break; ++ case AF_INET6: ++ NLA_PUT(ctx->nl_msg, OVPN_A_PEER_REMOTE_IPV6, ++ sizeof(ovpn->remote.in6.sin6_addr), ++ &ovpn->remote.in6.sin6_addr); ++ NLA_PUT_U32(ctx->nl_msg, ++ OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, ++ ovpn->remote.in6.sin6_scope_id); ++ NLA_PUT_U16(ctx->nl_msg, OVPN_A_PEER_REMOTE_PORT, ++ ovpn->remote.in6.sin6_port); ++ break; ++ default: ++ fprintf(stderr, ++ "Invalid family for remote socket address\n"); ++ goto nla_put_failure; ++ } ++ } ++ ++ if (ovpn->peer_ip_set) { ++ switch (ovpn->peer_ip.in4.sin_family) { ++ case AF_INET: ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_VPN_IPV4, ++ ovpn->peer_ip.in4.sin_addr.s_addr); ++ break; ++ case AF_INET6: ++ NLA_PUT(ctx->nl_msg, OVPN_A_PEER_VPN_IPV6, ++ sizeof(struct in6_addr), ++ &ovpn->peer_ip.in6.sin6_addr); ++ break; ++ default: ++ fprintf(stderr, "Invalid family for peer address\n"); ++ goto nla_put_failure; ++ } ++ } ++ ++ nla_nest_end(ctx->nl_msg, attr); ++ ++ ret = ovpn_nl_msg_send(ctx, NULL); ++nla_put_failure: ++ nl_ctx_free(ctx); ++ return ret; ++} ++ ++static int ovpn_set_peer(struct ovpn_ctx *ovpn) ++{ ++ struct nlattr *attr; ++ struct nl_ctx *ctx; ++ int ret = -1; ++ ++ ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_SET); ++ if (!ctx) ++ return -ENOMEM; ++ ++ attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_KEEPALIVE_INTERVAL, ++ ovpn->keepalive_interval); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_KEEPALIVE_TIMEOUT, ++ ovpn->keepalive_timeout); ++ nla_nest_end(ctx->nl_msg, attr); ++ ++ ret = ovpn_nl_msg_send(ctx, NULL); ++nla_put_failure: ++ nl_ctx_free(ctx); ++ return ret; ++} ++ ++static int ovpn_del_peer(struct ovpn_ctx *ovpn) ++{ ++ struct nlattr *attr; ++ struct nl_ctx *ctx; ++ int ret = -1; ++ ++ ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_DEL); ++ if (!ctx) ++ return -ENOMEM; ++ ++ attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); ++ nla_nest_end(ctx->nl_msg, attr); ++ ++ ret = ovpn_nl_msg_send(ctx, NULL); ++nla_put_failure: ++ nl_ctx_free(ctx); ++ return ret; ++} ++ ++static int ovpn_handle_peer(struct nl_msg *msg, void (*arg)__always_unused) ++{ ++ struct nlattr *pattrs[OVPN_A_PEER_MAX + 1]; ++ struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); ++ struct nlattr *attrs[OVPN_A_MAX + 1]; ++ __u16 rport = 0, lport = 0; ++ ++ nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0), ++ genlmsg_attrlen(gnlh, 0), NULL); ++ ++ if (!attrs[OVPN_A_PEER]) { ++ fprintf(stderr, "no packet content in netlink message\n"); ++ return NL_SKIP; ++ } ++ ++ nla_parse(pattrs, OVPN_A_PEER_MAX, nla_data(attrs[OVPN_A_PEER]), ++ nla_len(attrs[OVPN_A_PEER]), NULL); ++ ++ if (pattrs[OVPN_A_PEER_ID]) ++ fprintf(stderr, "* Peer %u\n", ++ nla_get_u32(pattrs[OVPN_A_PEER_ID])); ++ ++ if (pattrs[OVPN_A_PEER_VPN_IPV4]) { ++ char buf[INET_ADDRSTRLEN]; ++ ++ inet_ntop(AF_INET, 
nla_data(pattrs[OVPN_A_PEER_VPN_IPV4]), ++ buf, sizeof(buf)); ++ fprintf(stderr, "\tVPN IPv4: %s\n", buf); ++ } ++ ++ if (pattrs[OVPN_A_PEER_VPN_IPV6]) { ++ char buf[INET6_ADDRSTRLEN]; ++ ++ inet_ntop(AF_INET6, nla_data(pattrs[OVPN_A_PEER_VPN_IPV6]), ++ buf, sizeof(buf)); ++ fprintf(stderr, "\tVPN IPv6: %s\n", buf); ++ } ++ ++ if (pattrs[OVPN_A_PEER_LOCAL_PORT]) ++ lport = ntohs(nla_get_u16(pattrs[OVPN_A_PEER_LOCAL_PORT])); ++ ++ if (pattrs[OVPN_A_PEER_REMOTE_PORT]) ++ rport = ntohs(nla_get_u16(pattrs[OVPN_A_PEER_REMOTE_PORT])); ++ ++ if (pattrs[OVPN_A_PEER_REMOTE_IPV6]) { ++ void *ip = pattrs[OVPN_A_PEER_REMOTE_IPV6]; ++ char buf[INET6_ADDRSTRLEN]; ++ int scope_id = -1; ++ ++ if (pattrs[OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID]) { ++ void *p = pattrs[OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID]; ++ ++ scope_id = nla_get_u32(p); ++ } ++ ++ inet_ntop(AF_INET6, nla_data(ip), buf, sizeof(buf)); ++ fprintf(stderr, "\tRemote: %s:%hu (scope-id: %u)\n", buf, rport, ++ scope_id); ++ ++ if (pattrs[OVPN_A_PEER_LOCAL_IPV6]) { ++ void *ip = pattrs[OVPN_A_PEER_LOCAL_IPV6]; ++ ++ inet_ntop(AF_INET6, nla_data(ip), buf, sizeof(buf)); ++ fprintf(stderr, "\tLocal: %s:%hu\n", buf, lport); ++ } ++ } ++ ++ if (pattrs[OVPN_A_PEER_REMOTE_IPV4]) { ++ void *ip = pattrs[OVPN_A_PEER_REMOTE_IPV4]; ++ char buf[INET_ADDRSTRLEN]; ++ ++ inet_ntop(AF_INET, nla_data(ip), buf, sizeof(buf)); ++ fprintf(stderr, "\tRemote: %s:%hu\n", buf, rport); ++ ++ if (pattrs[OVPN_A_PEER_LOCAL_IPV4]) { ++ void *p = pattrs[OVPN_A_PEER_LOCAL_IPV4]; ++ ++ inet_ntop(AF_INET, nla_data(p), buf, sizeof(buf)); ++ fprintf(stderr, "\tLocal: %s:%hu\n", buf, lport); ++ } ++ } ++ ++ if (pattrs[OVPN_A_PEER_KEEPALIVE_INTERVAL]) { ++ void *p = pattrs[OVPN_A_PEER_KEEPALIVE_INTERVAL]; ++ ++ fprintf(stderr, "\tKeepalive interval: %u sec\n", ++ nla_get_u32(p)); ++ } ++ ++ if (pattrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]) ++ fprintf(stderr, "\tKeepalive timeout: %u sec\n", ++ nla_get_u32(pattrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT])); ++ ++ if (pattrs[OVPN_A_PEER_VPN_RX_BYTES]) ++ fprintf(stderr, "\tVPN RX bytes: %" PRIu64 "\n", ++ nla_get_uint(pattrs[OVPN_A_PEER_VPN_RX_BYTES])); ++ ++ if (pattrs[OVPN_A_PEER_VPN_TX_BYTES]) ++ fprintf(stderr, "\tVPN TX bytes: %" PRIu64 "\n", ++ nla_get_uint(pattrs[OVPN_A_PEER_VPN_TX_BYTES])); ++ ++ if (pattrs[OVPN_A_PEER_VPN_RX_PACKETS]) ++ fprintf(stderr, "\tVPN RX packets: %" PRIu64 "\n", ++ nla_get_uint(pattrs[OVPN_A_PEER_VPN_RX_PACKETS])); ++ ++ if (pattrs[OVPN_A_PEER_VPN_TX_PACKETS]) ++ fprintf(stderr, "\tVPN TX packets: %" PRIu64 "\n", ++ nla_get_uint(pattrs[OVPN_A_PEER_VPN_TX_PACKETS])); ++ ++ if (pattrs[OVPN_A_PEER_LINK_RX_BYTES]) ++ fprintf(stderr, "\tLINK RX bytes: %" PRIu64 "\n", ++ nla_get_uint(pattrs[OVPN_A_PEER_LINK_RX_BYTES])); ++ ++ if (pattrs[OVPN_A_PEER_LINK_TX_BYTES]) ++ fprintf(stderr, "\tLINK TX bytes: %" PRIu64 "\n", ++ nla_get_uint(pattrs[OVPN_A_PEER_LINK_TX_BYTES])); ++ ++ if (pattrs[OVPN_A_PEER_LINK_RX_PACKETS]) ++ fprintf(stderr, "\tLINK RX packets: %" PRIu64 "\n", ++ nla_get_uint(pattrs[OVPN_A_PEER_LINK_RX_PACKETS])); ++ ++ if (pattrs[OVPN_A_PEER_LINK_TX_PACKETS]) ++ fprintf(stderr, "\tLINK TX packets: %" PRIu64 "\n", ++ nla_get_uint(pattrs[OVPN_A_PEER_LINK_TX_PACKETS])); ++ ++ return NL_SKIP; ++} ++ ++static int ovpn_get_peer(struct ovpn_ctx *ovpn) ++{ ++ int flags = 0, ret = -1; ++ struct nlattr *attr; ++ struct nl_ctx *ctx; ++ ++ if (ovpn->peer_id == PEER_ID_UNDEF) ++ flags = NLM_F_DUMP; ++ ++ ctx = nl_ctx_alloc_flags(ovpn, OVPN_CMD_PEER_GET, flags); ++ if (!ctx) ++ return -ENOMEM; ++ ++ if (ovpn->peer_id != PEER_ID_UNDEF) { ++ 
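++		/* a specific peer was requested: nest its id in the message;
++		 * without an id the NLM_F_DUMP request set above returns all
++		 * known peers
++		 */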
attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); ++ nla_nest_end(ctx->nl_msg, attr); ++ } ++ ++ ret = ovpn_nl_msg_send(ctx, ovpn_handle_peer); ++nla_put_failure: ++ nl_ctx_free(ctx); ++ return ret; ++} ++ ++static int ovpn_new_key(struct ovpn_ctx *ovpn) ++{ ++ struct nlattr *keyconf, *key_dir; ++ struct nl_ctx *ctx; ++ int ret = -1; ++ ++ ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_NEW); ++ if (!ctx) ++ return -ENOMEM; ++ ++ keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_KEY_ID, ovpn->key_id); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_CIPHER_ALG, ovpn->cipher); ++ ++ key_dir = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF_ENCRYPT_DIR); ++ NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_CIPHER_KEY, KEY_LEN, ovpn->key_enc); ++ NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_NONCE_TAIL, NONCE_LEN, ovpn->nonce); ++ nla_nest_end(ctx->nl_msg, key_dir); ++ ++ key_dir = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF_DECRYPT_DIR); ++ NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_CIPHER_KEY, KEY_LEN, ovpn->key_dec); ++ NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_NONCE_TAIL, NONCE_LEN, ovpn->nonce); ++ nla_nest_end(ctx->nl_msg, key_dir); ++ ++ nla_nest_end(ctx->nl_msg, keyconf); ++ ++ ret = ovpn_nl_msg_send(ctx, NULL); ++nla_put_failure: ++ nl_ctx_free(ctx); ++ return ret; ++} ++ ++static int ovpn_del_key(struct ovpn_ctx *ovpn) ++{ ++ struct nlattr *keyconf; ++ struct nl_ctx *ctx; ++ int ret = -1; ++ ++ ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_DEL); ++ if (!ctx) ++ return -ENOMEM; ++ ++ keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot); ++ nla_nest_end(ctx->nl_msg, keyconf); ++ ++ ret = ovpn_nl_msg_send(ctx, NULL); ++nla_put_failure: ++ nl_ctx_free(ctx); ++ return ret; ++} ++ ++static int ovpn_handle_key(struct nl_msg *msg, void (*arg)__always_unused) ++{ ++ struct nlattr *kattrs[OVPN_A_KEYCONF_MAX + 1]; ++ struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); ++ struct nlattr *attrs[OVPN_A_MAX + 1]; ++ ++ nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0), ++ genlmsg_attrlen(gnlh, 0), NULL); ++ ++ if (!attrs[OVPN_A_KEYCONF]) { ++ fprintf(stderr, "no packet content in netlink message\n"); ++ return NL_SKIP; ++ } ++ ++ nla_parse(kattrs, OVPN_A_KEYCONF_MAX, nla_data(attrs[OVPN_A_KEYCONF]), ++ nla_len(attrs[OVPN_A_KEYCONF]), NULL); ++ ++ if (kattrs[OVPN_A_KEYCONF_PEER_ID]) ++ fprintf(stderr, "* Peer %u\n", ++ nla_get_u32(kattrs[OVPN_A_KEYCONF_PEER_ID])); ++ if (kattrs[OVPN_A_KEYCONF_SLOT]) { ++ fprintf(stderr, "\t- Slot: "); ++ switch (nla_get_u32(kattrs[OVPN_A_KEYCONF_SLOT])) { ++ case OVPN_KEY_SLOT_PRIMARY: ++ fprintf(stderr, "primary\n"); ++ break; ++ case OVPN_KEY_SLOT_SECONDARY: ++ fprintf(stderr, "secondary\n"); ++ break; ++ default: ++ fprintf(stderr, "invalid (%u)\n", ++ nla_get_u32(kattrs[OVPN_A_KEYCONF_SLOT])); ++ break; ++ } ++ } ++ if (kattrs[OVPN_A_KEYCONF_KEY_ID]) ++ fprintf(stderr, "\t- Key ID: %u\n", ++ nla_get_u32(kattrs[OVPN_A_KEYCONF_KEY_ID])); ++ if (kattrs[OVPN_A_KEYCONF_CIPHER_ALG]) { ++ fprintf(stderr, "\t- Cipher: "); ++ switch (nla_get_u32(kattrs[OVPN_A_KEYCONF_CIPHER_ALG])) { ++ case OVPN_CIPHER_ALG_NONE: ++ fprintf(stderr, "none\n"); ++ break; ++ case OVPN_CIPHER_ALG_AES_GCM: ++ fprintf(stderr, "aes-gcm\n"); ++ break; ++ case 
OVPN_CIPHER_ALG_CHACHA20_POLY1305: ++ fprintf(stderr, "chacha20poly1305\n"); ++ break; ++ default: ++ fprintf(stderr, "invalid (%u)\n", ++ nla_get_u32(kattrs[OVPN_A_KEYCONF_CIPHER_ALG])); ++ break; ++ } ++ } ++ ++ return NL_SKIP; ++} ++ ++static int ovpn_get_key(struct ovpn_ctx *ovpn) ++{ ++ struct nlattr *keyconf; ++ struct nl_ctx *ctx; ++ int ret = -1; ++ ++ ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_GET); ++ if (!ctx) ++ return -ENOMEM; ++ ++ keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot); ++ nla_nest_end(ctx->nl_msg, keyconf); ++ ++ ret = ovpn_nl_msg_send(ctx, ovpn_handle_key); ++nla_put_failure: ++ nl_ctx_free(ctx); ++ return ret; ++} ++ ++static int ovpn_swap_keys(struct ovpn_ctx *ovpn) ++{ ++ struct nl_ctx *ctx; ++ struct nlattr *kc; ++ int ret = -1; ++ ++ ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_SWAP); ++ if (!ctx) ++ return -ENOMEM; ++ ++ kc = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); ++ NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); ++ nla_nest_end(ctx->nl_msg, kc); ++ ++ ret = ovpn_nl_msg_send(ctx, NULL); ++nla_put_failure: ++ nl_ctx_free(ctx); ++ return ret; ++} ++ ++/** ++ * Helper function used to easily add attributes to a rtnl message ++ */ ++static int ovpn_addattr(struct nlmsghdr *n, int maxlen, int type, ++ const void *data, int alen) ++{ ++ int len = RTA_LENGTH(alen); ++ struct rtattr *rta; ++ ++ if ((int)(NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len)) > maxlen) { ++ fprintf(stderr, "%s: rtnl: message exceeded bound of %d\n", ++ __func__, maxlen); ++ return -EMSGSIZE; ++ } ++ ++ rta = nlmsg_tail(n); ++ rta->rta_type = type; ++ rta->rta_len = len; ++ ++ if (!data) ++ memset(RTA_DATA(rta), 0, alen); ++ else ++ memcpy(RTA_DATA(rta), data, alen); ++ ++ n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len); ++ ++ return 0; ++} ++ ++static struct rtattr *ovpn_nest_start(struct nlmsghdr *msg, size_t max_size, ++ int attr) ++{ ++ struct rtattr *nest = nlmsg_tail(msg); ++ ++ if (ovpn_addattr(msg, max_size, attr, NULL, 0) < 0) ++ return NULL; ++ ++ return nest; ++} ++ ++static void ovpn_nest_end(struct nlmsghdr *msg, struct rtattr *nest) ++{ ++ nest->rta_len = (uint8_t *)nlmsg_tail(msg) - (uint8_t *)nest; ++} ++ ++#define RT_SNDBUF_SIZE (1024 * 2) ++#define RT_RCVBUF_SIZE (1024 * 4) ++ ++/** ++ * Open RTNL socket ++ */ ++static int ovpn_rt_socket(void) ++{ ++ int sndbuf = RT_SNDBUF_SIZE, rcvbuf = RT_RCVBUF_SIZE, fd; ++ ++ fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); ++ if (fd < 0) { ++ fprintf(stderr, "%s: cannot open netlink socket\n", __func__); ++ return fd; ++ } ++ ++ if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, ++ sizeof(sndbuf)) < 0) { ++ fprintf(stderr, "%s: SO_SNDBUF\n", __func__); ++ close(fd); ++ return -1; ++ } ++ ++ if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, ++ sizeof(rcvbuf)) < 0) { ++ fprintf(stderr, "%s: SO_RCVBUF\n", __func__); ++ close(fd); ++ return -1; ++ } ++ ++ return fd; ++} ++ ++/** ++ * Bind socket to Netlink subsystem ++ */ ++static int ovpn_rt_bind(int fd, uint32_t groups) ++{ ++ struct sockaddr_nl local = { 0 }; ++ socklen_t addr_len; ++ ++ local.nl_family = AF_NETLINK; ++ local.nl_groups = groups; ++ ++ if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) { ++ fprintf(stderr, "%s: cannot bind netlink socket: %d\n", ++ __func__, errno); ++ return -errno; ++ } ++ ++ addr_len = sizeof(local); ++ if (getsockname(fd, (struct sockaddr *)&local, &addr_len) < 0) { ++ fprintf(stderr, 
"%s: cannot getsockname: %d\n", __func__, ++ errno); ++ return -errno; ++ } ++ ++ if (addr_len != sizeof(local)) { ++ fprintf(stderr, "%s: wrong address length %d\n", __func__, ++ addr_len); ++ return -EINVAL; ++ } ++ ++ if (local.nl_family != AF_NETLINK) { ++ fprintf(stderr, "%s: wrong address family %d\n", __func__, ++ local.nl_family); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++typedef int (*ovpn_parse_reply_cb)(struct nlmsghdr *msg, void *arg); ++ ++/** ++ * Send Netlink message and run callback on reply (if specified) ++ */ ++static int ovpn_rt_send(struct nlmsghdr *payload, pid_t peer, ++ unsigned int groups, ovpn_parse_reply_cb cb, ++ void *arg_cb) ++{ ++ int len, rem_len, fd, ret, rcv_len; ++ struct sockaddr_nl nladdr = { 0 }; ++ struct nlmsgerr *err; ++ struct nlmsghdr *h; ++ char buf[1024 * 16]; ++ struct iovec iov = { ++ .iov_base = payload, ++ .iov_len = payload->nlmsg_len, ++ }; ++ struct msghdr nlmsg = { ++ .msg_name = &nladdr, ++ .msg_namelen = sizeof(nladdr), ++ .msg_iov = &iov, ++ .msg_iovlen = 1, ++ }; ++ ++ nladdr.nl_family = AF_NETLINK; ++ nladdr.nl_pid = peer; ++ nladdr.nl_groups = groups; ++ ++ payload->nlmsg_seq = time(NULL); ++ ++ /* no need to send reply */ ++ if (!cb) ++ payload->nlmsg_flags |= NLM_F_ACK; ++ ++ fd = ovpn_rt_socket(); ++ if (fd < 0) { ++ fprintf(stderr, "%s: can't open rtnl socket\n", __func__); ++ return -errno; ++ } ++ ++ ret = ovpn_rt_bind(fd, 0); ++ if (ret < 0) { ++ fprintf(stderr, "%s: can't bind rtnl socket\n", __func__); ++ ret = -errno; ++ goto out; ++ } ++ ++ ret = sendmsg(fd, &nlmsg, 0); ++ if (ret < 0) { ++ fprintf(stderr, "%s: rtnl: error on sendmsg()\n", __func__); ++ ret = -errno; ++ goto out; ++ } ++ ++ /* prepare buffer to store RTNL replies */ ++ memset(buf, 0, sizeof(buf)); ++ iov.iov_base = buf; ++ ++ while (1) { ++ /* ++ * iov_len is modified by recvmsg(), therefore has to be initialized before ++ * using it again ++ */ ++ iov.iov_len = sizeof(buf); ++ rcv_len = recvmsg(fd, &nlmsg, 0); ++ if (rcv_len < 0) { ++ if (errno == EINTR || errno == EAGAIN) { ++ fprintf(stderr, "%s: interrupted call\n", ++ __func__); ++ continue; ++ } ++ fprintf(stderr, "%s: rtnl: error on recvmsg()\n", ++ __func__); ++ ret = -errno; ++ goto out; ++ } ++ ++ if (rcv_len == 0) { ++ fprintf(stderr, ++ "%s: rtnl: socket reached unexpected EOF\n", ++ __func__); ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (nlmsg.msg_namelen != sizeof(nladdr)) { ++ fprintf(stderr, ++ "%s: sender address length: %u (expected %zu)\n", ++ __func__, nlmsg.msg_namelen, sizeof(nladdr)); ++ ret = -EIO; ++ goto out; ++ } ++ ++ h = (struct nlmsghdr *)buf; ++ while (rcv_len >= (int)sizeof(*h)) { ++ len = h->nlmsg_len; ++ rem_len = len - sizeof(*h); ++ ++ if (rem_len < 0 || len > rcv_len) { ++ if (nlmsg.msg_flags & MSG_TRUNC) { ++ fprintf(stderr, "%s: truncated message\n", ++ __func__); ++ ret = -EIO; ++ goto out; ++ } ++ fprintf(stderr, "%s: malformed message: len=%d\n", ++ __func__, len); ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (h->nlmsg_type == NLMSG_DONE) { ++ ret = 0; ++ goto out; ++ } ++ ++ if (h->nlmsg_type == NLMSG_ERROR) { ++ err = (struct nlmsgerr *)NLMSG_DATA(h); ++ if (rem_len < (int)sizeof(struct nlmsgerr)) { ++ fprintf(stderr, "%s: ERROR truncated\n", ++ __func__); ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (err->error) { ++ fprintf(stderr, "%s: (%d) %s\n", ++ __func__, err->error, ++ strerror(-err->error)); ++ ret = err->error; ++ goto out; ++ } ++ ++ ret = 0; ++ if (cb) { ++ int r = cb(h, arg_cb); ++ ++ if (r <= 0) ++ ret = r; ++ } ++ goto out; ++ } ++ ++ if (cb) { 
++ int r = cb(h, arg_cb); ++ ++ if (r <= 0) { ++ ret = r; ++ goto out; ++ } ++ } else { ++ fprintf(stderr, "%s: RTNL: unexpected reply\n", ++ __func__); ++ } ++ ++ rcv_len -= NLMSG_ALIGN(len); ++ h = (struct nlmsghdr *)((uint8_t *)h + ++ NLMSG_ALIGN(len)); ++ } ++ ++ if (nlmsg.msg_flags & MSG_TRUNC) { ++ fprintf(stderr, "%s: message truncated\n", __func__); ++ continue; ++ } ++ ++ if (rcv_len) { ++ fprintf(stderr, "%s: rtnl: %d not parsed bytes\n", ++ __func__, rcv_len); ++ ret = -1; ++ goto out; ++ } ++ } ++out: ++ close(fd); ++ ++ return ret; ++} ++ ++struct ovpn_link_req { ++ struct nlmsghdr n; ++ struct ifinfomsg i; ++ char buf[256]; ++}; ++ ++static int ovpn_new_iface(struct ovpn_ctx *ovpn) ++{ ++ struct rtattr *linkinfo, *data; ++ struct ovpn_link_req req = { 0 }; ++ int ret = -1; ++ ++ fprintf(stdout, "Creating interface %s with mode %u\n", ovpn->ifname, ++ ovpn->mode); ++ ++ req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.i)); ++ req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; ++ req.n.nlmsg_type = RTM_NEWLINK; ++ ++ if (ovpn_addattr(&req.n, sizeof(req), IFLA_IFNAME, ovpn->ifname, ++ strlen(ovpn->ifname) + 1) < 0) ++ goto err; ++ ++ linkinfo = ovpn_nest_start(&req.n, sizeof(req), IFLA_LINKINFO); ++ if (!linkinfo) ++ goto err; ++ ++ if (ovpn_addattr(&req.n, sizeof(req), IFLA_INFO_KIND, OVPN_FAMILY_NAME, ++ strlen(OVPN_FAMILY_NAME) + 1) < 0) ++ goto err; ++ ++ if (ovpn->mode_set) { ++ data = ovpn_nest_start(&req.n, sizeof(req), IFLA_INFO_DATA); ++ if (!data) ++ goto err; ++ ++ if (ovpn_addattr(&req.n, sizeof(req), IFLA_OVPN_MODE, ++ &ovpn->mode, sizeof(uint8_t)) < 0) ++ goto err; ++ ++ ovpn_nest_end(&req.n, data); ++ } ++ ++ ovpn_nest_end(&req.n, linkinfo); ++ ++ req.i.ifi_family = AF_PACKET; ++ ++ ret = ovpn_rt_send(&req.n, 0, 0, NULL, NULL); ++err: ++ return ret; ++} ++ ++static int ovpn_del_iface(struct ovpn_ctx *ovpn) ++{ ++ struct ovpn_link_req req = { 0 }; ++ ++ fprintf(stdout, "Deleting interface %s ifindex %u\n", ovpn->ifname, ++ ovpn->ifindex); ++ ++ req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.i)); ++ req.n.nlmsg_flags = NLM_F_REQUEST; ++ req.n.nlmsg_type = RTM_DELLINK; ++ ++ req.i.ifi_family = AF_PACKET; ++ req.i.ifi_index = ovpn->ifindex; ++ ++ return ovpn_rt_send(&req.n, 0, 0, NULL, NULL); ++} ++ ++static int nl_seq_check(struct nl_msg (*msg)__always_unused, ++ void (*arg)__always_unused) ++{ ++ return NL_OK; ++} ++ ++struct mcast_handler_args { ++ const char *group; ++ int id; ++}; ++ ++static int mcast_family_handler(struct nl_msg *msg, void *arg) ++{ ++ struct mcast_handler_args *grp = arg; ++ struct nlattr *tb[CTRL_ATTR_MAX + 1]; ++ struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); ++ struct nlattr *mcgrp; ++ int rem_mcgrp; ++ ++ nla_parse(tb, CTRL_ATTR_MAX, genlmsg_attrdata(gnlh, 0), ++ genlmsg_attrlen(gnlh, 0), NULL); ++ ++ if (!tb[CTRL_ATTR_MCAST_GROUPS]) ++ return NL_SKIP; ++ ++ nla_for_each_nested(mcgrp, tb[CTRL_ATTR_MCAST_GROUPS], rem_mcgrp) { ++ struct nlattr *tb_mcgrp[CTRL_ATTR_MCAST_GRP_MAX + 1]; ++ ++ nla_parse(tb_mcgrp, CTRL_ATTR_MCAST_GRP_MAX, ++ nla_data(mcgrp), nla_len(mcgrp), NULL); ++ ++ if (!tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME] || ++ !tb_mcgrp[CTRL_ATTR_MCAST_GRP_ID]) ++ continue; ++ if (strncmp(nla_data(tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME]), ++ grp->group, nla_len(tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME]))) ++ continue; ++ grp->id = nla_get_u32(tb_mcgrp[CTRL_ATTR_MCAST_GRP_ID]); ++ break; ++ } ++ ++ return NL_SKIP; ++} ++ ++static int mcast_error_handler(struct sockaddr_nl (*nla)__always_unused, ++ struct nlmsgerr *err, void *arg) ++{ ++ int *ret = 
arg; ++ ++ *ret = err->error; ++ return NL_STOP; ++} ++ ++static int mcast_ack_handler(struct nl_msg (*msg)__always_unused, void *arg) ++{ ++ int *ret = arg; ++ ++ *ret = 0; ++ return NL_STOP; ++} ++ ++static int ovpn_handle_msg(struct nl_msg *msg, void *arg) ++{ ++ struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); ++ struct nlattr *attrs[OVPN_A_MAX + 1]; ++ struct nlmsghdr *nlh = nlmsg_hdr(msg); ++ //enum ovpn_del_peer_reason reason; ++ char ifname[IF_NAMESIZE]; ++ int *ret = arg; ++ __u32 ifindex; ++ ++ fprintf(stderr, "received message from ovpn-dco\n"); ++ ++ *ret = -1; ++ ++ if (!genlmsg_valid_hdr(nlh, 0)) { ++ fprintf(stderr, "invalid header\n"); ++ return NL_STOP; ++ } ++ ++ if (nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0), ++ genlmsg_attrlen(gnlh, 0), NULL)) { ++ fprintf(stderr, "received bogus data from ovpn-dco\n"); ++ return NL_STOP; ++ } ++ ++ if (!attrs[OVPN_A_IFINDEX]) { ++ fprintf(stderr, "no ifindex in this message\n"); ++ return NL_STOP; ++ } ++ ++ ifindex = nla_get_u32(attrs[OVPN_A_IFINDEX]); ++ if (!if_indextoname(ifindex, ifname)) { ++ fprintf(stderr, "cannot resolve ifname for ifindex: %u\n", ++ ifindex); ++ return NL_STOP; ++ } ++ ++ switch (gnlh->cmd) { ++ case OVPN_CMD_PEER_DEL_NTF: ++ /*if (!attrs[OVPN_A_DEL_PEER_REASON]) { ++ * fprintf(stderr, "no reason in DEL_PEER message\n"); ++ * return NL_STOP; ++ *} ++ * ++ *reason = nla_get_u8(attrs[OVPN_A_DEL_PEER_REASON]); ++ *fprintf(stderr, ++ * "received CMD_DEL_PEER, ifname: %s reason: %d\n", ++ * ifname, reason); ++ */ ++ fprintf(stdout, "received CMD_PEER_DEL_NTF\n"); ++ break; ++ case OVPN_CMD_KEY_SWAP_NTF: ++ fprintf(stdout, "received CMD_KEY_SWAP_NTF\n"); ++ break; ++ default: ++ fprintf(stderr, "received unknown command: %d\n", gnlh->cmd); ++ return NL_STOP; ++ } ++ ++ *ret = 0; ++ return NL_OK; ++} ++ ++static int ovpn_get_mcast_id(struct nl_sock *sock, const char *family, ++ const char *group) ++{ ++ struct nl_msg *msg; ++ struct nl_cb *cb; ++ int ret, ctrlid; ++ struct mcast_handler_args grp = { ++ .group = group, ++ .id = -ENOENT, ++ }; ++ ++ msg = nlmsg_alloc(); ++ if (!msg) ++ return -ENOMEM; ++ ++ cb = nl_cb_alloc(NL_CB_DEFAULT); ++ if (!cb) { ++ ret = -ENOMEM; ++ goto out_fail_cb; ++ } ++ ++ ctrlid = genl_ctrl_resolve(sock, "nlctrl"); ++ ++ genlmsg_put(msg, 0, 0, ctrlid, 0, 0, CTRL_CMD_GETFAMILY, 0); ++ ++ ret = -ENOBUFS; ++ NLA_PUT_STRING(msg, CTRL_ATTR_FAMILY_NAME, family); ++ ++ ret = nl_send_auto_complete(sock, msg); ++ if (ret < 0) ++ goto nla_put_failure; ++ ++ ret = 1; ++ ++ nl_cb_err(cb, NL_CB_CUSTOM, mcast_error_handler, &ret); ++ nl_cb_set(cb, NL_CB_ACK, NL_CB_CUSTOM, mcast_ack_handler, &ret); ++ nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, mcast_family_handler, &grp); ++ ++ while (ret > 0) ++ nl_recvmsgs(sock, cb); ++ ++ if (ret == 0) ++ ret = grp.id; ++ nla_put_failure: ++ nl_cb_put(cb); ++ out_fail_cb: ++ nlmsg_free(msg); ++ return ret; ++} ++ ++static int ovpn_listen_mcast(void) ++{ ++ struct nl_sock *sock; ++ struct nl_cb *cb; ++ int mcid, ret; ++ ++ sock = nl_socket_alloc(); ++ if (!sock) { ++ fprintf(stderr, "cannot allocate netlink socket\n"); ++ goto err_free; ++ } ++ ++ nl_socket_set_buffer_size(sock, 8192, 8192); ++ ++ ret = genl_connect(sock); ++ if (ret < 0) { ++ fprintf(stderr, "cannot connect to generic netlink: %s\n", ++ nl_geterror(ret)); ++ goto err_free; ++ } ++ ++ mcid = ovpn_get_mcast_id(sock, OVPN_FAMILY_NAME, OVPN_MCGRP_PEERS); ++ if (mcid < 0) { ++ fprintf(stderr, "cannot get mcast group: %s\n", ++ nl_geterror(mcid)); ++ goto err_free; ++ } ++ ++ ret = 
nl_socket_add_membership(sock, mcid); ++ if (ret) { ++ fprintf(stderr, "failed to join mcast group: %d\n", ret); ++ goto err_free; ++ } ++ ++ ret = 1; ++ cb = nl_cb_alloc(NL_CB_DEFAULT); ++ nl_cb_set(cb, NL_CB_SEQ_CHECK, NL_CB_CUSTOM, nl_seq_check, NULL); ++ nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, ovpn_handle_msg, &ret); ++ nl_cb_err(cb, NL_CB_CUSTOM, ovpn_nl_cb_error, &ret); ++ ++ while (ret == 1) { ++ int err = nl_recvmsgs(sock, cb); ++ ++ if (err < 0) { ++ fprintf(stderr, ++ "cannot receive netlink message: (%d) %s\n", ++ err, nl_geterror(-err)); ++ ret = -1; ++ break; ++ } ++ } ++ ++ nl_cb_put(cb); ++err_free: ++ nl_socket_free(sock); ++ return ret; ++} ++ ++static void usage(const char *cmd) ++{ ++ fprintf(stderr, ++ "Usage %s [arguments..]\n", ++ cmd); ++ fprintf(stderr, "where can be one of the following\n\n"); ++ ++ fprintf(stderr, "* new_iface [mode]: create new ovpn interface\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, "\tmode:\n"); ++ fprintf(stderr, "\t\t- P2P for peer-to-peer mode (i.e. client)\n"); ++ fprintf(stderr, "\t\t- MP for multi-peer mode (i.e. server)\n"); ++ ++ fprintf(stderr, "* del_iface : delete ovpn interface\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ ++ fprintf(stderr, ++ "* listen [ipv6]: listen for incoming peer TCP connections\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, "\tlport: TCP port to listen to\n"); ++ fprintf(stderr, ++ "\tpeers_file: file containing one peer per line: Line format:\n"); ++ fprintf(stderr, "\t\t \n"); ++ fprintf(stderr, ++ "\tipv6: whether the socket should listen to the IPv6 wildcard address\n"); ++ ++ fprintf(stderr, ++ "* connect [key_file]: start connecting peer of TCP-based VPN session\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, "\tpeer_id: peer ID of the connecting peer\n"); ++ fprintf(stderr, "\traddr: peer IP address to connect to\n"); ++ fprintf(stderr, "\trport: peer TCP port to connect to\n"); ++ fprintf(stderr, ++ "\tkey_file: file containing the symmetric key for encryption\n"); ++ ++ fprintf(stderr, ++ "* new_peer [vpnaddr]: add new peer\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, "\tlport: local UDP port to bind to\n"); ++ fprintf(stderr, ++ "\tpeer_id: peer ID to be used in data packets to/from this peer\n"); ++ fprintf(stderr, "\traddr: peer IP address\n"); ++ fprintf(stderr, "\trport: peer UDP port\n"); ++ fprintf(stderr, "\tvpnaddr: peer VPN IP\n"); ++ ++ fprintf(stderr, ++ "* new_multi_peer : add multiple peers as listed in the file\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, "\tlport: local UDP port to bind to\n"); ++ fprintf(stderr, ++ "\tpeers_file: text file containing one peer per line. 
Line format:\n"); ++ fprintf(stderr, "\t\t \n"); ++ ++ fprintf(stderr, ++ "* set_peer : set peer attributes\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n"); ++ fprintf(stderr, ++ "\tkeepalive_interval: interval for sending ping messages\n"); ++ fprintf(stderr, ++ "\tkeepalive_timeout: time after which a peer is timed out\n"); ++ ++ fprintf(stderr, "* del_peer : delete peer\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, "\tpeer_id: peer ID of the peer to delete\n"); ++ ++ fprintf(stderr, "* get_peer [peer_id]: retrieve peer(s) status\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, ++ "\tpeer_id: peer ID of the peer to query. All peers are returned if omitted\n"); ++ ++ fprintf(stderr, ++ "* new_key : set data channel key\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, ++ "\tpeer_id: peer ID of the peer to configure the key for\n"); ++ fprintf(stderr, "\tslot: either 1 (primary) or 2 (secondary)\n"); ++ fprintf(stderr, "\tkey_id: an ID from 0 to 7\n"); ++ fprintf(stderr, ++ "\tcipher: cipher to use, supported: aes (AES-GCM), chachapoly (CHACHA20POLY1305)\n"); ++ fprintf(stderr, ++ "\tkey_dir: key direction, must 0 on one host and 1 on the other\n"); ++ fprintf(stderr, "\tkey_file: file containing the pre-shared key\n"); ++ ++ fprintf(stderr, ++ "* del_key [slot]: erase existing data channel key\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n"); ++ fprintf(stderr, "\tslot: slot to erase. PRIMARY if omitted\n"); ++ ++ fprintf(stderr, ++ "* get_key : retrieve non sensible key data\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, "\tpeer_id: peer ID of the peer to query\n"); ++ fprintf(stderr, "\tslot: either 1 (primary) or 2 (secondary)\n"); ++ ++ fprintf(stderr, ++ "* swap_keys : swap content of primary and secondary key slots\n"); ++ fprintf(stderr, "\tiface: ovpn interface name\n"); ++ fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n"); ++ ++ fprintf(stderr, ++ "* listen_mcast: listen to ovpn netlink multicast messages\n"); ++} ++ ++static int ovpn_parse_remote(struct ovpn_ctx *ovpn, const char *host, ++ const char *service, const char *vpnip) ++{ ++ int ret; ++ struct addrinfo *result; ++ struct addrinfo hints = { ++ .ai_family = ovpn->sa_family, ++ .ai_socktype = SOCK_DGRAM, ++ .ai_protocol = IPPROTO_UDP ++ }; ++ ++ if (host) { ++ ret = getaddrinfo(host, service, &hints, &result); ++ if (ret == EAI_NONAME || ret == EAI_FAIL) ++ return -1; ++ ++ if (!(result->ai_family == AF_INET && ++ result->ai_addrlen == sizeof(struct sockaddr_in)) && ++ !(result->ai_family == AF_INET6 && ++ result->ai_addrlen == sizeof(struct sockaddr_in6))) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ memcpy(&ovpn->remote, result->ai_addr, result->ai_addrlen); ++ } ++ ++ if (vpnip) { ++ ret = getaddrinfo(vpnip, NULL, &hints, &result); ++ if (ret == EAI_NONAME || ret == EAI_FAIL) ++ return -1; ++ ++ if (!(result->ai_family == AF_INET && ++ result->ai_addrlen == sizeof(struct sockaddr_in)) && ++ !(result->ai_family == AF_INET6 && ++ result->ai_addrlen == sizeof(struct sockaddr_in6))) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ memcpy(&ovpn->peer_ip, result->ai_addr, result->ai_addrlen); ++ ovpn->sa_family = result->ai_family; ++ ++ ovpn->peer_ip_set = true; ++ } ++ ++ ret = 0; ++out: ++ freeaddrinfo(result); ++ return ret; ++} ++ ++static int 
ovpn_parse_new_peer(struct ovpn_ctx *ovpn, const char *peer_id, ++ const char *raddr, const char *rport, ++ const char *vpnip) ++{ ++ ovpn->peer_id = strtoul(peer_id, NULL, 10); ++ if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { ++ fprintf(stderr, "peer ID value out of range\n"); ++ return -1; ++ } ++ ++ return ovpn_parse_remote(ovpn, raddr, rport, vpnip); ++} ++ ++static int ovpn_parse_key_slot(const char *arg, struct ovpn_ctx *ovpn) ++{ ++ int slot = strtoul(arg, NULL, 10); ++ ++ if (errno == ERANGE || slot < 1 || slot > 2) { ++ fprintf(stderr, "key slot out of range\n"); ++ return -1; ++ } ++ ++ switch (slot) { ++ case 1: ++ ovpn->key_slot = OVPN_KEY_SLOT_PRIMARY; ++ break; ++ case 2: ++ ovpn->key_slot = OVPN_KEY_SLOT_SECONDARY; ++ break; ++ } ++ ++ return 0; ++} ++ ++static int ovpn_send_tcp_data(int socket) ++{ ++ uint16_t len = htons(1000); ++ uint8_t buf[1002]; ++ int ret; ++ ++ memcpy(buf, &len, sizeof(len)); ++ memset(buf + sizeof(len), 0x86, sizeof(buf) - sizeof(len)); ++ ++ ret = send(socket, buf, sizeof(buf), 0); ++ ++ fprintf(stdout, "Sent %u bytes over TCP socket\n", ret); ++ ++ return ret > 0 ? 0 : ret; ++} ++ ++static int ovpn_recv_tcp_data(int socket) ++{ ++ uint8_t buf[1002]; ++ uint16_t len; ++ int ret; ++ ++ ret = recv(socket, buf, sizeof(buf), 0); ++ ++ if (ret < 2) { ++ fprintf(stderr, ">>>> Error while reading TCP data: %d\n", ret); ++ return ret; ++ } ++ ++ memcpy(&len, buf, sizeof(len)); ++ len = ntohs(len); ++ ++ fprintf(stdout, ">>>> Received %u bytes over TCP socket, header: %u\n", ++ ret, len); ++ ++/* int i; ++ * for (i = 2; i < ret; i++) { ++ * fprintf(stdout, "0x%.2x ", buf[i]); ++ * if (i && !((i - 2) % 16)) ++ * fprintf(stdout, "\n"); ++ * } ++ * fprintf(stdout, "\n"); ++ */ ++ return 0; ++} ++ ++static enum ovpn_cmd ovpn_parse_cmd(const char *cmd) ++{ ++ if (!strcmp(cmd, "new_iface")) ++ return CMD_NEW_IFACE; ++ ++ if (!strcmp(cmd, "del_iface")) ++ return CMD_DEL_IFACE; ++ ++ if (!strcmp(cmd, "listen")) ++ return CMD_LISTEN; ++ ++ if (!strcmp(cmd, "connect")) ++ return CMD_CONNECT; ++ ++ if (!strcmp(cmd, "new_peer")) ++ return CMD_NEW_PEER; ++ ++ if (!strcmp(cmd, "new_multi_peer")) ++ return CMD_NEW_MULTI_PEER; ++ ++ if (!strcmp(cmd, "set_peer")) ++ return CMD_SET_PEER; ++ ++ if (!strcmp(cmd, "del_peer")) ++ return CMD_DEL_PEER; ++ ++ if (!strcmp(cmd, "get_peer")) ++ return CMD_GET_PEER; ++ ++ if (!strcmp(cmd, "new_key")) ++ return CMD_NEW_KEY; ++ ++ if (!strcmp(cmd, "del_key")) ++ return CMD_DEL_KEY; ++ ++ if (!strcmp(cmd, "get_key")) ++ return CMD_GET_KEY; ++ ++ if (!strcmp(cmd, "swap_keys")) ++ return CMD_SWAP_KEYS; ++ ++ if (!strcmp(cmd, "listen_mcast")) ++ return CMD_LISTEN_MCAST; ++ ++ return CMD_INVALID; ++} ++ ++static int ovpn_run_cmd(struct ovpn_ctx *ovpn) ++{ ++ char peer_id[10], vpnip[INET6_ADDRSTRLEN], raddr[128], rport[10]; ++ int n, ret; ++ FILE *fp; ++ ++ switch (ovpn->cmd) { ++ case CMD_NEW_IFACE: ++ ret = ovpn_new_iface(ovpn); ++ break; ++ case CMD_DEL_IFACE: ++ ret = ovpn_del_iface(ovpn); ++ break; ++ case CMD_LISTEN: ++ ret = ovpn_listen(ovpn, ovpn->sa_family); ++ if (ret < 0) { ++ fprintf(stderr, "cannot listen on TCP socket\n"); ++ return ret; ++ } ++ ++ fp = fopen(ovpn->peers_file, "r"); ++ if (!fp) { ++ fprintf(stderr, "cannot open file: %s\n", ++ ovpn->peers_file); ++ return -1; ++ } ++ ++ while ((n = fscanf(fp, "%s %s\n", peer_id, vpnip)) == 2) { ++ struct ovpn_ctx peer_ctx = { 0 }; ++ ++ peer_ctx.ifindex = ovpn->ifindex; ++ peer_ctx.sa_family = ovpn->sa_family; ++ ++ peer_ctx.socket = ovpn_accept(ovpn); ++ if 
(peer_ctx.socket < 0) { ++ fprintf(stderr, "cannot accept connection!\n"); ++ return -1; ++ } ++ ++ /* store the socket of the first peer to test TCP I/O */ ++ if (ovpn->cli_socket < 0) ++ ovpn->cli_socket = peer_ctx.socket; ++ ++ ret = ovpn_parse_new_peer(&peer_ctx, peer_id, NULL, ++ NULL, vpnip); ++ if (ret < 0) { ++ fprintf(stderr, "error while parsing line\n"); ++ return -1; ++ } ++ ++ ret = ovpn_new_peer(&peer_ctx, true); ++ if (ret < 0) { ++ fprintf(stderr, ++ "cannot add peer to VPN: %s %s\n", ++ peer_id, vpnip); ++ return ret; ++ } ++ } ++ ++ if (ovpn->cli_socket >= 0) ++ ret = ovpn_recv_tcp_data(ovpn->cli_socket); ++ ++ break; ++ case CMD_CONNECT: ++ ret = ovpn_connect(ovpn); ++ if (ret < 0) { ++ fprintf(stderr, "cannot connect TCP socket\n"); ++ return ret; ++ } ++ ++ ret = ovpn_new_peer(ovpn, true); ++ if (ret < 0) { ++ fprintf(stderr, "cannot add peer to VPN\n"); ++ close(ovpn->socket); ++ return ret; ++ } ++ ++ if (ovpn->cipher != OVPN_CIPHER_ALG_NONE) { ++ ret = ovpn_new_key(ovpn); ++ if (ret < 0) { ++ fprintf(stderr, "cannot set key\n"); ++ return ret; ++ } ++ } ++ ++ ret = ovpn_send_tcp_data(ovpn->socket); ++ break; ++ case CMD_NEW_PEER: ++ ret = ovpn_udp_socket(ovpn, AF_INET6); //ovpn->sa_family ? ++ if (ret < 0) ++ return ret; ++ ++ ret = ovpn_new_peer(ovpn, false); ++ break; ++ case CMD_NEW_MULTI_PEER: ++ ret = ovpn_udp_socket(ovpn, AF_INET6); ++ if (ret < 0) ++ return ret; ++ ++ fp = fopen(ovpn->peers_file, "r"); ++ if (!fp) { ++ fprintf(stderr, "cannot open file: %s\n", ++ ovpn->peers_file); ++ return -1; ++ } ++ ++ while ((n = fscanf(fp, "%s %s %s %s\n", peer_id, raddr, rport, ++ vpnip)) == 4) { ++ struct ovpn_ctx peer_ctx = { 0 }; ++ ++ peer_ctx.ifindex = ovpn->ifindex; ++ peer_ctx.socket = ovpn->socket; ++ peer_ctx.sa_family = AF_UNSPEC; ++ ++ ret = ovpn_parse_new_peer(&peer_ctx, peer_id, raddr, ++ rport, vpnip); ++ if (ret < 0) { ++ fprintf(stderr, "error while parsing line\n"); ++ return -1; ++ } ++ ++ ret = ovpn_new_peer(&peer_ctx, false); ++ if (ret < 0) { ++ fprintf(stderr, ++ "cannot add peer to VPN: %s %s %s %s\n", ++ peer_id, raddr, rport, vpnip); ++ return ret; ++ } ++ } ++ break; ++ case CMD_SET_PEER: ++ ret = ovpn_set_peer(ovpn); ++ break; ++ case CMD_DEL_PEER: ++ ret = ovpn_del_peer(ovpn); ++ break; ++ case CMD_GET_PEER: ++ if (ovpn->peer_id == PEER_ID_UNDEF) ++ fprintf(stderr, "List of peers connected to: %s\n", ++ ovpn->ifname); ++ ++ ret = ovpn_get_peer(ovpn); ++ break; ++ case CMD_NEW_KEY: ++ ret = ovpn_new_key(ovpn); ++ break; ++ case CMD_DEL_KEY: ++ ret = ovpn_del_key(ovpn); ++ break; ++ case CMD_GET_KEY: ++ ret = ovpn_get_key(ovpn); ++ break; ++ case CMD_SWAP_KEYS: ++ ret = ovpn_swap_keys(ovpn); ++ break; ++ case CMD_LISTEN_MCAST: ++ ret = ovpn_listen_mcast(); ++ break; ++ case CMD_INVALID: ++ break; ++ } ++ ++ return ret; ++} ++ ++static int ovpn_parse_cmd_args(struct ovpn_ctx *ovpn, int argc, char *argv[]) ++{ ++ int ret; ++ ++ /* no args required for LISTEN_MCAST */ ++ if (ovpn->cmd == CMD_LISTEN_MCAST) ++ return 0; ++ ++ /* all commands need an ifname */ ++ if (argc < 3) ++ return -EINVAL; ++ ++ strscpy(ovpn->ifname, argv[2], IFNAMSIZ - 1); ++ ovpn->ifname[IFNAMSIZ - 1] = '\0'; ++ ++ /* all commands, except NEW_IFNAME, needs an ifindex */ ++ if (ovpn->cmd != CMD_NEW_IFACE) { ++ ovpn->ifindex = if_nametoindex(ovpn->ifname); ++ if (!ovpn->ifindex) { ++ fprintf(stderr, "cannot find interface: %s\n", ++ strerror(errno)); ++ return -1; ++ } ++ } ++ ++ switch (ovpn->cmd) { ++ case CMD_NEW_IFACE: ++ if (argc < 4) ++ break; ++ ++ if (!strcmp(argv[3], 
"P2P")) { ++ ovpn->mode = OVPN_MODE_P2P; ++ } else if (!strcmp(argv[3], "MP")) { ++ ovpn->mode = OVPN_MODE_MP; ++ } else { ++ fprintf(stderr, "Cannot parse iface mode: %s\n", ++ argv[3]); ++ return -1; ++ } ++ ovpn->mode_set = true; ++ break; ++ case CMD_DEL_IFACE: ++ break; ++ case CMD_LISTEN: ++ if (argc < 5) ++ return -EINVAL; ++ ++ ovpn->lport = strtoul(argv[3], NULL, 10); ++ if (errno == ERANGE || ovpn->lport > 65535) { ++ fprintf(stderr, "lport value out of range\n"); ++ return -1; ++ } ++ ++ ovpn->peers_file = argv[4]; ++ ++ if (argc > 5 && !strcmp(argv[5], "ipv6")) ++ ovpn->sa_family = AF_INET6; ++ break; ++ case CMD_CONNECT: ++ if (argc < 6) ++ return -EINVAL; ++ ++ ovpn->sa_family = AF_INET; ++ ++ ret = ovpn_parse_new_peer(ovpn, argv[3], argv[4], argv[5], ++ NULL); ++ if (ret < 0) { ++ fprintf(stderr, "Cannot parse remote peer data\n"); ++ return -1; ++ } ++ ++ if (argc > 6) { ++ ovpn->key_slot = OVPN_KEY_SLOT_PRIMARY; ++ ovpn->key_id = 0; ++ ovpn->cipher = OVPN_CIPHER_ALG_AES_GCM; ++ ovpn->key_dir = KEY_DIR_OUT; ++ ++ ret = ovpn_parse_key(argv[6], ovpn); ++ if (ret) ++ return -1; ++ } ++ break; ++ case CMD_NEW_PEER: ++ if (argc < 7) ++ return -EINVAL; ++ ++ ovpn->lport = strtoul(argv[4], NULL, 10); ++ if (errno == ERANGE || ovpn->lport > 65535) { ++ fprintf(stderr, "lport value out of range\n"); ++ return -1; ++ } ++ ++ const char *vpnip = (argc > 7) ? argv[7] : NULL; ++ ++ ret = ovpn_parse_new_peer(ovpn, argv[3], argv[5], argv[6], ++ vpnip); ++ if (ret < 0) ++ return -1; ++ break; ++ case CMD_NEW_MULTI_PEER: ++ if (argc < 5) ++ return -EINVAL; ++ ++ ovpn->lport = strtoul(argv[3], NULL, 10); ++ if (errno == ERANGE || ovpn->lport > 65535) { ++ fprintf(stderr, "lport value out of range\n"); ++ return -1; ++ } ++ ++ ovpn->peers_file = argv[4]; ++ break; ++ case CMD_SET_PEER: ++ if (argc < 6) ++ return -EINVAL; ++ ++ ovpn->peer_id = strtoul(argv[3], NULL, 10); ++ if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { ++ fprintf(stderr, "peer ID value out of range\n"); ++ return -1; ++ } ++ ++ ovpn->keepalive_interval = strtoul(argv[4], NULL, 10); ++ if (errno == ERANGE) { ++ fprintf(stderr, ++ "keepalive interval value out of range\n"); ++ return -1; ++ } ++ ++ ovpn->keepalive_timeout = strtoul(argv[5], NULL, 10); ++ if (errno == ERANGE) { ++ fprintf(stderr, ++ "keepalive interval value out of range\n"); ++ return -1; ++ } ++ break; ++ case CMD_DEL_PEER: ++ if (argc < 4) ++ return -EINVAL; ++ ++ ovpn->peer_id = strtoul(argv[3], NULL, 10); ++ if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { ++ fprintf(stderr, "peer ID value out of range\n"); ++ return -1; ++ } ++ break; ++ case CMD_GET_PEER: ++ ovpn->peer_id = PEER_ID_UNDEF; ++ if (argc > 3) { ++ ovpn->peer_id = strtoul(argv[3], NULL, 10); ++ if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { ++ fprintf(stderr, "peer ID value out of range\n"); ++ return -1; ++ } ++ } ++ break; ++ case CMD_NEW_KEY: ++ if (argc < 9) ++ return -EINVAL; ++ ++ ovpn->peer_id = strtoul(argv[3], NULL, 10); ++ if (errno == ERANGE) { ++ fprintf(stderr, "peer ID value out of range\n"); ++ return -1; ++ } ++ ++ ret = ovpn_parse_key_slot(argv[4], ovpn); ++ if (ret) ++ return -1; ++ ++ ovpn->key_id = strtoul(argv[5], NULL, 10); ++ if (errno == ERANGE || ovpn->key_id > 2) { ++ fprintf(stderr, "key ID out of range\n"); ++ return -1; ++ } ++ ++ ret = ovpn_parse_cipher(argv[6], ovpn); ++ if (ret < 0) ++ return -1; ++ ++ ret = ovpn_parse_key_direction(argv[7], ovpn); ++ if (ret < 0) ++ return -1; ++ ++ ret = ovpn_parse_key(argv[8], ovpn); ++ if (ret) ++ 
return -1; ++ break; ++ case CMD_DEL_KEY: ++ if (argc < 4) ++ return -EINVAL; ++ ++ ovpn->peer_id = strtoul(argv[3], NULL, 10); ++ if (errno == ERANGE) { ++ fprintf(stderr, "peer ID value out of range\n"); ++ return -1; ++ } ++ ++ ret = ovpn_parse_key_slot(argv[4], ovpn); ++ if (ret) ++ return ret; ++ break; ++ case CMD_GET_KEY: ++ if (argc < 5) ++ return -EINVAL; ++ ++ ovpn->peer_id = strtoul(argv[3], NULL, 10); ++ if (errno == ERANGE) { ++ fprintf(stderr, "peer ID value out of range\n"); ++ return -1; ++ } ++ ++ ret = ovpn_parse_key_slot(argv[4], ovpn); ++ if (ret) ++ return ret; ++ break; ++ case CMD_SWAP_KEYS: ++ if (argc < 4) ++ return -EINVAL; ++ ++ ovpn->peer_id = strtoul(argv[3], NULL, 10); ++ if (errno == ERANGE) { ++ fprintf(stderr, "peer ID value out of range\n"); ++ return -1; ++ } ++ break; ++ case CMD_LISTEN_MCAST: ++ break; ++ case CMD_INVALID: ++ break; ++ } ++ ++ return 0; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ struct ovpn_ctx ovpn; ++ int ret; ++ ++ if (argc < 2) { ++ usage(argv[0]); ++ return -1; ++ } ++ ++ memset(&ovpn, 0, sizeof(ovpn)); ++ ovpn.sa_family = AF_INET; ++ ovpn.cipher = OVPN_CIPHER_ALG_NONE; ++ ovpn.cli_socket = -1; ++ ++ ovpn.cmd = ovpn_parse_cmd(argv[1]); ++ if (ovpn.cmd == CMD_INVALID) { ++ fprintf(stderr, "Error: unknown command.\n\n"); ++ usage(argv[0]); ++ return -1; ++ } ++ ++ ret = ovpn_parse_cmd_args(&ovpn, argc, argv); ++ if (ret < 0) { ++ fprintf(stderr, "Error: invalid arguments.\n\n"); ++ if (ret == -EINVAL) ++ usage(argv[0]); ++ return ret; ++ } ++ ++ ret = ovpn_run_cmd(&ovpn); ++ if (ret) ++ fprintf(stderr, "Cannot execute command: %s (%d)\n", ++ strerror(-ret), ret); ++ ++ return ret; ++} +diff --git a/tools/testing/selftests/net/ovpn/tcp_peers.txt b/tools/testing/selftests/net/ovpn/tcp_peers.txt +new file mode 100644 +index 000000000000..d753eebe8716 +--- /dev/null ++++ b/tools/testing/selftests/net/ovpn/tcp_peers.txt +@@ -0,0 +1,5 @@ ++1 5.5.5.2 ++2 5.5.5.3 ++3 5.5.5.4 ++4 5.5.5.5 ++5 5.5.5.6 +diff --git a/tools/testing/selftests/net/ovpn/test-chachapoly.sh b/tools/testing/selftests/net/ovpn/test-chachapoly.sh +new file mode 100755 +index 000000000000..79788f10d33b +--- /dev/null ++++ b/tools/testing/selftests/net/ovpn/test-chachapoly.sh +@@ -0,0 +1,9 @@ ++#!/bin/bash ++# SPDX-License-Identifier: GPL-2.0 ++# Copyright (C) 2024 OpenVPN, Inc. ++# ++# Author: Antonio Quartulli ++ ++ALG="chachapoly" ++ ++source test.sh +diff --git a/tools/testing/selftests/net/ovpn/test-float.sh b/tools/testing/selftests/net/ovpn/test-float.sh +new file mode 100755 +index 000000000000..93e1b729861d +--- /dev/null ++++ b/tools/testing/selftests/net/ovpn/test-float.sh +@@ -0,0 +1,9 @@ ++#!/bin/bash ++# SPDX-License-Identifier: GPL-2.0 ++# Copyright (C) 2024 OpenVPN, Inc. ++# ++# Author: Antonio Quartulli ++ ++FLOAT="1" ++ ++source test.sh +diff --git a/tools/testing/selftests/net/ovpn/test-tcp.sh b/tools/testing/selftests/net/ovpn/test-tcp.sh +new file mode 100755 +index 000000000000..7542f595cc56 +--- /dev/null ++++ b/tools/testing/selftests/net/ovpn/test-tcp.sh +@@ -0,0 +1,9 @@ ++#!/bin/bash ++# SPDX-License-Identifier: GPL-2.0 ++# Copyright (C) 2024 OpenVPN, Inc. 
++# ++# Author: Antonio Quartulli ++ ++PROTO="TCP" ++ ++source test.sh +diff --git a/tools/testing/selftests/net/ovpn/test.sh b/tools/testing/selftests/net/ovpn/test.sh +new file mode 100755 +index 000000000000..07f3a82df8f3 +--- /dev/null ++++ b/tools/testing/selftests/net/ovpn/test.sh +@@ -0,0 +1,183 @@ ++#!/bin/bash ++# SPDX-License-Identifier: GPL-2.0 ++# Copyright (C) 2020-2024 OpenVPN, Inc. ++# ++# Author: Antonio Quartulli ++ ++#set -x ++set -e ++ ++UDP_PEERS_FILE=${UDP_PEERS_FILE:-udp_peers.txt} ++TCP_PEERS_FILE=${TCP_PEERS_FILE:-tcp_peers.txt} ++OVPN_CLI=${OVPN_CLI:-./ovpn-cli} ++ALG=${ALG:-aes} ++PROTO=${PROTO:-UDP} ++FLOAT=${FLOAT:-0} ++ ++create_ns() { ++ ip netns add peer${1} ++} ++ ++setup_ns() { ++ MODE="P2P" ++ ++ if [ ${1} -eq 0 ]; then ++ MODE="MP" ++ for p in $(seq 1 ${NUM_PEERS}); do ++ ip link add veth${p} netns peer0 type veth peer name veth${p} netns peer${p} ++ ++ ip -n peer0 addr add 10.10.${p}.1/24 dev veth${p} ++ ip -n peer0 link set veth${p} up ++ ++ ip -n peer${p} addr add 10.10.${p}.2/24 dev veth${p} ++ ip -n peer${p} link set veth${p} up ++ done ++ fi ++ ++ ip netns exec peer${1} ${OVPN_CLI} new_iface tun${1} $MODE ++ ip -n peer${1} addr add ${2} dev tun${1} ++ ip -n peer${1} link set tun${1} up ++} ++ ++add_peer() { ++ if [ "${PROTO}" == "UDP" ]; then ++ if [ ${1} -eq 0 ]; then ++ ip netns exec peer0 ${OVPN_CLI} new_multi_peer tun0 1 ${UDP_PEERS_FILE} ++ ++ for p in $(seq 1 ${NUM_PEERS}); do ++ ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 1 0 ${ALG} 0 \ ++ data64.key ++ done ++ else ++ ip netns exec peer${1} ${OVPN_CLI} new_peer tun${1} ${1} 1 10.10.${1}.1 1 ++ ip netns exec peer${1} ${OVPN_CLI} new_key tun${1} ${1} 1 0 ${ALG} 1 \ ++ data64.key ++ fi ++ else ++ if [ ${1} -eq 0 ]; then ++ (ip netns exec peer0 ${OVPN_CLI} listen tun0 1 ${TCP_PEERS_FILE} && { ++ for p in $(seq 1 ${NUM_PEERS}); do ++ ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 1 0 \ ++ ${ALG} 0 data64.key ++ done ++ }) & ++ sleep 5 ++ else ++ ip netns exec peer${1} ${OVPN_CLI} connect tun${1} ${1} 10.10.${1}.1 1 \ ++ data64.key ++ fi ++ fi ++} ++ ++cleanup() { ++ for p in $(seq 1 10); do ++ ip -n peer0 link del veth${p} 2>/dev/null || true ++ done ++ for p in $(seq 0 10); do ++ ip netns exec peer${p} ${OVPN_CLI} del_iface tun${p} 2>/dev/null || true ++ ip netns del peer${p} 2>/dev/null || true ++ done ++} ++ ++if [ "${PROTO}" == "UDP" ]; then ++ NUM_PEERS=${NUM_PEERS:-$(wc -l ${UDP_PEERS_FILE} | awk '{print $1}')} ++else ++ NUM_PEERS=${NUM_PEERS:-$(wc -l ${TCP_PEERS_FILE} | awk '{print $1}')} ++fi ++ ++cleanup ++ ++modprobe -q ovpn || true ++ ++for p in $(seq 0 ${NUM_PEERS}); do ++ create_ns ${p} ++done ++ ++for p in $(seq 0 ${NUM_PEERS}); do ++ setup_ns ${p} 5.5.5.$((${p} + 1))/24 ++done ++ ++for p in $(seq 0 ${NUM_PEERS}); do ++ add_peer ${p} ++done ++ ++for p in $(seq 1 ${NUM_PEERS}); do ++ ip netns exec peer0 ${OVPN_CLI} set_peer tun0 ${p} 60 120 ++ ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 60 120 ++done ++ ++for p in $(seq 1 ${NUM_PEERS}); do ++ ip netns exec peer0 ping -qfc 1000 -w 5 5.5.5.$((${p} + 1)) ++done ++ ++if [ "$FLOAT" == "1" ]; then ++ # make clients float.. 
++ for p in $(seq 1 ${NUM_PEERS}); do ++ ip -n peer${p} addr del 10.10.${p}.2/24 dev veth${p} ++ ip -n peer${p} addr add 10.10.${p}.3/24 dev veth${p} ++ done ++ for p in $(seq 1 ${NUM_PEERS}); do ++ ip netns exec peer${p} ping -qfc 1000 -w 5 5.5.5.1 ++ done ++fi ++ ++ip netns exec peer0 iperf3 -1 -s & ++sleep 1 ++ip netns exec peer1 iperf3 -Z -t 3 -c 5.5.5.1 ++ ++echo "Adding secondary key and then swap:" ++for p in $(seq 1 ${NUM_PEERS}); do ++ ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 2 1 ${ALG} 0 data64.key ++ ip netns exec peer${p} ${OVPN_CLI} new_key tun${p} ${p} 2 1 ${ALG} 1 data64.key ++ ip netns exec peer${p} ${OVPN_CLI} swap_keys tun${p} ${p} ++done ++ ++sleep 1 ++echo "Querying all peers:" ++ip netns exec peer0 ${OVPN_CLI} get_peer tun0 ++ip netns exec peer1 ${OVPN_CLI} get_peer tun1 ++ ++echo "Querying peer 1:" ++ip netns exec peer0 ${OVPN_CLI} get_peer tun0 1 ++ ++echo "Querying non-existent peer 10:" ++ip netns exec peer0 ${OVPN_CLI} get_peer tun0 10 || true ++ ++echo "Deleting peer 1:" ++ip netns exec peer0 ${OVPN_CLI} del_peer tun0 1 ++ip netns exec peer1 ${OVPN_CLI} del_peer tun1 1 ++ ++echo "Querying keys:" ++for p in $(seq 2 ${NUM_PEERS}); do ++ ip netns exec peer${p} ${OVPN_CLI} get_key tun${p} ${p} 1 ++ ip netns exec peer${p} ${OVPN_CLI} get_key tun${p} ${p} 2 ++done ++ ++echo "Deleting keys:" ++for p in $(seq 2 ${NUM_PEERS}); do ++ ip netns exec peer${p} ${OVPN_CLI} del_key tun${p} ${p} 1 ++ ip netns exec peer${p} ${OVPN_CLI} del_key tun${p} ${p} 2 ++done ++ ++echo "Setting timeout to 10s MP:" ++# bring ifaces down to prevent traffic being sent ++for p in $(seq 0 ${NUM_PEERS}); do ++ ip -n peer${p} link set tun${p} down ++done ++# set short timeout ++for p in $(seq 2 ${NUM_PEERS}); do ++ ip netns exec peer0 ${OVPN_CLI} set_peer tun0 ${p} 10 10 || true ++ ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 0 0 ++done ++# wait for peers to timeout ++sleep 15 ++ ++echo "Setting timeout to 10s P2P:" ++for p in $(seq 2 ${NUM_PEERS}); do ++ ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 10 10 ++done ++sleep 15 ++ ++cleanup ++ ++modprobe -r ovpn || true +diff --git a/tools/testing/selftests/net/ovpn/udp_peers.txt b/tools/testing/selftests/net/ovpn/udp_peers.txt +new file mode 100644 +index 000000000000..32f14bd9347a +--- /dev/null ++++ b/tools/testing/selftests/net/ovpn/udp_peers.txt +@@ -0,0 +1,5 @@ ++1 10.10.1.2 1 5.5.5.2 ++2 10.10.2.2 1 5.5.5.3 ++3 10.10.3.2 1 5.5.5.4 ++4 10.10.4.2 1 5.5.5.5 ++5 10.10.5.2 1 5.5.5.6 +-- +2.47.0 + +From f3a2622d161f49a41f8a5cd4861c1d533f338c3b Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 15:05:02 +0100 +Subject: [PATCH 11/13] perf-per-core + +Signed-off-by: Peter Jung --- Documentation/arch/x86/topology.rst | 4 + - arch/x86/events/rapl.c | 418 ++++++++++++++++++-------- + arch/x86/events/rapl.c | 412 ++++++++++++++++++-------- arch/x86/include/asm/processor.h | 1 + arch/x86/include/asm/topology.h | 1 + arch/x86/kernel/cpu/debugfs.c | 1 + arch/x86/kernel/cpu/topology_common.c | 1 + - 6 files changed, 305 insertions(+), 121 deletions(-) + 6 files changed, 292 insertions(+), 128 deletions(-) diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst index 7352ab89a55a..c12837e61bda 100644 @@ -17340,7 +26785,7 @@ index 7352ab89a55a..c12837e61bda 100644 System topology examples diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c -index b985ca79cf97..8206038a01ac 100644 +index a481a939862e..b91d194ba51b 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c 
@@ -39,6 +39,10 @@ @@ -17395,9 +26840,9 @@ index b985ca79cf97..8206038a01ac 100644 /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved -@@ -103,6 +118,10 @@ static struct perf_pmu_events_attr event_attr_##v = { \ - .event_str = str, \ - }; +@@ -116,6 +131,10 @@ static struct perf_pmu_events_attr event_attr_##v = { \ + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) +#define rapl_pmu_is_pkg_scope() \ + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ @@ -17406,7 +26851,7 @@ index b985ca79cf97..8206038a01ac 100644 struct rapl_pmu { raw_spinlock_t lock; int n_active; -@@ -115,8 +134,9 @@ struct rapl_pmu { +@@ -128,8 +147,9 @@ struct rapl_pmu { struct rapl_pmus { struct pmu pmu; @@ -17417,7 +26862,7 @@ index b985ca79cf97..8206038a01ac 100644 }; enum rapl_unit_quirk { -@@ -126,29 +146,45 @@ enum rapl_unit_quirk { +@@ -139,19 +159,22 @@ enum rapl_unit_quirk { }; struct rapl_model { @@ -17444,25 +26889,10 @@ index b985ca79cf97..8206038a01ac 100644 static u64 rapl_timer_ms; -static struct perf_msr *rapl_msrs; +static struct rapl_model *rapl_model; -+ -+static inline unsigned int get_rapl_pmu_idx(int cpu) -+{ -+ return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : -+ topology_logical_die_id(cpu); -+} -+ -+static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu) -+{ -+ return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) : -+ topology_die_cpumask(cpu); -+} - static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) - { -- unsigned int rapl_pmu_idx = topology_logical_die_id(cpu); -+ unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); - - /* + /* + * Helper functions to get the correct topology macros according to the +@@ -177,7 +200,8 @@ static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. 
*/ @@ -17472,7 +26902,7 @@ index b985ca79cf97..8206038a01ac 100644 } static inline u64 rapl_read_counter(struct perf_event *event) -@@ -160,7 +196,7 @@ static inline u64 rapl_read_counter(struct perf_event *event) +@@ -189,7 +213,7 @@ static inline u64 rapl_read_counter(struct perf_event *event) static inline u64 rapl_scale(u64 v, int cfg) { @@ -17481,7 +26911,7 @@ index b985ca79cf97..8206038a01ac 100644 pr_warn("Invalid domain %d, failed to scale data\n", cfg); return v; } -@@ -212,34 +248,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) +@@ -241,34 +265,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) { @@ -17525,7 +26955,7 @@ index b985ca79cf97..8206038a01ac 100644 struct perf_event *event) { if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) -@@ -247,39 +283,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, +@@ -276,39 +300,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, event->hw.state = 0; @@ -17579,7 +27009,7 @@ index b985ca79cf97..8206038a01ac 100644 list_del(&event->active_entry); -@@ -297,23 +333,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) +@@ -326,23 +350,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) hwc->state |= PERF_HES_UPTODATE; } @@ -17608,7 +27038,7 @@ index b985ca79cf97..8206038a01ac 100644 return 0; } -@@ -327,10 +363,14 @@ static int rapl_pmu_event_init(struct perf_event *event) +@@ -356,10 +380,14 @@ static int rapl_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config & RAPL_EVENT_MASK; int bit, ret = 0; @@ -17625,7 +27055,7 @@ index b985ca79cf97..8206038a01ac 100644 return -ENOENT; /* check only supported bits are set */ -@@ -340,16 +380,18 @@ static int rapl_pmu_event_init(struct perf_event *event) +@@ -369,16 +397,18 @@ static int rapl_pmu_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; @@ -17648,7 +27078,7 @@ index b985ca79cf97..8206038a01ac 100644 return -EINVAL; /* unsupported modes and filters */ -@@ -357,12 +399,18 @@ static int rapl_pmu_event_init(struct perf_event *event) +@@ -386,12 +416,18 @@ static int rapl_pmu_event_init(struct perf_event *event) return -EINVAL; /* must be done before validate_group */ @@ -17672,7 +27102,7 @@ index b985ca79cf97..8206038a01ac 100644 event->hw.config = cfg; event->hw.idx = bit; -@@ -377,7 +425,7 @@ static void rapl_pmu_event_read(struct perf_event *event) +@@ -406,7 +442,7 @@ static void rapl_pmu_event_read(struct perf_event *event) static ssize_t rapl_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { @@ -17681,7 +27111,7 @@ index b985ca79cf97..8206038a01ac 100644 } static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); -@@ -391,17 +439,38 @@ static struct attribute_group rapl_pmu_attr_group = { +@@ -420,17 +456,38 @@ static struct attribute_group rapl_pmu_attr_group = { .attrs = rapl_pmu_attrs, }; @@ -17720,7 +27150,7 @@ index b985ca79cf97..8206038a01ac 100644 /* * we compute in 0.23 nJ increments regardless of MSR -@@ -411,6 +480,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890 +@@ -440,6 +497,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890 RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, 
"2.3283064365386962890625e-10"); @@ -17728,7 +27158,7 @@ index b985ca79cf97..8206038a01ac 100644 /* * There are no default events, but we need to create -@@ -444,6 +514,13 @@ static const struct attribute_group *rapl_attr_groups[] = { +@@ -473,6 +531,13 @@ static const struct attribute_group *rapl_attr_groups[] = { NULL, }; @@ -17742,7 +27172,7 @@ index b985ca79cf97..8206038a01ac 100644 static struct attribute *rapl_events_cores[] = { EVENT_PTR(rapl_cores), EVENT_PTR(rapl_cores_unit), -@@ -504,6 +581,18 @@ static struct attribute_group rapl_events_psys_group = { +@@ -533,6 +598,18 @@ static struct attribute_group rapl_events_psys_group = { .attrs = rapl_events_psys, }; @@ -17761,7 +27191,7 @@ index b985ca79cf97..8206038a01ac 100644 static bool test_msr(int idx, void *data) { return test_bit(idx, (unsigned long *) data); -@@ -529,11 +618,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = { +@@ -558,11 +635,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = { }; /* @@ -17776,7 +27206,7 @@ index b985ca79cf97..8206038a01ac 100644 [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, -@@ -541,72 +630,104 @@ static struct perf_msr amd_rapl_msrs[] = { +@@ -570,77 +647,104 @@ static struct perf_msr amd_rapl_msrs[] = { [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, }; @@ -17801,7 +27231,7 @@ index b985ca79cf97..8206038a01ac 100644 - pmu->cpu = -1; + rapl_pmu->cpu = -1; /* Find a new cpu to collect rapl events */ -- target = cpumask_any_but(topology_die_cpumask(cpu), cpu); +- target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu); + target = cpumask_any_but(event_cpumask, cpu); /* Migrate rapl events to the new target */ @@ -17818,7 +27248,13 @@ index b985ca79cf97..8206038a01ac 100644 -static int rapl_cpu_online(unsigned int cpu) +static int rapl_cpu_offline(unsigned int cpu) -+{ + { +- s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu); +- if (rapl_pmu_idx < 0) { +- pr_err("topology_logical_(package/die)_id() returned a negative value"); +- return -EINVAL; +- } +- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); + int ret = __rapl_cpu_offline(rapl_pmus_pkg, get_rapl_pmu_idx(cpu), + get_rapl_pmu_cpumask(cpu), cpu); + @@ -17831,8 +27267,7 @@ index b985ca79cf97..8206038a01ac 100644 + +static int __rapl_cpu_online(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_idx, + const struct cpumask *event_cpumask, unsigned int cpu) - { -- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); ++{ + struct rapl_pmu *rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; int target; @@ -17855,7 +27290,7 @@ index b985ca79cf97..8206038a01ac 100644 + rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(rapl_pmu); -- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu; +- rapl_pmus->pmus[rapl_pmu_idx] = pmu; + rapl_pmus->rapl_pmu[rapl_pmu_idx] = rapl_pmu; } @@ -17863,7 +27298,7 @@ index b985ca79cf97..8206038a01ac 100644 * Check if there is an online cpu in the package which collects rapl * events already. 
*/ -- target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu)); +- target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu)); + target = cpumask_any_and(&rapl_pmus->cpumask, event_cpumask); if (target < nr_cpu_ids) return 0; @@ -17907,7 +27342,7 @@ index b985ca79cf97..8206038a01ac 100644 /* * DRAM domain on HSW server and KNL has fixed energy unit which can be * different than the unit from power unit MSR. See -@@ -645,22 +766,29 @@ static void __init rapl_advertise(void) +@@ -679,22 +783,29 @@ static void __init rapl_advertise(void) int i; pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", @@ -17943,20 +27378,23 @@ index b985ca79cf97..8206038a01ac 100644 kfree(rapl_pmus); } -@@ -673,11 +801,17 @@ static const struct attribute_group *rapl_attr_update[] = { +@@ -707,14 +818,17 @@ static const struct attribute_group *rapl_attr_update[] = { NULL, }; -static int __init init_rapl_pmus(void) +-{ +- int nr_rapl_pmu = topology_max_packages(); +static const struct attribute_group *rapl_per_core_attr_update[] = { + &rapl_events_per_core_group, +}; -+ + +- if (!rapl_pmu_is_pkg_scope()) +- nr_rapl_pmu *= topology_max_dies_per_package(); +static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int nr_rapl_pmu, + const struct attribute_group **rapl_attr_groups, + const struct attribute_group **rapl_attr_update) - { -- int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); ++{ + struct rapl_pmus *rapl_pmus; - rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); @@ -17964,7 +27402,7 @@ index b985ca79cf97..8206038a01ac 100644 if (!rapl_pmus) return -ENOMEM; -@@ -693,75 +827,80 @@ static int __init init_rapl_pmus(void) +@@ -730,75 +844,80 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.read = rapl_pmu_event_read; rapl_pmus->pmu.module = THIS_MODULE; rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; @@ -18061,7 +27499,7 @@ index b985ca79cf97..8206038a01ac 100644 }; static const struct x86_cpu_id rapl_model_match[] __initconst = { -@@ -817,28 +956,47 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); +@@ -854,28 +973,47 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); static int __init rapl_pmu_init(void) { const struct x86_cpu_id *id; @@ -18117,7 +27555,7 @@ index b985ca79cf97..8206038a01ac 100644 /* * Install callbacks. Core will call them for each online cpu. 
*/ -@@ -848,10 +1006,24 @@ static int __init rapl_pmu_init(void) +@@ -885,10 +1023,24 @@ static int __init rapl_pmu_init(void) if (ret) goto out; @@ -18143,7 +27581,7 @@ index b985ca79cf97..8206038a01ac 100644 rapl_advertise(); return 0; -@@ -859,7 +1031,7 @@ static int __init rapl_pmu_init(void) +@@ -896,7 +1048,7 @@ static int __init rapl_pmu_init(void) cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out: pr_warn("Initialization failed (%d), disabled\n", ret); @@ -18152,7 +27590,7 @@ index b985ca79cf97..8206038a01ac 100644 return ret; } module_init(rapl_pmu_init); -@@ -867,7 +1039,11 @@ module_init(rapl_pmu_init); +@@ -904,7 +1056,11 @@ module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); @@ -18167,7 +27605,7 @@ index b985ca79cf97..8206038a01ac 100644 } module_exit(intel_rapl_exit); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index e17f4d733e44..7e53b701bc27 100644 +index c0975815980c..cfd8a5591421 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -98,6 +98,7 @@ struct cpuinfo_topology { @@ -18215,14 +27653,14 @@ index 8277c64f88db..b5a5e1411469 100644 /* Package relative core ID */ -- -2.47.0.rc0 +2.47.0 -From 38ca6249d4a3205988323759c2e0986d93e737aa Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:53:29 +0800 -Subject: [PATCH 11/13] t2 +From ec1991f4667217d8f154c08c365cedb721bcf4eb Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 14:52:03 +0100 +Subject: [PATCH 12/13] t2 -Signed-off-by: Eric Naim +Signed-off-by: Peter Jung --- .../ABI/testing/sysfs-driver-hid-appletb-kbd | 13 + Documentation/core-api/printk-formats.rst | 32 + @@ -18241,9 +27679,7 @@ Signed-off-by: Eric Naim drivers/hid/Kconfig | 22 + drivers/hid/Makefile | 2 + drivers/hid/hid-appletb-bl.c | 207 +++ - drivers/hid/hid-appletb-kbd.c | 501 ++++++++ - drivers/hid/hid-core.c | 25 + - drivers/hid/hid-google-hammer.c | 27 +- + drivers/hid/hid-appletb-kbd.c | 432 +++++++ drivers/hid/hid-multitouch.c | 60 +- drivers/hid/hid-quirks.c | 8 +- drivers/hwmon/applesmc.c | 1138 ++++++++++++----- @@ -18279,11 +27715,10 @@ Signed-off-by: Eric Naim drivers/staging/apple-bce/vhci/vhci.c | 759 +++++++++++ drivers/staging/apple-bce/vhci/vhci.h | 52 + include/drm/drm_format_helper.h | 3 + - include/linux/hid.h | 2 + lib/test_printf.c | 20 +- lib/vsprintf.c | 36 +- scripts/checkpatch.pl | 2 +- - 59 files changed, 8368 insertions(+), 361 deletions(-) + 56 files changed, 8270 insertions(+), 336 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd create mode 100644 drivers/gpu/drm/tiny/appletbdrm.c create mode 100644 drivers/hid/hid-appletb-bl.c @@ -18335,10 +27770,10 @@ index 000000000000..2a19584d091e + 3 None + == ================= diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst -index 4451ef501936..c726a846f752 100644 +index 14e093da3ccd..ccd7bd29a6d6 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst -@@ -632,6 +632,38 @@ Examples:: +@@ -630,6 +630,38 @@ Examples:: %p4cc Y10 little-endian (0x20303159) %p4cc NV12 big-endian (0xb231564e) @@ -18378,10 +27813,10 @@ index 4451ef501936..c726a846f752 100644 ---- diff --git a/MAINTAINERS b/MAINTAINERS -index 0bcfbc58a9ab..affc58245cc1 100644 +index 271b59a9c585..50764f9b3bb9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -6903,6 +6903,12 @@ S: Supported +@@ -7011,6 +7011,12 
@@ S: Supported T: git https://gitlab.freedesktop.org/drm/misc/kernel.git F: drivers/gpu/drm/sun4i/sun8i* @@ -18395,10 +27830,10 @@ index 0bcfbc58a9ab..affc58245cc1 100644 S: Orphan T: git https://gitlab.freedesktop.org/drm/misc/kernel.git diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -index ad5c05ee92f3..09c82a3e83f2 100644 +index f6a6fc6a4f5c..e71b6dfad958 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -@@ -2237,6 +2237,9 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, +@@ -2260,6 +2260,9 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, int ret, retry = 0, i; bool supports_atomic = false; @@ -18481,10 +27916,10 @@ index b1be458ed4dd..28c0e76a1e88 100644 drm_fb_xrgb8888_to_argb8888(dst, dst_pitch, src, fb, clip, state); return 0; diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c -index 5b6aabce4c32..fafc673d508e 100644 +index b1c294236cc8..21e23ba5391e 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c -@@ -4640,6 +4640,7 @@ intel_ddi_init_hdmi_connector(struct intel_digital_port *dig_port) +@@ -4641,6 +4641,7 @@ intel_ddi_init_hdmi_connector(struct intel_digital_port *dig_port) static bool intel_ddi_a_force_4_lanes(struct intel_digital_port *dig_port) { @@ -18492,7 +27927,7 @@ index 5b6aabce4c32..fafc673d508e 100644 struct drm_i915_private *dev_priv = to_i915(dig_port->base.base.dev); if (dig_port->base.port != PORT_A) -@@ -4648,6 +4649,9 @@ static bool intel_ddi_a_force_4_lanes(struct intel_digital_port *dig_port) +@@ -4649,6 +4650,9 @@ static bool intel_ddi_a_force_4_lanes(struct intel_digital_port *dig_port) if (dig_port->saved_port_bits & DDI_A_4_LANES) return false; @@ -18521,7 +27956,7 @@ index 49a1ac4f5491..c8c10a6104c4 100644 fb->base.width, fb->base.height, sizes->fb_width, sizes->fb_height); diff --git a/drivers/gpu/drm/i915/display/intel_quirks.c b/drivers/gpu/drm/i915/display/intel_quirks.c -index dfd8b4960e6d..7232f9acd0a0 100644 +index 29b56d53a340..7226ec8fdd9c 100644 --- a/drivers/gpu/drm/i915/display/intel_quirks.c +++ b/drivers/gpu/drm/i915/display/intel_quirks.c @@ -64,6 +64,18 @@ static void quirk_increase_ddi_disabled_time(struct intel_display *display) @@ -18741,7 +28176,7 @@ index 76dde89a044b..9a1b412e764a 100644 obj-$(CONFIG_DRM_CIRRUS_QEMU) += cirrus.o diff --git a/drivers/gpu/drm/tiny/appletbdrm.c b/drivers/gpu/drm/tiny/appletbdrm.c new file mode 100644 -index 000000000000..b9440ce0064e +index 000000000000..7a74c8ad37cd --- /dev/null +++ b/drivers/gpu/drm/tiny/appletbdrm.c @@ -0,0 +1,624 @@ @@ -18754,7 +28189,7 @@ index 000000000000..b9440ce0064e + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + -+#include ++#include + +#include +#include @@ -19370,7 +28805,7 @@ index 000000000000..b9440ce0064e +MODULE_DESCRIPTION("Apple Touch Bar DRM Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/gpu/vga/vga_switcheroo.c b/drivers/gpu/vga/vga_switcheroo.c -index 365e6ddbe90f..cf357cd3389d 100644 +index 18f2c92beff8..3de1bca45ed2 100644 --- a/drivers/gpu/vga/vga_switcheroo.c +++ b/drivers/gpu/vga/vga_switcheroo.c @@ -438,12 +438,7 @@ find_active_client(struct list_head *head) @@ -19388,7 +28823,7 @@ index 365e6ddbe90f..cf357cd3389d 100644 } diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig -index 08446c89eff6..35ef5d4ef068 100644 +index f8a56d631242..6c8e9e004907 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -148,6 +148,27 @@ config HID_APPLEIR @@ 
-19419,7 +28854,7 @@ index 08446c89eff6..35ef5d4ef068 100644 config HID_ASUS tristate "Asus" depends on USB_HID -@@ -723,6 +744,7 @@ config HID_MULTITOUCH +@@ -729,6 +750,7 @@ config HID_MULTITOUCH Say Y here if you have one of the following devices: - 3M PCT touch screens - ActionStar dual touch panels @@ -19428,7 +28863,7 @@ index 08446c89eff6..35ef5d4ef068 100644 - Cando dual touch panels - Chunghwa panels diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile -index e40f1ddebbb7..d903c9a2629d 100644 +index 496dab54c73a..13d32f55e5d4 100644 --- a/drivers/hid/Makefile +++ b/drivers/hid/Makefile @@ -29,6 +29,8 @@ obj-$(CONFIG_HID_ALPS) += hid-alps.o @@ -19655,10 +29090,10 @@ index 000000000000..819157686e59 +MODULE_LICENSE("GPL"); diff --git a/drivers/hid/hid-appletb-kbd.c b/drivers/hid/hid-appletb-kbd.c new file mode 100644 -index 000000000000..c26b7a19a5e4 +index 000000000000..442c4d8848df --- /dev/null +++ b/drivers/hid/hid-appletb-kbd.c -@@ -0,0 +1,501 @@ +@@ -0,0 +1,432 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Apple Touch Bar Keyboard Mode Driver @@ -19677,8 +29112,6 @@ index 000000000000..c26b7a19a5e4 +#include +#include +#include -+#include -+#include +#include + +#include "hid-ids.h" @@ -19690,7 +29123,6 @@ index 000000000000..c26b7a19a5e4 +#define APPLETB_KBD_MODE_MAX APPLETB_KBD_MODE_OFF + +#define APPLETB_DEVID_KEYBOARD 1 -+#define APPLETB_DEVID_TRACKPAD 2 + +#define HID_USAGE_MODE 0x00ff0004 + @@ -19705,29 +29137,14 @@ index 000000000000..c26b7a19a5e4 +module_param_named(fntoggle, appletb_tb_fn_toggle, bool, 0644); +MODULE_PARM_DESC(fntoggle, "Switch between Fn and media controls on pressing Fn key"); + -+static bool appletb_tb_autodim = true; -+module_param_named(autodim, appletb_tb_autodim, bool, 0644); -+MODULE_PARM_DESC(autodim, "Automatically dim touchbar if left idle"); -+ -+static int appletb_tb_dim_timeout = 60; -+module_param_named(dim_timeout, appletb_tb_dim_timeout, int, 0644); -+MODULE_PARM_DESC(dim_timeout, "Dim timeout in sec"); -+ -+static int appletb_tb_idle_timeout = 15; -+module_param_named(idle_timeout, appletb_tb_idle_timeout, int, 0644); -+MODULE_PARM_DESC(idle_timeout, "Idle timeout in sec"); -+ +struct appletb_kbd { + struct hid_field *mode_field; -+ struct input_handler inp_handler; -+ struct input_handle kbd_handle; -+ struct input_handle tpd_handle; -+ struct backlight_device *backlight_dev; -+ struct timer_list inactivity_timer; -+ bool has_dimmed; -+ bool has_turned_off; ++ + u8 saved_mode; + u8 current_mode; ++ struct input_handler inp_handler; ++ struct input_handle kbd_handle; ++ +}; + +static const struct key_entry appletb_kbd_keymap[] = { @@ -19825,34 +29242,6 @@ index 000000000000..c26b7a19a5e4 + } +} + -+static void appletb_inactivity_timer(struct timer_list *t) -+{ -+ struct appletb_kbd *kbd = from_timer(kbd, t, inactivity_timer); -+ -+ if (kbd->backlight_dev && appletb_tb_autodim) { -+ if (!kbd->has_dimmed) { -+ backlight_device_set_brightness(kbd->backlight_dev, 1); -+ kbd->has_dimmed = true; -+ mod_timer(&kbd->inactivity_timer, jiffies + msecs_to_jiffies(appletb_tb_idle_timeout * 1000)); -+ } else if (!kbd->has_turned_off) { -+ backlight_device_set_brightness(kbd->backlight_dev, 0); -+ kbd->has_turned_off = true; -+ } -+ } -+} -+ -+static void reset_inactivity_timer(struct appletb_kbd *kbd) -+{ -+ if ((kbd->has_dimmed || kbd->has_turned_off) && kbd->backlight_dev) { -+ backlight_device_set_brightness(kbd->backlight_dev, 2); -+ if (appletb_tb_autodim) { -+ kbd->has_dimmed = false; -+ kbd->has_turned_off = false; -+ 
mod_timer(&kbd->inactivity_timer, jiffies + msecs_to_jiffies(appletb_tb_dim_timeout * 1000)); -+ } -+ } -+} -+ +static int appletb_kbd_hid_event(struct hid_device *hdev, struct hid_field *field, + struct hid_usage *usage, __s32 value) +{ @@ -19877,8 +29266,6 @@ index 000000000000..c26b7a19a5e4 + if (slot < 0) + return 0; + -+ reset_inactivity_timer(kbd); -+ + translation = sparse_keymap_entry_from_scancode(input, usage->code); + + if (translation && kbd->current_mode == APPLETB_KBD_MODE_SPCL) { @@ -19895,8 +29282,6 @@ index 000000000000..c26b7a19a5e4 +{ + struct appletb_kbd *kbd = handle->private; + -+ reset_inactivity_timer(kbd); -+ + if (type == EV_KEY && code == KEY_FN && appletb_tb_fn_toggle) { + if (value == 1) { + kbd->saved_mode = kbd->current_mode; @@ -19922,9 +29307,6 @@ index 000000000000..c26b7a19a5e4 + if (id->driver_info == APPLETB_DEVID_KEYBOARD) { + handle = &kbd->kbd_handle; + handle->name = "tbkbd"; -+ } else if (id->driver_info == APPLETB_DEVID_TRACKPAD) { -+ handle = &kbd->tpd_handle; -+ handle->name = "tbtpd"; + } else { + return -ENOENT; + } @@ -19998,15 +29380,6 @@ index 000000000000..c26b7a19a5e4 + .keybit = { [BIT_WORD(KEY_FN)] = BIT_MASK(KEY_FN) }, + .driver_info = APPLETB_DEVID_KEYBOARD, + }, -+ { -+ .flags = INPUT_DEVICE_ID_MATCH_BUS | -+ INPUT_DEVICE_ID_MATCH_VENDOR | -+ INPUT_DEVICE_ID_MATCH_KEYBIT, -+ .bustype = BUS_USB, -+ .vendor = USB_VENDOR_ID_APPLE, -+ .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, -+ .driver_info = APPLETB_DEVID_TRACKPAD, -+ }, + { } +}; + @@ -20063,12 +29436,6 @@ index 000000000000..c26b7a19a5e4 + goto stop_hw; + } + -+ timer_setup(&kbd->inactivity_timer, appletb_inactivity_timer, 0); -+ mod_timer(&kbd->inactivity_timer, jiffies + msecs_to_jiffies(appletb_tb_dim_timeout * 1000)); -+ kbd->backlight_dev = backlight_device_get_by_name("appletb_backlight"); -+ if (!kbd->backlight_dev) -+ dev_err_probe(dev, ret, "Failed to get backlight device\n"); -+ + kbd->inp_handler.event = appletb_kbd_inp_event; + kbd->inp_handler.connect = appletb_kbd_inp_connect; + kbd->inp_handler.disconnect = appletb_kbd_inp_disconnect; @@ -20107,7 +29474,6 @@ index 000000000000..c26b7a19a5e4 + appletb_kbd_set_mode(kbd, APPLETB_KBD_MODE_OFF); + + input_unregister_handler(&kbd->inp_handler); -+ del_timer_sync(&kbd->inactivity_timer); + + hid_hw_close(hdev); + hid_hw_stop(hdev); @@ -20160,89 +29526,8 @@ index 000000000000..c26b7a19a5e4 +MODULE_AUTHOR("Kerem Karabay "); +MODULE_DESCRIPTION("MacBookPro Touch Bar Keyboard Mode Driver"); +MODULE_LICENSE("GPL"); -diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c -index 988d0acbdf04..caeba5487b69 100644 ---- a/drivers/hid/hid-core.c -+++ b/drivers/hid/hid-core.c -@@ -1912,6 +1912,31 @@ int hid_set_field(struct hid_field *field, unsigned offset, __s32 value) - } - EXPORT_SYMBOL_GPL(hid_set_field); - -+struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type, -+ unsigned int application, unsigned int usage) -+{ -+ struct list_head *report_list = &hdev->report_enum[report_type].report_list; -+ struct hid_report *report; -+ int i, j; -+ -+ list_for_each_entry(report, report_list, list) { -+ if (report->application != application) -+ continue; -+ -+ for (i = 0; i < report->maxfield; i++) { -+ struct hid_field *field = report->field[i]; -+ -+ for (j = 0; j < field->maxusage; j++) { -+ if (field->usage[j].hid == usage) -+ return field; -+ } -+ } -+ } -+ -+ return NULL; -+} -+EXPORT_SYMBOL_GPL(hid_find_field); -+ - static struct hid_report *hid_get_report(struct hid_report_enum 
*report_enum, - const u8 *data) - { -diff --git a/drivers/hid/hid-google-hammer.c b/drivers/hid/hid-google-hammer.c -index 6e4ebc349e45..4e79fafeeafa 100644 ---- a/drivers/hid/hid-google-hammer.c -+++ b/drivers/hid/hid-google-hammer.c -@@ -418,38 +418,15 @@ static int hammer_event(struct hid_device *hid, struct hid_field *field, - return 0; - } - --static bool hammer_has_usage(struct hid_device *hdev, unsigned int report_type, -- unsigned application, unsigned usage) --{ -- struct hid_report_enum *re = &hdev->report_enum[report_type]; -- struct hid_report *report; -- int i, j; -- -- list_for_each_entry(report, &re->report_list, list) { -- if (report->application != application) -- continue; -- -- for (i = 0; i < report->maxfield; i++) { -- struct hid_field *field = report->field[i]; -- -- for (j = 0; j < field->maxusage; j++) -- if (field->usage[j].hid == usage) -- return true; -- } -- } -- -- return false; --} -- - static bool hammer_has_folded_event(struct hid_device *hdev) - { -- return hammer_has_usage(hdev, HID_INPUT_REPORT, -+ return !!hid_find_field(hdev, HID_INPUT_REPORT, - HID_GD_KEYBOARD, HID_USAGE_KBD_FOLDED); - } - - static bool hammer_has_backlight_control(struct hid_device *hdev) - { -- return hammer_has_usage(hdev, HID_OUTPUT_REPORT, -+ return !!hid_find_field(hdev, HID_OUTPUT_REPORT, - HID_GD_KEYBOARD, HID_AD_BRIGHTNESS); - } - diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c -index 847462650549..6c4cb3883955 100644 +index e936019d21fe..0d5382e965de 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -72,6 +72,7 @@ MODULE_LICENSE("GPL"); @@ -20266,10 +29551,10 @@ index 847462650549..6c4cb3883955 100644 #define MT_CLS_RAZER_BLADE_STEALTH 0x0112 #define MT_CLS_SMART_TECH 0x0113 +#define MT_CLS_APPLE_TOUCHBAR 0x0114 + #define MT_CLS_SIS 0x0457 #define MT_DEFAULT_MAXCONTACT 10 - #define MT_MAX_MAXCONTACT 250 -@@ -396,6 +399,13 @@ static const struct mt_class mt_classes[] = { +@@ -397,6 +400,13 @@ static const struct mt_class mt_classes[] = { MT_QUIRK_CONTACT_CNT_ACCURATE | MT_QUIRK_SEPARATE_APP_REPORT, }, @@ -20280,10 +29565,10 @@ index 847462650549..6c4cb3883955 100644 + .is_direct = true, + .maxcontacts = 11, + }, - { } - }; - -@@ -489,9 +499,6 @@ static void mt_feature_mapping(struct hid_device *hdev, + { .name = MT_CLS_SIS, + .quirks = MT_QUIRK_NOT_SEEN_MEANS_UP | + MT_QUIRK_ALWAYS_VALID | +@@ -495,9 +505,6 @@ static void mt_feature_mapping(struct hid_device *hdev, if (!td->maxcontacts && field->logical_maximum <= MT_MAX_MAXCONTACT) td->maxcontacts = field->logical_maximum; @@ -20293,7 +29578,7 @@ index 847462650549..6c4cb3883955 100644 break; case HID_DG_BUTTONTYPE: -@@ -565,13 +572,13 @@ static struct mt_application *mt_allocate_application(struct mt_device *td, +@@ -571,13 +578,13 @@ static struct mt_application *mt_allocate_application(struct mt_device *td, mt_application->application = application; INIT_LIST_HEAD(&mt_application->mt_usages); @@ -20309,7 +29594,7 @@ index 847462650549..6c4cb3883955 100644 mt_application->mt_flags |= INPUT_MT_POINTER; td->inputmode_value = MT_INPUTMODE_TOUCHPAD; } -@@ -635,7 +642,9 @@ static struct mt_report_data *mt_allocate_report_data(struct mt_device *td, +@@ -641,7 +648,9 @@ static struct mt_report_data *mt_allocate_report_data(struct mt_device *td, if (field->logical == HID_DG_FINGER || td->hdev->group != HID_GROUP_MULTITOUCH_WIN_8) { for (n = 0; n < field->report_count; n++) { @@ -20320,7 +29605,7 @@ index 847462650549..6c4cb3883955 100644 rdata->is_mt_collection = true; break; } 
-@@ -807,6 +816,15 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, +@@ -813,6 +822,15 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, MT_STORE_FIELD(confidence_state); return 1; @@ -20336,7 +29621,7 @@ index 847462650549..6c4cb3883955 100644 case HID_DG_TIPSWITCH: if (field->application != HID_GD_SYSTEM_MULTIAXIS) input_set_capability(hi->input, -@@ -814,6 +832,7 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, +@@ -820,6 +838,7 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, MT_STORE_FIELD(tip_state); return 1; case HID_DG_CONTACTID: @@ -20344,7 +29629,7 @@ index 847462650549..6c4cb3883955 100644 MT_STORE_FIELD(contactid); app->touches_by_report++; return 1; -@@ -869,10 +888,6 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, +@@ -875,10 +894,6 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, case HID_DG_CONTACTMAX: /* contact max are global to the report */ return -1; @@ -20355,7 +29640,7 @@ index 847462650549..6c4cb3883955 100644 } /* let hid-input decide for the others */ return 0; -@@ -1300,6 +1315,10 @@ static int mt_touch_input_configured(struct hid_device *hdev, +@@ -1306,6 +1321,10 @@ static int mt_touch_input_configured(struct hid_device *hdev, struct input_dev *input = hi->input; int ret; @@ -20366,7 +29651,7 @@ index 847462650549..6c4cb3883955 100644 if (!td->maxcontacts) td->maxcontacts = MT_DEFAULT_MAXCONTACT; -@@ -1307,6 +1326,9 @@ static int mt_touch_input_configured(struct hid_device *hdev, +@@ -1313,6 +1332,9 @@ static int mt_touch_input_configured(struct hid_device *hdev, if (td->serial_maybe) mt_post_parse_default_settings(td, app); @@ -20376,7 +29661,7 @@ index 847462650549..6c4cb3883955 100644 if (cls->is_indirect) app->mt_flags |= INPUT_MT_POINTER; -@@ -1758,6 +1780,15 @@ static int mt_probe(struct hid_device *hdev, const struct hid_device_id *id) +@@ -1764,6 +1786,15 @@ static int mt_probe(struct hid_device *hdev, const struct hid_device_id *id) } } @@ -20392,7 +29677,7 @@ index 847462650549..6c4cb3883955 100644 td = devm_kzalloc(&hdev->dev, sizeof(struct mt_device), GFP_KERNEL); if (!td) { dev_err(&hdev->dev, "cannot allocate multitouch data\n"); -@@ -1805,10 +1836,6 @@ static int mt_probe(struct hid_device *hdev, const struct hid_device_id *id) +@@ -1811,10 +1842,6 @@ static int mt_probe(struct hid_device *hdev, const struct hid_device_id *id) timer_setup(&td->release_timer, mt_expired_timeout, 0); @@ -20403,7 +29688,7 @@ index 847462650549..6c4cb3883955 100644 if (mtclass->quirks & MT_QUIRK_FIX_CONST_CONTACT_ID) mt_fix_const_fields(hdev, HID_DG_CONTACTID); -@@ -2277,6 +2304,11 @@ static const struct hid_device_id mt_devices[] = { +@@ -2299,6 +2326,11 @@ static const struct hid_device_id mt_devices[] = { MT_USB_DEVICE(USB_VENDOR_ID_XIROKU, USB_DEVICE_ID_XIROKU_CSR2) }, @@ -22428,7 +31713,7 @@ index fc6d6a9053ce..698f44794453 100644 MODULE_LICENSE("GPL v2"); MODULE_DEVICE_TABLE(dmi, applesmc_whitelist); diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c -index 10a03a566905..8c3ccd98ba93 100644 +index dfdfb59cc8b5..e0da70576167 100644 --- a/drivers/input/mouse/bcm5974.c +++ b/drivers/input/mouse/bcm5974.c @@ -83,6 +83,24 @@ @@ -22646,10 +31931,10 @@ index 1417e230edbd..e69785af8e1d 100644 * Retina MacBook Pros cannot switch the panel's AUX separately * and need eDP pre-calibration. 
They are distinguishable from diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig -index db4a392841b1..580df4ce4f9f 100644 +index 3fb68d60dfc1..7337f658fe96 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig -@@ -66,4 +66,6 @@ source "drivers/staging/fieldbus/Kconfig" +@@ -64,4 +64,6 @@ source "drivers/staging/fieldbus/Kconfig" source "drivers/staging/vme_user/Kconfig" @@ -22657,10 +31942,10 @@ index db4a392841b1..580df4ce4f9f 100644 + endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile -index 5390879b5d1b..528be2d3b546 100644 +index c977aa13fad4..241ea7562045 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile -@@ -22,3 +22,4 @@ obj-$(CONFIG_GREYBUS) += greybus/ +@@ -21,3 +21,4 @@ obj-$(CONFIG_GREYBUS) += greybus/ obj-$(CONFIG_BCM2835_VCHIQ) += vc04_services/ obj-$(CONFIG_XIL_AXIS_FIFO) += axis-fifo/ obj-$(CONFIG_FIELDBUS_DEV) += fieldbus/ @@ -28497,24 +37782,11 @@ index 428d81afe215..aa1604d92c1a 100644 void drm_fb_xrgb8888_to_argb8888(struct iosys_map *dst, const unsigned int *dst_pitch, const struct iosys_map *src, const struct drm_framebuffer *fb, const struct drm_rect *clip, struct drm_format_conv_state *state); -diff --git a/include/linux/hid.h b/include/linux/hid.h -index 1533c9dcd3a6..2deff79f39a1 100644 ---- a/include/linux/hid.h -+++ b/include/linux/hid.h -@@ -940,6 +940,8 @@ extern void hidinput_report_event(struct hid_device *hid, struct hid_report *rep - extern int hidinput_connect(struct hid_device *hid, unsigned int force); - extern void hidinput_disconnect(struct hid_device *); - -+struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type, -+ unsigned int application, unsigned int usage); - int hid_set_field(struct hid_field *, unsigned, __s32); - int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, - int interrupt); diff --git a/lib/test_printf.c b/lib/test_printf.c -index 965cb6f28527..db99014b8c13 100644 +index 8448b6d02bd9..f63591b3ee69 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c -@@ -745,18 +745,26 @@ static void __init fwnode_pointer(void) +@@ -719,18 +719,26 @@ static void __init fwnode_pointer(void) static void __init fourcc_pointer(void) { struct { @@ -28548,7 +37820,7 @@ index 965cb6f28527..db99014b8c13 100644 static void __init diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index 2d71b1115916..5274e3c881de 100644 +index c5e2ec9303c5..874e3af8104c 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1760,27 +1760,50 @@ char *fourcc_string(char *buf, char *end, const u32 *fourcc, @@ -28608,7 +37880,7 @@ index 2d71b1115916..5274e3c881de 100644 *p++ = ' '; *p++ = '('; -@@ -2355,6 +2378,7 @@ char *rust_fmt_argument(char *buf, char *end, void *ptr); +@@ -2334,6 +2357,7 @@ char *rust_fmt_argument(char *buf, char *end, void *ptr); * read the documentation (path below) first. * - 'NF' For a netdev_features_t * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value. 
@@ -28630,828 +37902,14 @@ index 4427572b2477..b60c99d61882 100755 last; } -- -2.47.0.rc0 +2.47.0 -From 9c5f8134d6095a520bca8870f2477115b595ca07 Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:53:44 +0800 -Subject: [PATCH 12/13] thp-shrinker - -Signed-off-by: Peter Jung ---- - Documentation/admin-guide/mm/transhuge.rst | 16 ++ - include/linux/huge_mm.h | 4 +- - include/linux/khugepaged.h | 1 + - include/linux/page-flags.h | 13 +- - include/linux/rmap.h | 7 +- - include/linux/vm_event_item.h | 1 + - mm/huge_memory.c | 143 ++++++++++++++++-- - mm/khugepaged.c | 3 +- - mm/migrate.c | 75 +++++++-- - mm/migrate_device.c | 4 +- - mm/rmap.c | 5 +- - mm/vmscan.c | 3 +- - mm/vmstat.c | 1 + - .../selftests/mm/split_huge_page_test.c | 71 +++++++++ - tools/testing/selftests/mm/vm_util.c | 22 +++ - tools/testing/selftests/mm/vm_util.h | 1 + - 16 files changed, 334 insertions(+), 36 deletions(-) - -diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst -index 058485daf186..02ae7bc9efbd 100644 ---- a/Documentation/admin-guide/mm/transhuge.rst -+++ b/Documentation/admin-guide/mm/transhuge.rst -@@ -202,6 +202,16 @@ PMD-mappable transparent hugepage:: - - cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size - -+All THPs at fault and collapse time will be added to _deferred_list, -+and will therefore be split under memory presure if they are considered -+"underused". A THP is underused if the number of zero-filled pages in -+the THP is above max_ptes_none (see below). It is possible to disable -+this behaviour by writing 0 to shrink_underused, and enable it by writing -+1 to it:: -+ -+ echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused -+ echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused -+ - khugepaged will be automatically started when PMD-sized THP is enabled - (either of the per-size anon control or the top-level control are set - to "always" or "madvise"), and it'll be automatically shutdown when -@@ -447,6 +457,12 @@ thp_deferred_split_page - splitting it would free up some memory. Pages on split queue are - going to be split under memory pressure. - -+thp_underused_split_page -+ is incremented when a huge page on the split queue was split -+ because it was underused. A THP is underused if the number of -+ zero pages in the THP is above a certain threshold -+ (/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none). -+ - thp_split_pmd - is incremented every time a PMD split into table of PTEs. 
- This can happen, for instance, when application calls mprotect() or -diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h -index e25d9ebfdf89..00af84aa88ea 100644 ---- a/include/linux/huge_mm.h -+++ b/include/linux/huge_mm.h -@@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page) - { - return split_huge_page_to_list_to_order(page, NULL, 0); - } --void deferred_split_folio(struct folio *folio); -+void deferred_split_folio(struct folio *folio, bool partially_mapped); - - void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze, struct folio *folio); -@@ -484,7 +484,7 @@ static inline int split_huge_page(struct page *page) - { - return 0; - } --static inline void deferred_split_folio(struct folio *folio) {} -+static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} - #define split_huge_pmd(__vma, __pmd, __address) \ - do { } while (0) - -diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h -index f68865e19b0b..30baae91b225 100644 ---- a/include/linux/khugepaged.h -+++ b/include/linux/khugepaged.h -@@ -4,6 +4,7 @@ - - #include /* MMF_VM_HUGEPAGE */ - -+extern unsigned int khugepaged_max_ptes_none __read_mostly; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE - extern struct attribute_group khugepaged_attr_group; - -diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h -index 5769fe6e4950..5e7bc8522e91 100644 ---- a/include/linux/page-flags.h -+++ b/include/linux/page-flags.h -@@ -185,6 +185,7 @@ enum pageflags { - /* At least one page in this folio has the hwpoison flag set */ - PG_has_hwpoisoned = PG_error, - PG_large_rmappable = PG_workingset, /* anon or file-backed */ -+ PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ - }; - - #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) -@@ -865,8 +866,18 @@ static inline void ClearPageCompound(struct page *page) - ClearPageHead(page); - } - FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) -+FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -+/* -+ * PG_partially_mapped is protected by deferred_split split_queue_lock, -+ * so its safe to use non-atomic set/clear. 
-+ */ -+__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -+__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) - #else - FOLIO_FLAG_FALSE(large_rmappable) -+FOLIO_TEST_FLAG_FALSE(partially_mapped) -+__FOLIO_SET_FLAG_NOOP(partially_mapped) -+__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) - #endif - - #define PG_head_mask ((1UL << PG_head)) -@@ -1175,7 +1186,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) - */ - #define PAGE_FLAGS_SECOND \ - (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ -- 1UL << PG_large_rmappable) -+ 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) - - #define PAGE_FLAGS_PRIVATE \ - (1UL << PG_private | 1UL << PG_private_2) -diff --git a/include/linux/rmap.h b/include/linux/rmap.h -index 0978c64f49d8..07854d1f9ad6 100644 ---- a/include/linux/rmap.h -+++ b/include/linux/rmap.h -@@ -745,7 +745,12 @@ int folio_mkclean(struct folio *); - int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, - struct vm_area_struct *vma); - --void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); -+enum rmp_flags { -+ RMP_LOCKED = 1 << 0, -+ RMP_USE_SHARED_ZEROPAGE = 1 << 1, -+}; -+ -+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags); - - /* - * rmap_walk_control: To control rmap traversing for specific needs -diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h -index 747943bc8cc2..d35e588e0ece 100644 ---- a/include/linux/vm_event_item.h -+++ b/include/linux/vm_event_item.h -@@ -104,6 +104,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, - THP_SPLIT_PAGE, - THP_SPLIT_PAGE_FAILED, - THP_DEFERRED_SPLIT_PAGE, -+ THP_UNDERUSED_SPLIT_PAGE, - THP_SPLIT_PMD, - THP_SCAN_EXCEED_NONE_PTE, - THP_SCAN_EXCEED_SWAP_PTE, -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 4d2839fcf688..eb2e5c305547 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -77,6 +77,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink, - struct shrink_control *sc); - static unsigned long deferred_split_scan(struct shrinker *shrink, - struct shrink_control *sc); -+static bool split_underused_thp = true; - - static atomic_t huge_zero_refcount; - struct folio *huge_zero_folio __read_mostly; -@@ -449,6 +450,27 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj, - static struct kobj_attribute hpage_pmd_size_attr = - __ATTR_RO(hpage_pmd_size); - -+static ssize_t split_underused_thp_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sysfs_emit(buf, "%d\n", split_underused_thp); -+} -+ -+static ssize_t split_underused_thp_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int err = kstrtobool(buf, &split_underused_thp); -+ -+ if (err < 0) -+ return err; -+ -+ return count; -+} -+ -+static struct kobj_attribute split_underused_thp_attr = __ATTR( -+ shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store); -+ - static struct attribute *hugepage_attr[] = { - &enabled_attr.attr, - &defrag_attr.attr, -@@ -457,6 +479,7 @@ static struct attribute *hugepage_attr[] = { - #ifdef CONFIG_SHMEM - &shmem_enabled_attr.attr, - #endif -+ &split_underused_thp_attr.attr, - NULL, - }; - -@@ -1013,6 +1036,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm_inc_nr_ptes(vma->vm_mm); -+ deferred_split_folio(folio, false); - 
spin_unlock(vmf->ptl); - count_vm_event(THP_FAULT_ALLOC); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); -@@ -2784,7 +2808,7 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, - return false; - } - --static void remap_page(struct folio *folio, unsigned long nr) -+static void remap_page(struct folio *folio, unsigned long nr, int flags) - { - int i = 0; - -@@ -2792,7 +2816,7 @@ static void remap_page(struct folio *folio, unsigned long nr) - if (!folio_test_anon(folio)) - return; - for (;;) { -- remove_migration_ptes(folio, folio, true); -+ remove_migration_ptes(folio, folio, RMP_LOCKED | flags); - i += folio_nr_pages(folio); - if (i >= nr) - break; -@@ -3000,7 +3024,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, - - if (nr_dropped) - shmem_uncharge(folio->mapping->host, nr_dropped); -- remap_page(folio, nr); -+ remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0); - - /* - * set page to its compound_head when split to non order-0 pages, so -@@ -3235,6 +3259,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - if (folio_order(folio) > 1 && - !list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; -+ if (folio_test_partially_mapped(folio)) { -+ __folio_clear_partially_mapped(folio); -+ } - /* - * Reinitialize page_deferred_list after removing the - * page from the split_queue, otherwise a subsequent -@@ -3269,7 +3296,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - if (mapping) - xas_unlock(&xas); - local_irq_enable(); -- remap_page(folio, folio_nr_pages(folio)); -+ remap_page(folio, folio_nr_pages(folio), 0); - ret = -EAGAIN; - } - -@@ -3297,12 +3324,16 @@ void __folio_undo_large_rmappable(struct folio *folio) - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - if (!list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; -+ if (folio_test_partially_mapped(folio)) { -+ __folio_clear_partially_mapped(folio); -+ } - list_del_init(&folio->_deferred_list); - } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - } - --void deferred_split_folio(struct folio *folio) -+/* partially_mapped=false won't clear PG_partially_mapped folio flag */ -+void deferred_split_folio(struct folio *folio, bool partially_mapped) - { - struct deferred_split *ds_queue = get_deferred_split_queue(folio); - #ifdef CONFIG_MEMCG -@@ -3317,6 +3348,9 @@ void deferred_split_folio(struct folio *folio) - if (folio_order(folio) <= 1) - return; - -+ if (!partially_mapped && !split_underused_thp) -+ return; -+ - /* - * The try_to_unmap() in page reclaim path might reach here too, - * this may cause a race condition to corrupt deferred split queue. 
-@@ -3330,14 +3364,20 @@ void deferred_split_folio(struct folio *folio) - if (folio_test_swapcache(folio)) - return; - -- if (!list_empty(&folio->_deferred_list)) -- return; -- - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); -+ if (partially_mapped) { -+ if (!folio_test_partially_mapped(folio)) { -+ __folio_set_partially_mapped(folio); -+ if (folio_test_pmd_mappable(folio)) -+ count_vm_event(THP_DEFERRED_SPLIT_PAGE); -+ count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); -+ -+ } -+ } else { -+ /* partially mapped folios cannot become non-partially mapped */ -+ VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); -+ } - if (list_empty(&folio->_deferred_list)) { -- if (folio_test_pmd_mappable(folio)) -- count_vm_event(THP_DEFERRED_SPLIT_PAGE); -- count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); - list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); - ds_queue->split_queue_len++; - #ifdef CONFIG_MEMCG -@@ -3362,6 +3402,39 @@ static unsigned long deferred_split_count(struct shrinker *shrink, - return READ_ONCE(ds_queue->split_queue_len); - } - -+static bool thp_underused(struct folio *folio) -+{ -+ int num_zero_pages = 0, num_filled_pages = 0; -+ void *kaddr; -+ int i; -+ -+ if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1) -+ return false; -+ -+ for (i = 0; i < folio_nr_pages(folio); i++) { -+ kaddr = kmap_local_folio(folio, i * PAGE_SIZE); -+ if (!memchr_inv(kaddr, 0, PAGE_SIZE)) { -+ num_zero_pages++; -+ if (num_zero_pages > khugepaged_max_ptes_none) { -+ kunmap_local(kaddr); -+ return true; -+ } -+ } else { -+ /* -+ * Another path for early exit once the number -+ * of non-zero filled pages exceeds threshold. -+ */ -+ num_filled_pages++; -+ if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) { -+ kunmap_local(kaddr); -+ return false; -+ } -+ } -+ kunmap_local(kaddr); -+ } -+ return false; -+} -+ - static unsigned long deferred_split_scan(struct shrinker *shrink, - struct shrink_control *sc) - { -@@ -3369,8 +3442,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; - unsigned long flags; - LIST_HEAD(list); -- struct folio *folio, *next; -- int split = 0; -+ struct folio *folio, *next, *prev = NULL; -+ int split = 0, removed = 0; - - #ifdef CONFIG_MEMCG - if (sc->memcg) -@@ -3385,6 +3458,9 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, - list_move(&folio->_deferred_list, &list); - } else { - /* We lost race with folio_put() */ -+ if (folio_test_partially_mapped(folio)) { -+ __folio_clear_partially_mapped(folio); -+ } - list_del_init(&folio->_deferred_list); - ds_queue->split_queue_len--; - } -@@ -3394,20 +3470,55 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - - list_for_each_entry_safe(folio, next, &list, _deferred_list) { -+ bool did_split = false; -+ bool underused = false; -+ -+ if (!folio_test_partially_mapped(folio)) { -+ underused = thp_underused(folio); -+ if (!underused) -+ goto next; -+ } - if (!folio_trylock(folio)) - goto next; -- /* split_huge_page() removes page from list on success */ -- if (!split_folio(folio)) -+ if (!split_folio(folio)) { -+ did_split = true; -+ if (underused) -+ count_vm_event(THP_UNDERUSED_SPLIT_PAGE); - split++; -+ } - folio_unlock(folio); - next: -- folio_put(folio); -+ /* -+ * split_folio() removes folio from list on success. -+ * Only add back to the queue if folio is partially mapped. 
-+ * If thp_underused returns false, or if split_folio fails -+ * in the case it was underused, then consider it used and -+ * don't add it back to split_queue. -+ */ -+ if (!did_split && !folio_test_partially_mapped(folio)) { -+ list_del_init(&folio->_deferred_list); -+ removed++; -+ } else { -+ /* -+ * That unlocked list_del_init() above would be unsafe, -+ * unless its folio is separated from any earlier folios -+ * left on the list (which may be concurrently unqueued) -+ * by one safe folio with refcount still raised. -+ */ -+ swap(folio, prev); -+ } -+ if (folio) -+ folio_put(folio); - } - - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - list_splice_tail(&list, &ds_queue->split_queue); -+ ds_queue->split_queue_len -= removed; - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - -+ if (prev) -+ folio_put(prev); -+ - /* - * Stop shrinker if we didn't split any page, but the queue is empty. - * This can happen if pages were freed under us. -diff --git a/mm/khugepaged.c b/mm/khugepaged.c -index 4cba91ecf74b..ee490f1e7de2 100644 ---- a/mm/khugepaged.c -+++ b/mm/khugepaged.c -@@ -85,7 +85,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); - * - * Note that these are only respected if collapse was initiated by khugepaged. - */ --static unsigned int khugepaged_max_ptes_none __read_mostly; -+unsigned int khugepaged_max_ptes_none __read_mostly; - static unsigned int khugepaged_max_ptes_swap __read_mostly; - static unsigned int khugepaged_max_ptes_shared __read_mostly; - -@@ -1235,6 +1235,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, - pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache_pmd(vma, address, pmd); -+ deferred_split_folio(folio, false); - spin_unlock(pmd_ptl); - - folio = NULL; -diff --git a/mm/migrate.c b/mm/migrate.c -index 368ab3878fa6..d3a66f1a621b 100644 ---- a/mm/migrate.c -+++ b/mm/migrate.c -@@ -177,13 +177,57 @@ void putback_movable_pages(struct list_head *l) - } - } - -+static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, -+ struct folio *folio, -+ unsigned long idx) -+{ -+ struct page *page = folio_page(folio, idx); -+ bool contains_data; -+ pte_t newpte; -+ void *addr; -+ -+ VM_BUG_ON_PAGE(PageCompound(page), page); -+ VM_BUG_ON_PAGE(!PageAnon(page), page); -+ VM_BUG_ON_PAGE(!PageLocked(page), page); -+ VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); -+ -+ if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) || -+ mm_forbids_zeropage(pvmw->vma->vm_mm)) -+ return false; -+ -+ /* -+ * The pmd entry mapping the old thp was flushed and the pte mapping -+ * this subpage has been non present. If the subpage is only zero-filled -+ * then map it to the shared zeropage. 
-+ */ -+ addr = kmap_local_page(page); -+ contains_data = memchr_inv(addr, 0, PAGE_SIZE); -+ kunmap_local(addr); -+ -+ if (contains_data) -+ return false; -+ -+ newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address), -+ pvmw->vma->vm_page_prot)); -+ set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte); -+ -+ dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio)); -+ return true; -+} -+ -+struct rmap_walk_arg { -+ struct folio *folio; -+ bool map_unused_to_zeropage; -+}; -+ - /* - * Restore a potential migration pte to a working pte entry - */ - static bool remove_migration_pte(struct folio *folio, -- struct vm_area_struct *vma, unsigned long addr, void *old) -+ struct vm_area_struct *vma, unsigned long addr, void *arg) - { -- DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); -+ struct rmap_walk_arg *rmap_walk_arg = arg; -+ DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION); - - while (page_vma_mapped_walk(&pvmw)) { - rmap_t rmap_flags = RMAP_NONE; -@@ -207,6 +251,9 @@ static bool remove_migration_pte(struct folio *folio, - continue; - } - #endif -+ if (rmap_walk_arg->map_unused_to_zeropage && -+ try_to_map_unused_to_zeropage(&pvmw, folio, idx)) -+ continue; - - folio_get(folio); - pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); -@@ -285,14 +332,21 @@ static bool remove_migration_pte(struct folio *folio, - * Get rid of all migration entries and replace them by - * references to the indicated page. - */ --void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) -+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) - { -+ struct rmap_walk_arg rmap_walk_arg = { -+ .folio = src, -+ .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE, -+ }; -+ - struct rmap_walk_control rwc = { - .rmap_one = remove_migration_pte, -- .arg = src, -+ .arg = &rmap_walk_arg, - }; - -- if (locked) -+ VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src); -+ -+ if (flags & RMP_LOCKED) - rmap_walk_locked(dst, &rwc); - else - rmap_walk(dst, &rwc); -@@ -904,7 +958,7 @@ static int writeout(struct address_space *mapping, struct folio *folio) - * At this point we know that the migration attempt cannot - * be successful. - */ -- remove_migration_ptes(folio, folio, false); -+ remove_migration_ptes(folio, folio, 0); - - rc = mapping->a_ops->writepage(&folio->page, &wbc); - -@@ -1068,7 +1122,7 @@ static void migrate_folio_undo_src(struct folio *src, - struct list_head *ret) - { - if (page_was_mapped) -- remove_migration_ptes(src, src, false); -+ remove_migration_ptes(src, src, 0); - /* Drop an anon_vma reference if we took one */ - if (anon_vma) - put_anon_vma(anon_vma); -@@ -1306,7 +1360,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, - lru_add_drain(); - - if (old_page_state & PAGE_WAS_MAPPED) -- remove_migration_ptes(src, dst, false); -+ remove_migration_ptes(src, dst, 0); - - out_unlock_both: - folio_unlock(dst); -@@ -1444,7 +1498,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, - - if (page_was_mapped) - remove_migration_ptes(src, -- rc == MIGRATEPAGE_SUCCESS ? dst : src, false); -+ rc == MIGRATEPAGE_SUCCESS ? dst : src, 0); - - unlock_put_anon: - folio_unlock(dst); -@@ -1682,7 +1736,8 @@ static int migrate_pages_batch(struct list_head *from, - * use _deferred_list. 
- */ - if (nr_pages > 2 && -- !list_empty(&folio->_deferred_list)) { -+ !list_empty(&folio->_deferred_list) && -+ folio_test_partially_mapped(folio)) { - if (!try_split_folio(folio, split_folios, mode)) { - nr_failed++; - stats->nr_thp_failed += is_thp; -diff --git a/mm/migrate_device.c b/mm/migrate_device.c -index 6d66dc1c6ffa..8f875636b35b 100644 ---- a/mm/migrate_device.c -+++ b/mm/migrate_device.c -@@ -424,7 +424,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, - continue; - - folio = page_folio(page); -- remove_migration_ptes(folio, folio, false); -+ remove_migration_ptes(folio, folio, 0); - - src_pfns[i] = 0; - folio_unlock(folio); -@@ -837,7 +837,7 @@ void migrate_device_finalize(unsigned long *src_pfns, - - src = page_folio(page); - dst = page_folio(newpage); -- remove_migration_ptes(src, dst, false); -+ remove_migration_ptes(src, dst, 0); - folio_unlock(src); - - if (is_zone_device_page(page)) -diff --git a/mm/rmap.c b/mm/rmap.c -index 2490e727e2dc..77b5185058b4 100644 ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -1566,8 +1566,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, - * Check partially_mapped first to ensure it is a large folio. - */ - if (folio_test_anon(folio) && partially_mapped && -- list_empty(&folio->_deferred_list)) -- deferred_split_folio(folio); -+ !folio_test_partially_mapped(folio)) -+ deferred_split_folio(folio, true); -+ - } - __folio_mod_stat(folio, -nr, -nr_pmdmapped); - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 35b67785907b..ca76f7df2d54 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -1232,7 +1232,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, - * Split partially mapped folios right away. - * We can free the unmapped pages without IO. - */ -- if (data_race(!list_empty(&folio->_deferred_list)) && -+ if (data_race(!list_empty(&folio->_deferred_list) && -+ folio_test_partially_mapped(folio)) && - split_folio_to_list(folio, folio_list)) - goto activate_locked; - } -diff --git a/mm/vmstat.c b/mm/vmstat.c -index e875f2a4915f..6c48f75eefa9 100644 ---- a/mm/vmstat.c -+++ b/mm/vmstat.c -@@ -1384,6 +1384,7 @@ const char * const vmstat_text[] = { - "thp_split_page", - "thp_split_page_failed", - "thp_deferred_split_page", -+ "thp_underused_split_page", - "thp_split_pmd", - "thp_scan_exceed_none_pte", - "thp_scan_exceed_swap_pte", -diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c -index e5e8dafc9d94..eb6d1b9fc362 100644 ---- a/tools/testing/selftests/mm/split_huge_page_test.c -+++ b/tools/testing/selftests/mm/split_huge_page_test.c -@@ -84,6 +84,76 @@ static void write_debugfs(const char *fmt, ...) 
- write_file(SPLIT_DEBUGFS, input, ret + 1); - } - -+static char *allocate_zero_filled_hugepage(size_t len) -+{ -+ char *result; -+ size_t i; -+ -+ result = memalign(pmd_pagesize, len); -+ if (!result) { -+ printf("Fail to allocate memory\n"); -+ exit(EXIT_FAILURE); -+ } -+ -+ madvise(result, len, MADV_HUGEPAGE); -+ -+ for (i = 0; i < len; i++) -+ result[i] = (char)0; -+ -+ return result; -+} -+ -+static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len) -+{ -+ unsigned long rss_anon_before, rss_anon_after; -+ size_t i; -+ -+ if (!check_huge_anon(one_page, 4, pmd_pagesize)) { -+ printf("No THP is allocated\n"); -+ exit(EXIT_FAILURE); -+ } -+ -+ rss_anon_before = rss_anon(); -+ if (!rss_anon_before) { -+ printf("No RssAnon is allocated before split\n"); -+ exit(EXIT_FAILURE); -+ } -+ -+ /* split all THPs */ -+ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, -+ (uint64_t)one_page + len, 0); -+ -+ for (i = 0; i < len; i++) -+ if (one_page[i] != (char)0) { -+ printf("%ld byte corrupted\n", i); -+ exit(EXIT_FAILURE); -+ } -+ -+ if (!check_huge_anon(one_page, 0, pmd_pagesize)) { -+ printf("Still AnonHugePages not split\n"); -+ exit(EXIT_FAILURE); -+ } -+ -+ rss_anon_after = rss_anon(); -+ if (rss_anon_after >= rss_anon_before) { -+ printf("Incorrect RssAnon value. Before: %ld After: %ld\n", -+ rss_anon_before, rss_anon_after); -+ exit(EXIT_FAILURE); -+ } -+} -+ -+void split_pmd_zero_pages(void) -+{ -+ char *one_page; -+ int nr_hpages = 4; -+ size_t len = nr_hpages * pmd_pagesize; -+ -+ one_page = allocate_zero_filled_hugepage(len); -+ verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); -+ printf("Split zero filled huge pages successful\n"); -+ free(one_page); -+} -+ - void split_pmd_thp(void) - { - char *one_page; -@@ -431,6 +501,7 @@ int main(int argc, char **argv) - - fd_size = 2 * pmd_pagesize; - -+ split_pmd_zero_pages(); - split_pmd_thp(); - split_pte_mapped_thp(); - split_file_backed_thp(); -diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c -index 5a62530da3b5..d8d0cf04bb57 100644 ---- a/tools/testing/selftests/mm/vm_util.c -+++ b/tools/testing/selftests/mm/vm_util.c -@@ -12,6 +12,7 @@ - - #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" - #define SMAP_FILE_PATH "/proc/self/smaps" -+#define STATUS_FILE_PATH "/proc/self/status" - #define MAX_LINE_LENGTH 500 - - unsigned int __page_size; -@@ -171,6 +172,27 @@ uint64_t read_pmd_pagesize(void) - return strtoul(buf, NULL, 10); - } - -+unsigned long rss_anon(void) -+{ -+ unsigned long rss_anon = 0; -+ FILE *fp; -+ char buffer[MAX_LINE_LENGTH]; -+ -+ fp = fopen(STATUS_FILE_PATH, "r"); -+ if (!fp) -+ ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH); -+ -+ if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer))) -+ goto err_out; -+ -+ if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1) -+ ksft_exit_fail_msg("Reading status error\n"); -+ -+err_out: -+ fclose(fp); -+ return rss_anon; -+} -+ - bool __check_huge(void *addr, char *pattern, int nr_hpages, - uint64_t hpage_size) - { -diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h -index 9007c420d52c..2eaed8209925 100644 ---- a/tools/testing/selftests/mm/vm_util.h -+++ b/tools/testing/selftests/mm/vm_util.h -@@ -39,6 +39,7 @@ unsigned long pagemap_get_pfn(int fd, char *start); - void clear_softdirty(void); - bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); - 
uint64_t read_pmd_pagesize(void); -+unsigned long rss_anon(void); - bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); - bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); - bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); --- -2.47.0.rc0 - -From 6401d4b492055092a6ef1946c026e64ba06cc0ec Mon Sep 17 00:00:00 2001 -From: Eric Naim -Date: Tue, 22 Oct 2024 22:53:58 +0800 +From 42c5adef2d2bb73955bbbbee8b713a87fdd025f8 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 4 Nov 2024 14:52:12 +0100 Subject: [PATCH 13/13] zstd -Signed-off-by: Eric Naim +Signed-off-by: Peter Jung --- include/linux/zstd.h | 2 +- include/linux/zstd_errors.h | 23 +- @@ -29516,7 +37974,7 @@ Signed-off-by: Eric Naim create mode 100644 lib/zstd/common/bits.h diff --git a/include/linux/zstd.h b/include/linux/zstd.h -index 113408eef6ec..f109d49f43f8 100644 +index b2c7cf310c8f..ac59ae9a18d7 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -1,6 +1,6 @@ @@ -32933,7 +41391,7 @@ index 5042ff870308..57462466e188 100644 +#endif /* HUF_H_298734234 */ diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h -index 1d9cc03924ca..2e91e7780c1f 100644 +index c22a2e69bf46..d9bd752fe17b 100644 --- a/lib/zstd/common/mem.h +++ b/lib/zstd/common/mem.h @@ -1,6 +1,6 @@ @@ -34413,7 +42871,7 @@ index 74ef0db47621..0b229f5d2ae2 100644 } - diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c -index f620cafca633..0d139727cd39 100644 +index 16bb995bc6c4..885167f7e47b 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -1,5 +1,6 @@ @@ -37465,16 +45923,16 @@ index f620cafca633..0d139727cd39 100644 FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= (size_t)(U32)-1); cdict->dictID = (U32)dictID; -@@ -4811,7 +5450,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( - cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, - customMem); +@@ -4813,7 +5452,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + if (!cdict) + return NULL; - if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, dictLoadMethod, dictContentType, cctxParams) )) { -@@ -4906,6 +5545,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( +@@ -4908,6 +5547,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( params.cParams = cParams; params.useRowMatchFinder = useRowMatchFinder; cdict->useRowMatchFinder = useRowMatchFinder; @@ -37482,7 +45940,7 @@ index f620cafca633..0d139727cd39 100644 if (ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, -@@ -4985,12 +5625,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( +@@ -4987,12 +5627,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( /* ZSTD_compressBegin_usingCDict() : * cdict must be != NULL */ @@ -37501,7 +45959,7 @@ index f620cafca633..0d139727cd39 100644 /*! ZSTD_compress_usingCDict_internal(): * Implementation of various ZSTD_compress_usingCDict* functions. */ -@@ -5000,7 +5645,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, +@@ -5002,7 +5647,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) { FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ @@ -37510,7 +45968,7 @@ index f620cafca633..0d139727cd39 100644 } /*! 
ZSTD_compress_usingCDict_advanced(): -@@ -5197,30 +5842,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) +@@ -5199,30 +5844,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) { @@ -37564,7 +46022,7 @@ index f620cafca633..0d139727cd39 100644 if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { assert(zcs->inBuff != NULL); assert(zcs->inBuffSize > 0); -@@ -5229,8 +5885,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5231,8 +5887,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, assert(zcs->outBuff != NULL); assert(zcs->outBuffSize > 0); } @@ -37576,7 +46034,7 @@ index f620cafca633..0d139727cd39 100644 assert((U32)flushMode <= (U32)ZSTD_e_end); while (someMoreWork) { -@@ -5245,7 +5903,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5247,7 +5905,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ && (zcs->inBuffPos == 0) ) { /* shortcut to compression pass directly into output buffer */ @@ -37585,7 +46043,7 @@ index f620cafca633..0d139727cd39 100644 op, oend-op, ip, iend-ip); DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); -@@ -5262,8 +5920,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5264,8 +5922,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, zcs->inBuff + zcs->inBuffPos, toLoad, ip, iend-ip); zcs->inBuffPos += loaded; @@ -37595,7 +46053,7 @@ index f620cafca633..0d139727cd39 100644 if ( (flushMode == ZSTD_e_continue) && (zcs->inBuffPos < zcs->inBuffTarget) ) { /* not enough input to fill full block : stop here */ -@@ -5274,6 +5931,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5276,6 +5933,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, /* empty */ someMoreWork = 0; break; } @@ -37616,7 +46074,7 @@ index f620cafca633..0d139727cd39 100644 } /* compress current block (note : this stage cannot be stopped in the middle) */ DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); -@@ -5281,9 +5952,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5283,9 +5954,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, void* cDst; size_t cSize; size_t oSize = oend-op; @@ -37628,7 +46086,7 @@ index f620cafca633..0d139727cd39 100644 if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) cDst = op; /* compress into output buffer, to skip flush stage */ else -@@ -5291,9 +5961,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5293,9 +5963,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, if (inputBuffered) { unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); cSize = lastBlock ? @@ -37640,7 +46098,7 @@ index f620cafca633..0d139727cd39 100644 zcs->inBuff + zcs->inToCompress, iSize); FORWARD_IF_ERROR(cSize, "%s", lastBlock ? 
"ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); zcs->frameEnded = lastBlock; -@@ -5306,19 +5976,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5308,19 +5978,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, if (!lastBlock) assert(zcs->inBuffTarget <= zcs->inBuffSize); zcs->inToCompress = zcs->inBuffPos; @@ -37666,7 +46124,7 @@ index f620cafca633..0d139727cd39 100644 } if (cDst == op) { /* no need to flush */ op += cSize; -@@ -5388,8 +6055,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf +@@ -5390,8 +6057,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf /* After a compression call set the expected input/output buffer. * This is validated at the start of the next compression call. */ @@ -37678,7 +46136,7 @@ index f620cafca633..0d139727cd39 100644 if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { cctx->expectedInBuffer = *input; } -@@ -5408,22 +6077,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, +@@ -5410,22 +6079,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, { if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ZSTD_inBuffer const expect = cctx->expectedInBuffer; @@ -37707,7 +46165,7 @@ index f620cafca633..0d139727cd39 100644 ZSTD_CCtx_params params = cctx->requestedParams; ZSTD_prefixDict const prefixDict = cctx->prefixDict; FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ -@@ -5437,9 +6106,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, +@@ -5439,9 +6108,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, params.compressionLevel = cctx->cdict->compressionLevel; } DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); @@ -37720,7 +46178,7 @@ index f620cafca633..0d139727cd39 100644 ? prefixDict.dictSize : (cctx->cdict ? 
cctx->cdict->dictContentSize : 0); ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); -@@ -5451,6 +6120,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, +@@ -5453,6 +6122,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); @@ -37730,7 +46188,7 @@ index f620cafca633..0d139727cd39 100644 { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); -@@ -5477,6 +6149,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, +@@ -5479,6 +6151,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, return 0; } @@ -37739,7 +46197,7 @@ index f620cafca633..0d139727cd39 100644 size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input, -@@ -5491,8 +6165,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, +@@ -5493,8 +6167,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, /* transparent initialization stage */ if (cctx->streamStage == zcss_init) { @@ -37769,7 +46227,7 @@ index f620cafca633..0d139727cd39 100644 } /* end of transparent initialization stage */ -@@ -5510,13 +6203,20 @@ size_t ZSTD_compressStream2_simpleArgs ( +@@ -5512,13 +6205,20 @@ size_t ZSTD_compressStream2_simpleArgs ( const void* src, size_t srcSize, size_t* srcPos, ZSTD_EndDirective endOp) { @@ -37796,7 +46254,7 @@ index f620cafca633..0d139727cd39 100644 } size_t ZSTD_compress2(ZSTD_CCtx* cctx, -@@ -5539,6 +6239,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5541,6 +6241,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, /* Reset to the original values. 
*/ cctx->requestedParams.inBufferMode = originalInBufferMode; cctx->requestedParams.outBufferMode = originalOutBufferMode; @@ -37804,7 +46262,7 @@ index f620cafca633..0d139727cd39 100644 FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); if (result != 0) { /* compression not completed, due to lack of output space */ assert(oPos == dstCapacity); -@@ -5549,64 +6250,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5551,64 +6252,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, } } @@ -37889,7 +46347,7 @@ index f620cafca633..0d139727cd39 100644 if (cctx->cdict) { dictSize = (U32)cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { -@@ -5615,25 +6313,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +@@ -5617,25 +6315,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, dictSize = 0; } ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); @@ -37954,7 +46412,7 @@ index f620cafca633..0d139727cd39 100644 ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); if (inSeqs[idx].litLength) { -@@ -5642,26 +6370,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +@@ -5644,26 +6372,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ip += inSeqs[idx].litLength; seqPos->posInSrc += inSeqs[idx].litLength; } @@ -37984,7 +46442,7 @@ index f620cafca633..0d139727cd39 100644 { U32 idx = seqPos->idx; U32 startPosInSequence = seqPos->posInSequence; -@@ -5673,6 +6390,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5675,6 +6392,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* U32 bytesAdjustment = 0; U32 finalMatchSplit = 0; @@ -37994,7 +46452,7 @@ index f620cafca633..0d139727cd39 100644 if (cctx->cdict) { dictSize = cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { -@@ -5680,7 +6400,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5682,7 +6402,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* } else { dictSize = 0; } @@ -38003,7 +46461,7 @@ index f620cafca633..0d139727cd39 100644 DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { -@@ -5688,7 +6408,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5690,7 +6410,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* U32 litLength = currSeq.litLength; U32 matchLength = currSeq.matchLength; U32 const rawOffset = currSeq.offset; @@ -38012,7 +46470,7 @@ index f620cafca633..0d139727cd39 100644 /* Modify the sequence depending on where endPosInSequence lies */ if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { -@@ -5702,7 +6422,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5704,7 +6424,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* /* Move to the next sequence */ endPosInSequence -= currSeq.litLength + currSeq.matchLength; startPosInSequence = 0; @@ -38020,7 +46478,7 @@ index f620cafca633..0d139727cd39 100644 } else { /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence does not reach the end of the match. 
So, we have to split the sequence */ -@@ -5742,21 +6461,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5744,21 +6463,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* } /* Check if this offset can be represented with a repcode */ { U32 const ll0 = (litLength == 0); @@ -38051,7 +46509,7 @@ index f620cafca633..0d139727cd39 100644 } DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); -@@ -5779,7 +6500,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5781,7 +6502,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, @@ -38060,7 +46518,7 @@ index f620cafca633..0d139727cd39 100644 static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) { ZSTD_sequenceCopier sequenceCopier = NULL; -@@ -5793,6 +6514,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) +@@ -5795,6 +6516,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) return sequenceCopier; } @@ -38118,7 +46576,7 @@ index f620cafca633..0d139727cd39 100644 /* Compress, block-by-block, all of the sequences given. * * Returns the cumulative size of all compressed blocks (including their headers), -@@ -5805,9 +6577,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5807,9 +6579,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, const void* src, size_t srcSize) { size_t cSize = 0; @@ -38128,7 +46586,7 @@ index f620cafca633..0d139727cd39 100644 size_t remaining = srcSize; ZSTD_sequencePosition seqPos = {0, 0, 0}; -@@ -5827,22 +6596,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5829,22 +6598,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, } while (remaining) { @@ -38164,7 +46622,7 @@ index f620cafca633..0d139727cd39 100644 cSize += cBlockSize; ip += blockSize; op += cBlockSize; -@@ -5851,6 +6627,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5853,6 +6629,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, continue; } @@ -38172,7 +46630,7 @@ index f620cafca633..0d139727cd39 100644 compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, &cctx->appliedParams, -@@ -5859,11 +6636,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5861,11 +6638,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, cctx->bmi2); FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); @@ -38186,7 +46644,7 @@ index f620cafca633..0d139727cd39 100644 /* We don't want to emit our first block as a RLE even if it qualifies because * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
* This is only an issue for zstd <= v1.4.3 -@@ -5874,12 +6651,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5876,12 +6653,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, if (compressedSeqsSize == 0) { /* ZSTD_noCompressBlock writes the block header as well */ cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); @@ -38203,7 +46661,7 @@ index f620cafca633..0d139727cd39 100644 } else { U32 cBlockHeader; /* Error checking and repcodes update */ -@@ -5891,11 +6668,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5893,11 +6670,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); MEM_writeLE24(op, cBlockHeader); cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; @@ -38216,7 +46674,7 @@ index f620cafca633..0d139727cd39 100644 if (lastBlock) { break; -@@ -5906,12 +6682,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5908,12 +6684,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, dstCapacity -= cBlockSize; cctx->isFirstBlock = 0; } @@ -38233,7 +46691,7 @@ index f620cafca633..0d139727cd39 100644 const ZSTD_Sequence* inSeqs, size_t inSeqsSize, const void* src, size_t srcSize) { -@@ -5921,7 +6700,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci +@@ -5923,7 +6702,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci size_t frameHeaderSize = 0; /* Transparent initialization stage, same as compressStream2() */ @@ -38242,7 +46700,7 @@ index f620cafca633..0d139727cd39 100644 assert(cctx != NULL); FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); /* Begin writing output, starting with frame header */ -@@ -5949,26 +6728,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci +@@ -5951,26 +6730,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci cSize += 4; } @@ -38281,7 +46739,7 @@ index f620cafca633..0d139727cd39 100644 if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ /* single thread mode : attempt to calculate remaining to flush more precisely */ { size_t const lastBlockSize = zcs->frameEnded ? 
0 : ZSTD_BLOCKHEADERSIZE; -@@ -6090,7 +6877,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, +@@ -6092,7 +6879,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, cp.targetLength = (unsigned)(-clampedCompressionLevel); } /* refine parameters based on srcSize & dictSize */ @@ -38290,7 +46748,7 @@ index f620cafca633..0d139727cd39 100644 } } -@@ -6125,3 +6912,29 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH +@@ -6127,3 +6914,29 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); } @@ -48063,7 +56521,7 @@ index 22686e367e6f..466828e35752 100644 MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("Zstd Common"); diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c -index 04e1b5c01d9b..8ecf43226af2 100644 +index bd8784449b31..ceaf352d03e2 100644 --- a/lib/zstd/zstd_compress_module.c +++ b/lib/zstd/zstd_compress_module.c @@ -1,6 +1,6 @@ @@ -48075,7 +56533,7 @@ index 04e1b5c01d9b..8ecf43226af2 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c -index f4ed952ed485..7d31518e9d5a 100644 +index 469fc3059be0..0ae819f0c927 100644 --- a/lib/zstd/zstd_decompress_module.c +++ b/lib/zstd/zstd_decompress_module.c @@ -1,6 +1,6 @@ @@ -48086,7 +56544,7 @@ index f4ed952ed485..7d31518e9d5a 100644 * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the -@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream); +@@ -113,7 +113,7 @@ EXPORT_SYMBOL(zstd_init_dstream); size_t zstd_reset_dstream(zstd_dstream *dstream) { @@ -48096,4 +56554,5 @@ index f4ed952ed485..7d31518e9d5a 100644 EXPORT_SYMBOL(zstd_reset_dstream); -- -2.47.0.rc0 +2.47.0 +
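
Editor's note (illustrative only, not part of the patch): the hunks above rework how ZSTD_compressSequences() validates externally supplied ZSTD_Sequence arrays, splits them into blocks, and copies them into the seqStore (ZSTD_copySequencesToSeqStoreExplicitBlockDelim / ...NoBlockDelim). The sketch below shows how that entry point is driven from upstream zstd's static-only (experimental) userspace API; it is a minimal example under the assumption of upstream zstd >= 1.5 with ZSTD_STATIC_LINKING_ONLY, and it is not exposed through the kernel's zstd_* wrapper layer touched in lib/zstd/zstd_compress_module.c.

/*
 * Minimal sketch (assumptions: upstream libzstd >= 1.5, statically linked,
 * ZSTD_STATIC_LINKING_ONLY for the experimental sequence API). It hand-builds
 * a tiny sequence list with an explicit block delimiter and feeds it through
 * ZSTD_compressSequences(), the path refactored by the hunks above.
 */
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* 16 bytes: 4 literal bytes followed by a 12-byte match at offset 4. */
	const char src[] = "abcdabcdabcdabcd";
	size_t const srcSize = sizeof(src) - 1;

	/* litLength + matchLength of all sequences must cover srcSize; the
	 * trailing all-zero entry is the explicit block delimiter. */
	ZSTD_Sequence seqs[2] = {
		{ .offset = 4, .litLength = 4, .matchLength = 12, .rep = 0 },
		{ .offset = 0, .litLength = 0, .matchLength = 0,  .rep = 0 },
	};

	size_t const dstCap = ZSTD_compressBound(srcSize);
	void *dst = malloc(dstCap);
	char out[sizeof(src)];
	ZSTD_CCtx *cctx = ZSTD_createCCtx();
	size_t cSize, dSize;

	if (!dst || !cctx)
		return 1;

	/* Tell the sequence copier we pass explicit block delimiters, and ask
	 * it to validate offsets/lengths (the checks added in the hunks). */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters,
			       ZSTD_sf_explicitBlockDelimiters);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_validateSequences, 1);

	cSize = ZSTD_compressSequences(cctx, dst, dstCap, seqs, 2, src, srcSize);
	if (ZSTD_isError(cSize)) {
		fprintf(stderr, "compressSequences: %s\n", ZSTD_getErrorName(cSize));
		return 1;
	}

	/* Round-trip with the regular decoder to confirm the frame is valid. */
	dSize = ZSTD_decompress(out, sizeof(out), dst, cSize);
	if (ZSTD_isError(dSize) || dSize != srcSize ||
	    memcmp(out, src, srcSize) != 0)
		return 1;

	printf("%zu -> %zu bytes, round-trip OK\n", srcSize, cSize);
	ZSTD_freeCCtx(cctx);
	free(dst);
	return 0;
}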