From 46657044ecf2aeb2f18a862e2a143a812299dc46 Mon Sep 17 00:00:00 2001 From: ferreo Date: Fri, 8 Nov 2024 21:20:42 +0000 Subject: [PATCH] 6.11.7 --- .github/release-nest-v3 | 2 +- VERSION | 2 +- patches/0001-cachyos-base-all.patch | 1273 ++++++++++++++++++++++----- patches/0003-bore-cachy-ext.patch | 165 ++-- 4 files changed, 1090 insertions(+), 352 deletions(-) diff --git a/.github/release-nest-v3 b/.github/release-nest-v3 index 56a6051..d8263ee 100644 --- a/.github/release-nest-v3 +++ b/.github/release-nest-v3 @@ -1 +1 @@ -1 \ No newline at end of file +2 \ No newline at end of file diff --git a/VERSION b/VERSION index 42033bf..25fbb41 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -6.11.6 +6.11.7 diff --git a/patches/0001-cachyos-base-all.patch b/patches/0001-cachyos-base-all.patch index 7485c5a..4b519e9 100644 --- a/patches/0001-cachyos-base-all.patch +++ b/patches/0001-cachyos-base-all.patch @@ -1,7 +1,7 @@ -From e3a84aa467e6f1e7f6c082e31c5840e97ae7a295 Mon Sep 17 00:00:00 2001 +From 388846a7197e09827139dd2e32554cf564cee8d7 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:17:26 +0100 -Subject: [PATCH 01/12] address-masking +Date: Fri, 8 Nov 2024 16:53:48 +0100 +Subject: [PATCH 01/13] address-masking Signed-off-by: Peter Jung --- @@ -57,10 +57,10 @@ index 39c7cf82b0c2..43844510d5d0 100644 -- 2.47.0 -From 6fa826b951d38ade0a5b6a1f44e6f2d93f4f5941 Mon Sep 17 00:00:00 2001 +From 540411a33c5579d41cde12dba682c174197a958c Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:12:40 +0100 -Subject: [PATCH 02/12] amd-cache-optimizer +Date: Fri, 8 Nov 2024 16:54:07 +0100 +Subject: [PATCH 02/13] amd-cache-optimizer Signed-off-by: Peter Jung --- @@ -350,10 +350,10 @@ index 000000000000..679613d02b9a -- 2.47.0 -From 22e62dc7c2fe659336c934f0f57556f65c60c085 Mon Sep 17 00:00:00 2001 +From cc249b74bd76342d380a67f5ba39401d42bfd99b Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:13:11 +0100 -Subject: [PATCH 03/12] amd-pstate +Date: Fri, 8 Nov 2024 16:54:24 +0100 +Subject: [PATCH 03/13] amd-pstate Signed-off-by: Peter Jung --- @@ -361,7 +361,7 @@ Signed-off-by: Peter Jung arch/x86/include/asm/cpufeatures.h | 3 +- arch/x86/include/asm/intel-family.h | 6 + arch/x86/include/asm/processor.h | 21 +- - arch/x86/include/asm/topology.h | 9 + + arch/x86/include/asm/topology.h | 11 + arch/x86/kernel/acpi/cppc.c | 195 ++++++++++++++- arch/x86/kernel/cpu/amd.c | 16 -- arch/x86/kernel/cpu/debugfs.c | 1 + @@ -369,11 +369,13 @@ Signed-off-by: Peter Jung arch/x86/kernel/cpu/topology_amd.c | 3 + arch/x86/kernel/cpu/topology_common.c | 34 +++ arch/x86/kernel/smpboot.c | 5 +- + drivers/acpi/cppc_acpi.c | 6 - + drivers/acpi/processor_driver.c | 1 + drivers/cpufreq/acpi-cpufreq.c | 12 +- drivers/cpufreq/amd-pstate.c | 261 ++++++-------------- include/acpi/cppc_acpi.h | 41 ++- tools/arch/x86/include/asm/cpufeatures.h | 2 +- - 16 files changed, 394 insertions(+), 233 deletions(-) + 18 files changed, 397 insertions(+), 239 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index d0324d44f548..210a808b74ec 100644 @@ -484,7 +486,7 @@ index a75a07f4931f..e17f4d733e44 100644 static inline void amd_check_microcode(void) { } #endif diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h -index abe3a8f22cbd..94d9832a5bc8 100644 +index abe3a8f22cbd..ea0b3fa8914e 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -114,6 +114,12 @@ enum x86_topology_domains { @@ -510,6 +512,15 @@ index abe3a8f22cbd..94d9832a5bc8 100644 static inline unsigned int topology_max_packages(void) { return __max_logical_packages; +@@ -295,6 +304,8 @@ extern void arch_scale_freq_tick(void); + #ifdef CONFIG_ACPI_CPPC_LIB + void init_freq_invariance_cppc(void); + #define arch_init_invariance_cppc init_freq_invariance_cppc ++#else ++static inline void arch_init_invariance_cppc(void) { } + #endif + + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index ff8f25faca3d..59edf64ad9ed 100644 --- a/arch/x86/kernel/acpi/cppc.c @@ -896,6 +907,42 @@ index 390e4fe7433e..9ee84f58f3b4 100644 return 0; } +diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c +index 544f53ae9cc0..e74f4dda1a76 100644 +--- a/drivers/acpi/cppc_acpi.c ++++ b/drivers/acpi/cppc_acpi.c +@@ -671,10 +671,6 @@ static int pcc_data_alloc(int pcc_ss_id) + * ) + */ + +-#ifndef arch_init_invariance_cppc +-static inline void arch_init_invariance_cppc(void) { } +-#endif +- + /** + * acpi_cppc_processor_probe - Search for per CPU _CPC objects. + * @pr: Ptr to acpi_processor containing this CPU's logical ID. +@@ -905,8 +901,6 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) + goto out_free; + } + +- arch_init_invariance_cppc(); +- + kfree(output.pointer); + return 0; + +diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c +index cb52dd000b95..59620e7bc664 100644 +--- a/drivers/acpi/processor_driver.c ++++ b/drivers/acpi/processor_driver.c +@@ -270,6 +270,7 @@ static int __init acpi_processor_driver_init(void) + NULL, acpi_soft_cpu_dead); + + acpi_processor_throttling_init(); ++ arch_init_invariance_cppc(); + return 0; + err: + driver_unregister(&acpi_processor_driver); diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index a8ca625a98b8..0f04feb6cafa 100644 --- a/drivers/cpufreq/acpi-cpufreq.c @@ -1455,7 +1502,7 @@ index 929b9097a6c1..0532e913705c 100644 /* enable amd pstate feature */ diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h -index e1720d930666..76e44e102780 100644 +index a451ca4c207b..62d368bcd9ec 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -161,34 +161,37 @@ extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); @@ -1565,10 +1612,10 @@ index dd4682857c12..23698d0f4bb4 100644 -- 2.47.0 -From 234fb5cdb099e93cc6c8882b5165ddb78beeef29 Mon Sep 17 00:00:00 2001 +From c22fef0041e87b7577f076e2a524b295ae0d3195 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:13:22 +0100 -Subject: [PATCH 04/12] bbr3 +Date: Fri, 8 Nov 2024 16:55:11 +0100 +Subject: [PATCH 04/13] bbr3 Signed-off-by: Peter Jung --- @@ -4951,10 +4998,10 @@ index 4d40615dc8fc..f27941201ef2 100644 -- 2.47.0 -From c3d18557f4f71abdc434c928fd22d75e74ee781b Mon Sep 17 00:00:00 2001 +From 13131fcbfd0af9c911cf7cf7623652a6123aeee0 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:14:43 +0100 -Subject: [PATCH 05/12] cachy +Date: Fri, 8 Nov 2024 16:56:22 +0100 +Subject: [PATCH 05/13] cachy Signed-off-by: Peter Jung --- @@ -5050,7 +5097,7 @@ index be010fec7654..900113802ffc 100644 Safety option to keep boot IRQs enabled. This should never be necessary. diff --git a/Makefile b/Makefile -index 318a5d60088e..1c1773ba7867 100644 +index 692bbdf40fb5..c89daa69f457 100644 --- a/Makefile +++ b/Makefile @@ -803,11 +803,19 @@ KBUILD_CFLAGS += -fno-delete-null-pointer-checks @@ -6092,10 +6139,10 @@ index d5d6ab484e5a..dccba7bcdf97 100644 } diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -index 87672ca714de..21442469791c 100644 +index 80e60ea2d11e..51dea35848f6 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -@@ -2762,7 +2762,10 @@ int smu_get_power_limit(void *handle, +@@ -2775,7 +2775,10 @@ int smu_get_power_limit(void *handle, *limit = smu->max_power_limit; break; case SMU_PPT_LIMIT_MIN: @@ -6107,7 +6154,7 @@ index 87672ca714de..21442469791c 100644 break; default: return -EINVAL; -@@ -2786,7 +2789,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit) +@@ -2799,7 +2802,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit) if (smu->ppt_funcs->set_power_limit) return smu->ppt_funcs->set_power_limit(smu, limit_type, limit); @@ -10744,12 +10791,12 @@ index 38ef6d06888e..0f78364efd4f 100644 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c -index 6b97fb2ac4af..003de4829c15 100644 +index dc08a2374733..49ae2412d5b5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c -@@ -105,6 +105,10 @@ - #include +@@ -106,6 +106,10 @@ #include + #include +#ifdef CONFIG_USER_NS +#include @@ -10758,7 +10805,7 @@ index 6b97fb2ac4af..003de4829c15 100644 #include #include #include -@@ -2135,6 +2139,10 @@ __latent_entropy struct task_struct *copy_process( +@@ -2136,6 +2140,10 @@ __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -10769,7 +10816,7 @@ index 6b97fb2ac4af..003de4829c15 100644 /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. -@@ -3283,6 +3291,12 @@ int ksys_unshare(unsigned long unshare_flags) +@@ -3285,6 +3293,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -10805,7 +10852,7 @@ index 33cac79e3994..3277df47ab3c 100644 return state; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 1d2cbdb162a6..91b242e47db7 100644 +index 425348b8d9eb..180643d6197d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; @@ -11002,7 +11049,7 @@ index 4430ac68e4c4..3bd08b60a9b3 100644 EXPORT_SYMBOL_GPL(dirty_writeback_interval); diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 91ace8ca97e2..f8b4dae35fc3 100644 +index ec459522c293..e0f083a5b7d3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -271,7 +271,11 @@ const char * const migratetype_names[MIGRATE_TYPES] = { @@ -11055,10 +11102,10 @@ index bd5183dfd879..3a410f53a07c 100644 /* diff --git a/mm/vmscan.c b/mm/vmscan.c -index 128f307da6ee..35b67785907b 100644 +index f5bcd08527ae..83c66d0267c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -199,7 +199,11 @@ struct scan_control { +@@ -200,7 +200,11 @@ struct scan_control { /* * From 0 .. MAX_SWAPPINESS. Higher means more swappy. */ @@ -11070,7 +11117,7 @@ index 128f307da6ee..35b67785907b 100644 #ifdef CONFIG_MEMCG -@@ -3968,7 +3972,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc +@@ -3974,7 +3978,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ @@ -11216,22 +11263,20 @@ index 663ce300dd06..f83493838cf9 100644 -- 2.47.0 -From d9c17d5d901674319171a09d1f01a6df1e57f195 Mon Sep 17 00:00:00 2001 +From 9cb83850dc8662524d629fc34e92b3bfcc50183a Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:14:54 +0100 -Subject: [PATCH 06/12] fixes +Date: Fri, 8 Nov 2024 17:43:05 +0100 +Subject: [PATCH 06/13] fixes Signed-off-by: Peter Jung --- - arch/Kconfig | 4 +- - drivers/bluetooth/btusb.c | 4 ++ - drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 +++ - drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 15 ++++++- - drivers/gpu/drm/drm_edid.c | 47 ++++++++++++++++++++-- - drivers/net/wireless/realtek/rtw89/pci.c | 48 +++++++++++++++++++---- - mm/mmap.c | 3 +- - mm/shrinker.c | 8 ++-- - 8 files changed, 117 insertions(+), 17 deletions(-) + arch/Kconfig | 4 +- + arch/x86/kernel/cpu/amd.c | 11 +++++ + drivers/bluetooth/btusb.c | 4 ++ + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ++ + .../drm/amd/display/dc/bios/bios_parser2.c | 4 +- + drivers/gpu/drm/drm_edid.c | 47 +++++++++++++++++-- + 6 files changed, 69 insertions(+), 6 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 975dd22a2dbd..de69b8f5b5be 100644 @@ -11255,6 +11300,28 @@ index 975dd22a2dbd..de69b8f5b5be 100644 depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS help This value can be used to select the number of bits to use to +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index fab5caec0b72..823f44f7bc94 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -924,6 +924,17 @@ static void init_amd_zen4(struct cpuinfo_x86 *c) + { + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); ++ ++ /* ++ * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE ++ * in some BIOS versions but they can lead to random host reboots. ++ */ ++ switch (c->x86_model) { ++ case 0x18 ... 0x1f: ++ case 0x60 ... 0x7f: ++ clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD); ++ break; ++ } + } + + static void init_amd_zen5(struct cpuinfo_x86 *c) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 2408e50743ca..73c54e92afa9 100644 --- a/drivers/bluetooth/btusb.c @@ -11286,46 +11353,21 @@ index 9c3b7b027485..ad5c05ee92f3 100644 /* let modprobe override vga console setting */ return pci_register_driver(&amdgpu_kms_pci_driver); -diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -index 21442469791c..51dea35848f6 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -@@ -1234,6 +1234,14 @@ static void smu_init_xgmi_plpd_mode(struct smu_context *smu) - } - } +diff --git a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c +index 0d8498ab9b23..be8fbb04ad98 100644 +--- a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c ++++ b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c +@@ -3127,7 +3127,9 @@ static enum bp_result bios_parser_get_vram_info( + struct atom_data_revision revision; -+static bool smu_is_workload_profile_available(struct smu_context *smu, -+ u32 profile) -+{ -+ if (profile >= PP_SMC_POWER_PROFILE_COUNT) -+ return false; -+ return smu->workload_map && smu->workload_map[profile].valid_mapping; -+} -+ - static int smu_sw_init(void *handle) - { - struct amdgpu_device *adev = (struct amdgpu_device *)handle; -@@ -1257,7 +1265,6 @@ static int smu_sw_init(void *handle) - atomic_set(&smu->smu_power.power_gate.vpe_gated, 1); - atomic_set(&smu->smu_power.power_gate.umsch_mm_gated, 1); + // vram info moved to umc_info for DCN4x +- if (info && DATA_TABLES(umc_info)) { ++ if (dcb->ctx->dce_version >= DCN_VERSION_4_01 && ++ dcb->ctx->dce_version < DCN_VERSION_MAX && ++ info && DATA_TABLES(umc_info)) { + header = GET_IMAGE(struct atom_common_table_header, + DATA_TABLES(umc_info)); -- smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT]; - smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT] = 0; - smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D] = 1; - smu->workload_prority[PP_SMC_POWER_PROFILE_POWERSAVING] = 2; -@@ -1266,6 +1273,12 @@ static int smu_sw_init(void *handle) - smu->workload_prority[PP_SMC_POWER_PROFILE_COMPUTE] = 5; - smu->workload_prority[PP_SMC_POWER_PROFILE_CUSTOM] = 6; - -+ if (smu->is_apu || -+ !smu_is_workload_profile_available(smu, PP_SMC_POWER_PROFILE_FULLSCREEN3D)) -+ smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT]; -+ else -+ smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D]; -+ - smu->workload_setting[0] = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; - smu->workload_setting[1] = PP_SMC_POWER_PROFILE_FULLSCREEN3D; - smu->workload_setting[2] = PP_SMC_POWER_PROFILE_POWERSAVING; diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index f68a41eeb1fa..6cd386d0fccb 100644 --- a/drivers/gpu/drm/drm_edid.c @@ -11419,140 +11461,13 @@ index f68a41eeb1fa..6cd386d0fccb 100644 if (!newmode) continue; -diff --git a/drivers/net/wireless/realtek/rtw89/pci.c b/drivers/net/wireless/realtek/rtw89/pci.c -index 02afeb3acce4..5aef7fa37878 100644 ---- a/drivers/net/wireless/realtek/rtw89/pci.c -+++ b/drivers/net/wireless/realtek/rtw89/pci.c -@@ -3026,24 +3026,54 @@ static void rtw89_pci_declaim_device(struct rtw89_dev *rtwdev, - pci_disable_device(pdev); - } - --static void rtw89_pci_cfg_dac(struct rtw89_dev *rtwdev) -+static bool rtw89_pci_chip_is_manual_dac(struct rtw89_dev *rtwdev) - { -- struct rtw89_pci *rtwpci = (struct rtw89_pci *)rtwdev->priv; - const struct rtw89_chip_info *chip = rtwdev->chip; - -- if (!rtwpci->enable_dac) -- return; -- - switch (chip->chip_id) { - case RTL8852A: - case RTL8852B: - case RTL8851B: - case RTL8852BT: -- break; -+ return true; - default: -- return; -+ return false; -+ } -+} -+ -+static bool rtw89_pci_is_dac_compatible_bridge(struct rtw89_dev *rtwdev) -+{ -+ struct rtw89_pci *rtwpci = (struct rtw89_pci *)rtwdev->priv; -+ struct pci_dev *bridge = pci_upstream_bridge(rtwpci->pdev); -+ -+ if (!rtw89_pci_chip_is_manual_dac(rtwdev)) -+ return true; -+ -+ if (!bridge) -+ return false; -+ -+ switch (bridge->vendor) { -+ case PCI_VENDOR_ID_INTEL: -+ return true; -+ case PCI_VENDOR_ID_ASMEDIA: -+ if (bridge->device == 0x2806) -+ return true; -+ break; - } - -+ return false; -+} -+ -+static void rtw89_pci_cfg_dac(struct rtw89_dev *rtwdev) -+{ -+ struct rtw89_pci *rtwpci = (struct rtw89_pci *)rtwdev->priv; -+ -+ if (!rtwpci->enable_dac) -+ return; -+ -+ if (!rtw89_pci_chip_is_manual_dac(rtwdev)) -+ return; -+ - rtw89_pci_config_byte_set(rtwdev, RTW89_PCIE_L1_CTRL, RTW89_PCIE_BIT_EN_64BITS); - } - -@@ -3061,6 +3091,9 @@ static int rtw89_pci_setup_mapping(struct rtw89_dev *rtwdev, - goto err; - } - -+ if (!rtw89_pci_is_dac_compatible_bridge(rtwdev)) -+ goto no_dac; -+ - ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(36)); - if (!ret) { - rtwpci->enable_dac = true; -@@ -3073,6 +3106,7 @@ static int rtw89_pci_setup_mapping(struct rtw89_dev *rtwdev, - goto err_release_regions; - } - } -+no_dac: - - resource_len = pci_resource_len(pdev, bar_id); - rtwpci->mmap = pci_iomap(pdev, bar_id, resource_len); -diff --git a/mm/mmap.c b/mm/mmap.c -index 18fddcce03b8..8a04f29aa423 100644 ---- a/mm/mmap.c -+++ b/mm/mmap.c -@@ -1952,7 +1952,8 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, - - if (get_area) { - addr = get_area(file, addr, len, pgoff, flags); -- } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { -+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) -+ && IS_ALIGNED(len, PMD_SIZE)) { - /* Ensures that larger anonymous mappings are THP aligned. */ - addr = thp_get_unmapped_area_vmflags(file, addr, len, - pgoff, flags, vm_flags); -diff --git a/mm/shrinker.c b/mm/shrinker.c -index dc5d2a6fcfc4..4a93fd433689 100644 ---- a/mm/shrinker.c -+++ b/mm/shrinker.c -@@ -76,19 +76,21 @@ void free_shrinker_info(struct mem_cgroup *memcg) - - int alloc_shrinker_info(struct mem_cgroup *memcg) - { -- struct shrinker_info *info; - int nid, ret = 0; - int array_size = 0; - - mutex_lock(&shrinker_mutex); - array_size = shrinker_unit_size(shrinker_nr_max); - for_each_node(nid) { -- info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); -+ struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size, -+ GFP_KERNEL, nid); - if (!info) - goto err; - info->map_nr_max = shrinker_nr_max; -- if (shrinker_unit_alloc(info, NULL, nid)) -+ if (shrinker_unit_alloc(info, NULL, nid)) { -+ kvfree(info); - goto err; -+ } - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); - } - mutex_unlock(&shrinker_mutex); -- 2.47.0 -From 19c7eed02bcf056464e7a65fef6df3ee828843b1 Mon Sep 17 00:00:00 2001 +From f16d50e4fc59e2c7f3ef308730ba77afdcac2878 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:15:05 +0100 -Subject: [PATCH 07/12] intel-pstate +Date: Fri, 8 Nov 2024 17:36:26 +0100 +Subject: [PATCH 07/13] intel-pstate Signed-off-by: Peter Jung --- @@ -11562,7 +11477,7 @@ Signed-off-by: Peter Jung 3 files changed, 328 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h -index 94d9832a5bc8..9f9376db64e3 100644 +index ea0b3fa8914e..7f6c39b3be7b 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -291,9 +291,22 @@ static inline long arch_scale_freq_capacity(int cpu) @@ -12033,10 +11948,10 @@ index 348a330678bd..c11be253bfa3 100644 -- 2.47.0 -From e5f623fa904e2cd611cf7e5c90dcdb9555909b6e Mon Sep 17 00:00:00 2001 +From c0242f5a4a52500a19004b54029c624bc2f36433 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:15:23 +0100 -Subject: [PATCH 08/12] ksm +Date: Fri, 8 Nov 2024 17:37:00 +0100 +Subject: [PATCH 08/13] ksm Signed-off-by: Peter Jung --- @@ -12466,10 +12381,10 @@ index 01071182763e..7394bad8178e 100644 -- 2.47.0 -From c3e54867da3e3b223071b86ea497f21c551774eb Mon Sep 17 00:00:00 2001 +From 3b74aac80ad5d4d1bc6b73c6050634865d58f0cd Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:15:35 +0100 -Subject: [PATCH 09/12] ntsync +Date: Fri, 8 Nov 2024 17:37:17 +0100 +Subject: [PATCH 09/13] ntsync Signed-off-by: Peter Jung --- @@ -15555,10 +15470,900 @@ index 000000000000..5fa2c9a0768c -- 2.47.0 -From 669ed9174a34e7240a8364a3fae9d1d23c55087a Mon Sep 17 00:00:00 2001 +From 9ddf5856d1be7d4bd93ca2cca7a9c21337969c6d Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:16:21 +0100 -Subject: [PATCH 10/12] t2 +Date: Fri, 8 Nov 2024 17:37:42 +0100 +Subject: [PATCH 10/13] perf-per-core + +Signed-off-by: Peter Jung +--- + Documentation/arch/x86/topology.rst | 4 + + arch/x86/events/rapl.c | 408 ++++++++++++++++++-------- + arch/x86/include/asm/processor.h | 1 + + arch/x86/include/asm/topology.h | 1 + + arch/x86/kernel/cpu/debugfs.c | 1 + + arch/x86/kernel/cpu/topology_common.c | 1 + + 6 files changed, 288 insertions(+), 128 deletions(-) + +diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst +index 7352ab89a55a..c12837e61bda 100644 +--- a/Documentation/arch/x86/topology.rst ++++ b/Documentation/arch/x86/topology.rst +@@ -135,6 +135,10 @@ Thread-related topology information in the kernel: + The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo + "core_id." + ++ - topology_logical_core_id(); ++ ++ The logical core ID to which a thread belongs. ++ + + + System topology examples +diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c +index a481a939862e..6b405bf46781 100644 +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -39,6 +39,10 @@ + * event: rapl_energy_psys + * perf code: 0x5 + * ++ * per_core counter: consumption of a single physical core ++ * event: rapl_energy_per_core (power_per_core PMU) ++ * perf code: 0x1 ++ * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * +@@ -70,18 +74,25 @@ MODULE_LICENSE("GPL"); + /* + * RAPL energy status counters + */ +-enum perf_rapl_events { ++enum perf_rapl_pkg_events { + PERF_RAPL_PP0 = 0, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + +- PERF_RAPL_MAX, +- NR_RAPL_DOMAINS = PERF_RAPL_MAX, ++ PERF_RAPL_PKG_EVENTS_MAX, ++ NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, ++}; ++ ++enum perf_rapl_core_events { ++ PERF_RAPL_PER_CORE = 0, /* per-core */ ++ ++ PERF_RAPL_CORE_EVENTS_MAX, ++ NR_RAPL_CORE_DOMAINS = PERF_RAPL_CORE_EVENTS_MAX, + }; + +-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { ++static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", +@@ -89,6 +100,10 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { + "psys", + }; + ++static const char *const rapl_core_domain_names[NR_RAPL_CORE_DOMAINS] __initconst = { ++ "per-core", ++}; ++ + /* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved +@@ -128,8 +143,9 @@ struct rapl_pmu { + + struct rapl_pmus { + struct pmu pmu; ++ cpumask_t cpumask; + unsigned int nr_rapl_pmu; +- struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu); ++ struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); + }; + + enum rapl_unit_quirk { +@@ -139,19 +155,22 @@ enum rapl_unit_quirk { + }; + + struct rapl_model { +- struct perf_msr *rapl_msrs; +- unsigned long events; ++ struct perf_msr *rapl_pkg_msrs; ++ struct perf_msr *rapl_core_msrs; ++ unsigned long pkg_events; ++ unsigned long core_events; + unsigned int msr_power_unit; + enum rapl_unit_quirk unit_quirk; + }; + + /* 1/2^hw_unit Joule */ +-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; +-static struct rapl_pmus *rapl_pmus; +-static cpumask_t rapl_cpu_mask; +-static unsigned int rapl_cntr_mask; ++static int rapl_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; ++static struct rapl_pmus *rapl_pmus_pkg; ++static struct rapl_pmus *rapl_pmus_core; ++static unsigned int rapl_pkg_cntr_mask; ++static unsigned int rapl_core_cntr_mask; + static u64 rapl_timer_ms; +-static struct perf_msr *rapl_msrs; ++static struct rapl_model *rapl_model; + + /* + * Helper functions to get the correct topology macros according to the +@@ -177,7 +196,8 @@ static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ +- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL; ++ return rapl_pmu_idx < rapl_pmus_pkg->nr_rapl_pmu ? ++ rapl_pmus_pkg->rapl_pmu[rapl_pmu_idx] : NULL; + } + + static inline u64 rapl_read_counter(struct perf_event *event) +@@ -189,7 +209,7 @@ static inline u64 rapl_read_counter(struct perf_event *event) + + static inline u64 rapl_scale(u64 v, int cfg) + { +- if (cfg > NR_RAPL_DOMAINS) { ++ if (cfg > NR_RAPL_PKG_DOMAINS) { + pr_warn("Invalid domain %d, failed to scale data\n", cfg); + return v; + } +@@ -241,34 +261,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) + + static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) + { +- struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); ++ struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); + struct perf_event *event; + unsigned long flags; + +- if (!pmu->n_active) ++ if (!rapl_pmu->n_active) + return HRTIMER_NORESTART; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + +- list_for_each_entry(event, &pmu->active_list, active_entry) ++ list_for_each_entry(event, &rapl_pmu->active_list, active_entry) + rapl_event_update(event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + +- hrtimer_forward_now(hrtimer, pmu->timer_interval); ++ hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval); + + return HRTIMER_RESTART; + } + +-static void rapl_hrtimer_init(struct rapl_pmu *pmu) ++static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) + { +- struct hrtimer *hr = &pmu->hrtimer; ++ struct hrtimer *hr = &rapl_pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; + } + +-static void __rapl_pmu_event_start(struct rapl_pmu *pmu, ++static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, + struct perf_event *event) + { + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +@@ -276,39 +296,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + + event->hw.state = 0; + +- list_add_tail(&event->active_entry, &pmu->active_list); ++ list_add_tail(&event->active_entry, &rapl_pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + +- pmu->n_active++; +- if (pmu->n_active == 1) +- rapl_start_hrtimer(pmu); ++ rapl_pmu->n_active++; ++ if (rapl_pmu->n_active == 1) ++ rapl_start_hrtimer(rapl_pmu); + } + + static void rapl_pmu_event_start(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); +- __rapl_pmu_event_start(pmu, event); +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); ++ __rapl_pmu_event_start(rapl_pmu, event); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static void rapl_pmu_event_stop(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { +- WARN_ON_ONCE(pmu->n_active <= 0); +- pmu->n_active--; +- if (pmu->n_active == 0) +- hrtimer_cancel(&pmu->hrtimer); ++ WARN_ON_ONCE(rapl_pmu->n_active <= 0); ++ rapl_pmu->n_active--; ++ if (rapl_pmu->n_active == 0) ++ hrtimer_cancel(&rapl_pmu->hrtimer); + + list_del(&event->active_entry); + +@@ -326,23 +346,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) + hwc->state |= PERF_HES_UPTODATE; + } + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static int rapl_pmu_event_add(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) +- __rapl_pmu_event_start(pmu, event); ++ __rapl_pmu_event_start(rapl_pmu, event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + + return 0; + } +@@ -356,10 +376,14 @@ static int rapl_pmu_event_init(struct perf_event *event) + { + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, ret = 0; +- struct rapl_pmu *pmu; ++ struct rapl_pmu *rapl_pmu; ++ struct rapl_pmus *curr_rapl_pmus; + + /* only look at RAPL events */ +- if (event->attr.type != rapl_pmus->pmu.type) ++ if (event->attr.type == rapl_pmus_pkg->pmu.type || ++ (rapl_pmus_core && event->attr.type == rapl_pmus_core->pmu.type)) ++ curr_rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu); ++ else + return -ENOENT; + + /* check only supported bits are set */ +@@ -369,16 +393,18 @@ static int rapl_pmu_event_init(struct perf_event *event) + if (event->cpu < 0) + return -EINVAL; + +- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; ++ if (curr_rapl_pmus == rapl_pmus_pkg) ++ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + +- if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) + return -EINVAL; + +- cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); ++ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); + bit = cfg - 1; + + /* check event supported */ +- if (!(rapl_cntr_mask & (1 << bit))) ++ if (!(rapl_pkg_cntr_mask & (1 << bit)) && ++ !(rapl_core_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ +@@ -386,12 +412,18 @@ static int rapl_pmu_event_init(struct perf_event *event) + return -EINVAL; + + /* must be done before validate_group */ +- pmu = cpu_to_rapl_pmu(event->cpu); +- if (!pmu) ++ if (curr_rapl_pmus == rapl_pmus_core) { ++ rapl_pmu = curr_rapl_pmus->rapl_pmu[topology_logical_core_id(event->cpu)]; ++ event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr; ++ } else { ++ rapl_pmu = curr_rapl_pmus->rapl_pmu[get_rapl_pmu_idx(event->cpu)]; ++ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; ++ } ++ ++ if (!rapl_pmu) + return -EINVAL; +- event->cpu = pmu->cpu; +- event->pmu_private = pmu; +- event->hw.event_base = rapl_msrs[bit].msr; ++ event->cpu = rapl_pmu->cpu; ++ event->pmu_private = rapl_pmu; + event->hw.config = cfg; + event->hw.idx = bit; + +@@ -406,7 +438,7 @@ static void rapl_pmu_event_read(struct perf_event *event) + static ssize_t rapl_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) + { +- return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); ++ return cpumap_print_to_pagebuf(true, buf, &rapl_pmus_pkg->cpumask); + } + + static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); +@@ -420,17 +452,38 @@ static struct attribute_group rapl_pmu_attr_group = { + .attrs = rapl_pmu_attrs, + }; + ++static ssize_t rapl_get_attr_per_core_cpumask(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return cpumap_print_to_pagebuf(true, buf, &rapl_pmus_core->cpumask); ++} ++ ++static struct device_attribute dev_attr_per_core_cpumask = __ATTR(cpumask, 0444, ++ rapl_get_attr_per_core_cpumask, ++ NULL); ++ ++static struct attribute *rapl_pmu_per_core_attrs[] = { ++ &dev_attr_per_core_cpumask.attr, ++ NULL, ++}; ++ ++static struct attribute_group rapl_pmu_per_core_attr_group = { ++ .attrs = rapl_pmu_per_core_attrs, ++}; ++ + RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); + RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); + RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); + RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); ++RAPL_EVENT_ATTR_STR(energy-per-core, rapl_per_core, "event=0x01"); + + RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); ++RAPL_EVENT_ATTR_STR(energy-per-core.unit, rapl_per_core_unit, "Joules"); + + /* + * we compute in 0.23 nJ increments regardless of MSR +@@ -440,6 +493,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890 + RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); ++RAPL_EVENT_ATTR_STR(energy-per-core.scale, rapl_per_core_scale, "2.3283064365386962890625e-10"); + + /* + * There are no default events, but we need to create +@@ -473,6 +527,13 @@ static const struct attribute_group *rapl_attr_groups[] = { + NULL, + }; + ++static const struct attribute_group *rapl_per_core_attr_groups[] = { ++ &rapl_pmu_per_core_attr_group, ++ &rapl_pmu_format_group, ++ &rapl_pmu_events_group, ++ NULL, ++}; ++ + static struct attribute *rapl_events_cores[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_cores_unit), +@@ -533,6 +594,18 @@ static struct attribute_group rapl_events_psys_group = { + .attrs = rapl_events_psys, + }; + ++static struct attribute *rapl_events_per_core[] = { ++ EVENT_PTR(rapl_per_core), ++ EVENT_PTR(rapl_per_core_unit), ++ EVENT_PTR(rapl_per_core_scale), ++ NULL, ++}; ++ ++static struct attribute_group rapl_events_per_core_group = { ++ .name = "events", ++ .attrs = rapl_events_per_core, ++}; ++ + static bool test_msr(int idx, void *data) + { + return test_bit(idx, (unsigned long *) data); +@@ -558,11 +631,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = { + }; + + /* +- * Force to PERF_RAPL_MAX size due to: +- * - perf_msr_probe(PERF_RAPL_MAX) ++ * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: ++ * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) + * - want to use same event codes across both architectures + */ +-static struct perf_msr amd_rapl_msrs[] = { ++static struct perf_msr amd_rapl_pkg_msrs[] = { + [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, +@@ -570,77 +643,104 @@ static struct perf_msr amd_rapl_msrs[] = { + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, + }; + +-static int rapl_cpu_offline(unsigned int cpu) ++static struct perf_msr amd_rapl_core_msrs[] = { ++ [PERF_RAPL_PER_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_per_core_group, ++ test_msr, false, RAPL_MSR_MASK }, ++}; ++ ++static int __rapl_cpu_offline(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_idx, ++ const struct cpumask *event_cpumask, unsigned int cpu) + { +- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); ++ struct rapl_pmu *rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; + int target; + + /* Check if exiting cpu is used for collecting rapl events */ +- if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) ++ if (!cpumask_test_and_clear_cpu(cpu, &rapl_pmus->cpumask)) + return 0; + +- pmu->cpu = -1; ++ rapl_pmu->cpu = -1; + /* Find a new cpu to collect rapl events */ +- target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu); ++ target = cpumask_any_but(event_cpumask, cpu); + + /* Migrate rapl events to the new target */ + if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &rapl_cpu_mask); +- pmu->cpu = target; +- perf_pmu_migrate_context(pmu->pmu, cpu, target); ++ cpumask_set_cpu(target, &rapl_pmus->cpumask); ++ rapl_pmu->cpu = target; ++ perf_pmu_migrate_context(rapl_pmu->pmu, cpu, target); + } + return 0; + } + +-static int rapl_cpu_online(unsigned int cpu) ++static int rapl_cpu_offline(unsigned int cpu) + { +- s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu); +- if (rapl_pmu_idx < 0) { +- pr_err("topology_logical_(package/die)_id() returned a negative value"); +- return -EINVAL; +- } +- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); ++ int ret = __rapl_cpu_offline(rapl_pmus_pkg, get_rapl_pmu_idx(cpu), ++ get_rapl_pmu_cpumask(cpu), cpu); ++ ++ if (ret == 0 && rapl_model->core_events) ++ ret = __rapl_cpu_offline(rapl_pmus_core, topology_logical_core_id(cpu), ++ topology_sibling_cpumask(cpu), cpu); ++ ++ return ret; ++} ++ ++static int __rapl_cpu_online(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_idx, ++ const struct cpumask *event_cpumask, unsigned int cpu) ++{ ++ struct rapl_pmu *rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; + int target; + +- if (!pmu) { +- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); +- if (!pmu) ++ if (!rapl_pmu) { ++ rapl_pmu = kzalloc_node(sizeof(*rapl_pmu), GFP_KERNEL, cpu_to_node(cpu)); ++ if (!rapl_pmu) + return -ENOMEM; + +- raw_spin_lock_init(&pmu->lock); +- INIT_LIST_HEAD(&pmu->active_list); +- pmu->pmu = &rapl_pmus->pmu; +- pmu->timer_interval = ms_to_ktime(rapl_timer_ms); +- rapl_hrtimer_init(pmu); ++ raw_spin_lock_init(&rapl_pmu->lock); ++ INIT_LIST_HEAD(&rapl_pmu->active_list); ++ rapl_pmu->pmu = &rapl_pmus->pmu; ++ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); ++ rapl_hrtimer_init(rapl_pmu); + +- rapl_pmus->pmus[rapl_pmu_idx] = pmu; ++ rapl_pmus->rapl_pmu[rapl_pmu_idx] = rapl_pmu; + } + + /* + * Check if there is an online cpu in the package which collects rapl + * events already. + */ +- target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu)); ++ target = cpumask_any_and(&rapl_pmus->cpumask, event_cpumask); + if (target < nr_cpu_ids) + return 0; + +- cpumask_set_cpu(cpu, &rapl_cpu_mask); +- pmu->cpu = cpu; ++ cpumask_set_cpu(cpu, &rapl_pmus->cpumask); ++ rapl_pmu->cpu = cpu; + return 0; + } + +-static int rapl_check_hw_unit(struct rapl_model *rm) ++static int rapl_cpu_online(unsigned int cpu) ++{ ++ int ret = __rapl_cpu_online(rapl_pmus_pkg, get_rapl_pmu_idx(cpu), ++ get_rapl_pmu_cpumask(cpu), cpu); ++ ++ if (ret == 0 && rapl_model->core_events) ++ ret = __rapl_cpu_online(rapl_pmus_core, topology_logical_core_id(cpu), ++ topology_sibling_cpumask(cpu), cpu); ++ ++ return ret; ++} ++ ++ ++static int rapl_check_hw_unit(void) + { + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ +- if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) ++ if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) + return -1; +- for (i = 0; i < NR_RAPL_DOMAINS; i++) ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) + rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + +- switch (rm->unit_quirk) { ++ switch (rapl_model->unit_quirk) { + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be + * different than the unit from power unit MSR. See +@@ -679,22 +779,29 @@ static void __init rapl_advertise(void) + int i; + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", +- hweight32(rapl_cntr_mask), rapl_timer_ms); ++ hweight32(rapl_pkg_cntr_mask) + hweight32(rapl_core_cntr_mask), rapl_timer_ms); ++ ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { ++ if (rapl_pkg_cntr_mask & (1 << i)) { ++ pr_info("hw unit of domain %s 2^-%d Joules\n", ++ rapl_pkg_domain_names[i], rapl_hw_unit[i]); ++ } ++ } + +- for (i = 0; i < NR_RAPL_DOMAINS; i++) { +- if (rapl_cntr_mask & (1 << i)) { ++ for (i = 0; i < NR_RAPL_CORE_DOMAINS; i++) { ++ if (rapl_core_cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", +- rapl_domain_names[i], rapl_hw_unit[i]); ++ rapl_core_domain_names[i], rapl_hw_unit[i]); + } + } + } + +-static void cleanup_rapl_pmus(void) ++static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) + { + int i; + + for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) +- kfree(rapl_pmus->pmus[i]); ++ kfree(rapl_pmus->rapl_pmu[i]); + kfree(rapl_pmus); + } + +@@ -707,14 +814,17 @@ static const struct attribute_group *rapl_attr_update[] = { + NULL, + }; + +-static int __init init_rapl_pmus(void) +-{ +- int nr_rapl_pmu = topology_max_packages(); ++static const struct attribute_group *rapl_per_core_attr_update[] = { ++ &rapl_events_per_core_group, ++}; + +- if (!rapl_pmu_is_pkg_scope()) +- nr_rapl_pmu *= topology_max_dies_per_package(); ++static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int nr_rapl_pmu, ++ const struct attribute_group **rapl_attr_groups, ++ const struct attribute_group **rapl_attr_update) ++{ ++ struct rapl_pmus *rapl_pmus; + +- rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); ++ rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + +@@ -730,75 +840,80 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; ++ ++ *rapl_pmus_ptr = rapl_pmus; ++ + return 0; + } + + static struct rapl_model model_snb = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_snbep = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsw = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsx = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_knl = { +- .events = BIT(PERF_RAPL_PKG) | ++ .pkg_events = BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_skl = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1) | + BIT(PERF_RAPL_PSYS), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_spr = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PSYS), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_spr_msrs, ++ .rapl_pkg_msrs = intel_rapl_spr_msrs, + }; + + static struct rapl_model model_amd_hygon = { +- .events = BIT(PERF_RAPL_PKG), ++ .pkg_events = BIT(PERF_RAPL_PKG), ++ .core_events = BIT(PERF_RAPL_PER_CORE), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, +- .rapl_msrs = amd_rapl_msrs, ++ .rapl_pkg_msrs = amd_rapl_pkg_msrs, ++ .rapl_core_msrs = amd_rapl_core_msrs, + }; + + static const struct x86_cpu_id rapl_model_match[] __initconst = { +@@ -854,28 +969,47 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); + static int __init rapl_pmu_init(void) + { + const struct x86_cpu_id *id; +- struct rapl_model *rm; + int ret; ++ int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); ++ int nr_cores = topology_max_packages() * topology_num_cores_per_package(); ++ ++ if (rapl_pmu_is_pkg_scope()) ++ nr_rapl_pmu = topology_max_packages(); + + id = x86_match_cpu(rapl_model_match); + if (!id) + return -ENODEV; + +- rm = (struct rapl_model *) id->driver_data; ++ rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_msrs = rm->rapl_msrs; ++ rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX, ++ false, (void *) &rapl_model->pkg_events); + +- rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, +- false, (void *) &rm->events); +- +- ret = rapl_check_hw_unit(rm); ++ ret = rapl_check_hw_unit(); + if (ret) + return ret; + +- ret = init_rapl_pmus(); ++ ret = init_rapl_pmus(&rapl_pmus_pkg, nr_rapl_pmu, rapl_attr_groups, rapl_attr_update); + if (ret) + return ret; + ++ if (rapl_model->core_events) { ++ rapl_core_cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs, ++ PERF_RAPL_CORE_EVENTS_MAX, false, ++ (void *) &rapl_model->core_events); ++ ++ ret = init_rapl_pmus(&rapl_pmus_core, nr_cores, ++ rapl_per_core_attr_groups, rapl_per_core_attr_update); ++ if (ret) { ++ /* ++ * If initialization of per_core PMU fails, reset per_core ++ * flag, and continue with power PMU initialization. ++ */ ++ pr_warn("Per-core PMU initialization failed (%d)\n", ret); ++ rapl_model->core_events = 0UL; ++ } ++ } ++ + /* + * Install callbacks. Core will call them for each online cpu. + */ +@@ -885,10 +1019,24 @@ static int __init rapl_pmu_init(void) + if (ret) + goto out; + +- ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); ++ ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); + if (ret) + goto out1; + ++ if (rapl_model->core_events) { ++ ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_per_core", -1); ++ if (ret) { ++ /* ++ * If registration of per_core PMU fails, cleanup per_core PMU ++ * variables, reset the per_core flag and keep the ++ * power PMU untouched. ++ */ ++ pr_warn("Per-core PMU registration failed (%d)\n", ret); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ rapl_model->core_events = 0UL; ++ } ++ } ++ + rapl_advertise(); + return 0; + +@@ -896,7 +1044,7 @@ static int __init rapl_pmu_init(void) + cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); + out: + pr_warn("Initialization failed (%d), disabled\n", ret); +- cleanup_rapl_pmus(); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + return ret; + } + module_init(rapl_pmu_init); +@@ -904,7 +1052,11 @@ module_init(rapl_pmu_init); + static void __exit intel_rapl_exit(void) + { + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); +- perf_pmu_unregister(&rapl_pmus->pmu); +- cleanup_rapl_pmus(); ++ perf_pmu_unregister(&rapl_pmus_pkg->pmu); ++ cleanup_rapl_pmus(rapl_pmus_pkg); ++ if (rapl_model->core_events) { ++ perf_pmu_unregister(&rapl_pmus_core->pmu); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } + } + module_exit(intel_rapl_exit); +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index e17f4d733e44..7e53b701bc27 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -98,6 +98,7 @@ struct cpuinfo_topology { + // Logical ID mappings + u32 logical_pkg_id; + u32 logical_die_id; ++ u32 logical_core_id; + + // AMD Node ID and Nodes per Package info + u32 amd_node_id; +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index 7f6c39b3be7b..f21f7979dd12 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); + #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) + #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) + #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) ++#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) + #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) + #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) + #define topology_ppin(cpu) (cpu_data(cpu).ppin) +diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c +index 10719aba6276..cacfd3f6abef 100644 +--- a/arch/x86/kernel/cpu/debugfs.c ++++ b/arch/x86/kernel/cpu/debugfs.c +@@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) + seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); ++ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); + seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); + seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); +diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c +index 8277c64f88db..b5a5e1411469 100644 +--- a/arch/x86/kernel/cpu/topology_common.c ++++ b/arch/x86/kernel/cpu/topology_common.c +@@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early) + if (!early) { + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); ++ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); + } + + /* Package relative core ID */ +-- +2.47.0 + +From b87a889f2caa3af28aace25417833dd8e24b7254 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 8 Nov 2024 17:38:09 +0100 +Subject: [PATCH 11/13] t2 Signed-off-by: Peter Jung --- @@ -25970,10 +26775,10 @@ index 4427572b2477..b60c99d61882 100755 -- 2.47.0 -From 7ec11666090bbffb316c27f68c9fbdb613c1a37d Mon Sep 17 00:00:00 2001 +From f59885cb7f117db011ba4e019cdbaef99f6ea4ab Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:16:34 +0100 -Subject: [PATCH 11/12] thp-shrinker +Date: Fri, 8 Nov 2024 17:38:31 +0100 +Subject: [PATCH 12/13] thp-shrinker Signed-off-by: Peter Jung --- @@ -26432,7 +27237,7 @@ index 4cba91ecf74b..ee490f1e7de2 100644 folio = NULL; diff --git a/mm/migrate.c b/mm/migrate.c -index 368ab3878fa6..d3a66f1a621b 100644 +index 75b858bd6aa5..a43bad6bd4e0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -177,13 +177,57 @@ void putback_movable_pages(struct list_head *l) @@ -26599,10 +27404,10 @@ index 6d66dc1c6ffa..8f875636b35b 100644 if (is_zone_device_page(page)) diff --git a/mm/rmap.c b/mm/rmap.c -index 2490e727e2dc..77b5185058b4 100644 +index 3d89847f01da..bcf689c3e297 100644 --- a/mm/rmap.c +++ b/mm/rmap.c -@@ -1566,8 +1566,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, +@@ -1578,8 +1578,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, * Check partially_mapped first to ensure it is a large folio. */ if (folio_test_anon(folio) && partially_mapped && @@ -26615,10 +27420,10 @@ index 2490e727e2dc..77b5185058b4 100644 __folio_mod_stat(folio, -nr, -nr_pmdmapped); diff --git a/mm/vmscan.c b/mm/vmscan.c -index 35b67785907b..ca76f7df2d54 100644 +index 83c66d0267c0..6f4a3ab7217d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -1232,7 +1232,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, +@@ -1238,7 +1238,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, * Split partially mapped folios right away. * We can free the unmapped pages without IO. */ @@ -26784,10 +27589,10 @@ index 9007c420d52c..2eaed8209925 100644 -- 2.47.0 -From 683fbaa60b17e0a380f04f8f3f1c5cd0d4095241 Mon Sep 17 00:00:00 2001 +From c1b6dbd10a05045020ba333ab3045dd710d7f74f Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 1 Nov 2024 09:16:43 +0100 -Subject: [PATCH 12/12] zstd +Date: Fri, 8 Nov 2024 17:38:45 +0100 +Subject: [PATCH 13/13] zstd Signed-off-by: Peter Jung --- diff --git a/patches/0003-bore-cachy-ext.patch b/patches/0003-bore-cachy-ext.patch index 0720cda..a17edc9 100644 --- a/patches/0003-bore-cachy-ext.patch +++ b/patches/0003-bore-cachy-ext.patch @@ -1,6 +1,6 @@ -From 318c40e6ac298c062db3e34a9e94e75b81d3a653 Mon Sep 17 00:00:00 2001 +From 6ecc114fa063ac6afffe38eafb3f7fdbd3695a33 Mon Sep 17 00:00:00 2001 From: Eric Naim -Date: Mon, 28 Oct 2024 10:11:08 +0800 +Date: Thu, 7 Nov 2024 22:56:57 +0800 Subject: [PATCH] bore-cachy-ext Signed-off-by: Eric Naim @@ -8,16 +8,16 @@ Signed-off-by: Eric Naim include/linux/sched.h | 20 +- include/linux/sched/bore.h | 37 ++++ init/Kconfig | 17 ++ - kernel/Kconfig.hz | 43 +++++ + kernel/Kconfig.hz | 17 ++ kernel/fork.c | 5 + kernel/sched/Makefile | 1 + kernel/sched/bore.c | 381 +++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 7 + - kernel/sched/debug.c | 67 ++++++- - kernel/sched/fair.c | 114 +++++++++-- + kernel/sched/debug.c | 61 +++++- + kernel/sched/fair.c | 98 ++++++++-- kernel/sched/features.h | 4 + - kernel/sched/sched.h | 16 ++ - 12 files changed, 694 insertions(+), 18 deletions(-) + kernel/sched/sched.h | 9 + + 12 files changed, 639 insertions(+), 18 deletions(-) create mode 100644 include/linux/sched/bore.h create mode 100644 kernel/sched/bore.c @@ -136,10 +136,10 @@ index e1a88d48d652..3aea8e43c360 100644 bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 0f78364efd4f..4cf2d88916bd 100644 +index 0f78364efd4f..83a6b919ab29 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz -@@ -79,5 +79,48 @@ config HZ +@@ -79,5 +79,22 @@ config HZ default 750 if HZ_750 default 1000 if HZ_1000 @@ -159,32 +159,6 @@ index 0f78364efd4f..4cf2d88916bd 100644 + Setting this value too high can cause the system to boot with + an unnecessarily large base slice, resulting in high scheduling + latency and poor system responsiveness. -+ -+config MIGRATION_COST_BASE_NS -+ int "Default value for migration_cost_base_ns" -+ default 300000 -+ help -+ The BORE Scheduler automatically calculates the optimal -+ migration_cost_ns using the following equation: -+ -+ migration_cost_ns = -+ migration_cost_base_ns + ilog2(ncpus) * migration_cost_step_ns -+ -+ This option sets the default migration_cost_base_ns -+ to be used in the automatic calculation. -+ -+config MIGRATION_COST_STEP_NS -+ int "Default value for migration_cost_step_ns" -+ default 50000 -+ help -+ The BORE Scheduler automatically calculates the optimal -+ migration_cost_ns using the following equation: -+ -+ migration_cost_ns = -+ migration_cost_base_ns + ilog2(ncpus) * migration_cost_step_ns -+ -+ This option sets the default migration_cost_step_ns -+ to be used in the automatic calculation. + config SCHED_HRTICK def_bool HIGH_RES_TIMERS @@ -608,7 +582,7 @@ index 000000000000..cd7e8a8d6075 +#endif // CONFIG_SYSCTL +#endif // CONFIG_SCHED_BORE diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 8ae04bd4a5a4..896db098c4c5 100644 +index 8c045d05e4fe..9ff3a76daf21 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -97,6 +97,8 @@ @@ -625,7 +599,7 @@ index 8ae04bd4a5a4..896db098c4c5 100644 #endif +#ifdef CONFIG_SCHED_BORE -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.7.1 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.7.3 by Masahito Suzuki"); + init_task_bore(&init_task); +#endif // CONFIG_SCHED_BORE + @@ -633,10 +607,10 @@ index 8ae04bd4a5a4..896db098c4c5 100644 #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index c057ef46c5f8..b71ce5182500 100644 +index c057ef46c5f8..6201135dd6bb 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c -@@ -167,7 +167,55 @@ static const struct file_operations sched_feat_fops = { +@@ -167,7 +167,53 @@ static const struct file_operations sched_feat_fops = { }; #ifdef CONFIG_SMP @@ -684,15 +658,13 @@ index c057ef46c5f8..b71ce5182500 100644 +}; + +DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) -+DEFINE_SYSCTL_SCHED_FUNC(migration_cost_base, migration_cost) -+DEFINE_SYSCTL_SCHED_FUNC(migration_cost_step, migration_cost) +#undef DEFINE_SYSCTL_SCHED_FUNC +#else // !CONFIG_SCHED_BORE static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { -@@ -213,7 +261,7 @@ static const struct file_operations sched_scaling_fops = { +@@ -213,7 +259,7 @@ static const struct file_operations sched_scaling_fops = { .llseek = seq_lseek, .release = single_release, }; @@ -701,7 +673,7 @@ index c057ef46c5f8..b71ce5182500 100644 #endif /* SMP */ #ifdef CONFIG_PREEMPT_DYNAMIC -@@ -347,14 +395,25 @@ static __init int sched_init_debug(void) +@@ -347,13 +393,20 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif @@ -716,18 +688,13 @@ index c057ef46c5f8..b71ce5182500 100644 debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); #ifdef CONFIG_SMP -+#ifdef CONFIG_SCHED_BORE -+ debugfs_create_file("migration_cost_base_ns", 0644, debugfs_sched, NULL, &sched_migration_cost_base_fops); -+ debugfs_create_file("migration_cost_step_ns", 0644, debugfs_sched, NULL, &sched_migration_cost_step_fops); -+ debugfs_create_u32("migration_cost_ns", 0444, debugfs_sched, &sysctl_sched_migration_cost); -+#else // !CONFIG_SCHED_BORE ++#if !defined(CONFIG_SCHED_BORE) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); - debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); +#endif // CONFIG_SCHED_BORE + debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); - mutex_lock(&sched_domains_mutex); -@@ -596,6 +655,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -596,6 +649,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); @@ -737,7 +704,7 @@ index c057ef46c5f8..b71ce5182500 100644 #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif -@@ -1069,6 +1131,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, +@@ -1069,6 +1125,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.load.weight); #ifdef CONFIG_SMP @@ -748,7 +715,7 @@ index c057ef46c5f8..b71ce5182500 100644 P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index a36e37a674e8..bdd7366db711 100644 +index a36e37a674e8..1c8704b2c230 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -55,6 +55,8 @@ @@ -760,7 +727,7 @@ index a36e37a674e8..bdd7366db711 100644 /* * The initial- and re-scaling of tunables is configurable * -@@ -64,28 +66,38 @@ +@@ -64,28 +66,32 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * @@ -787,8 +754,8 @@ index a36e37a674e8..bdd7366db711 100644 -static unsigned int normalized_sysctl_sched_base_slice = 350000ULL; -#else +#ifdef CONFIG_SCHED_BORE -+const static uint nsecs_per_tick = 1000000000ULL / HZ; -+const_debug uint sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; ++static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; ++unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; +__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; +#else // !CONFIG_SCHED_BORE unsigned int sysctl_sched_base_slice = 750000ULL; @@ -799,18 +766,12 @@ index a36e37a674e8..bdd7366db711 100644 -#ifdef CONFIG_CACHY -const_debug unsigned int sysctl_sched_migration_cost = 300000UL; -#else -+#ifdef CONFIG_SCHED_BORE -+const_debug uint sysctl_sched_migration_cost_base = CONFIG_MIGRATION_COST_BASE_NS; -+const_debug uint sysctl_sched_migration_cost_step = CONFIG_MIGRATION_COST_STEP_NS; -+__read_mostly uint sysctl_sched_migration_cost = CONFIG_MIGRATION_COST_BASE_NS; -+#else // !CONFIG_SCHED_BORE const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -#endif -+#endif // CONFIG_SCHED_BORE static int __init setup_sched_thermal_decay_shift(char *str) { -@@ -130,12 +142,8 @@ int __weak arch_asym_cpu_priority(int cpu) +@@ -130,12 +136,8 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ @@ -823,48 +784,29 @@ index a36e37a674e8..bdd7366db711 100644 #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ -@@ -201,6 +209,18 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) +@@ -201,6 +203,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ +#ifdef CONFIG_SCHED_BORE -+static void auto_calculate_base_slice(void) { ++static void update_sysctl(void) { + sysctl_sched_base_slice = nsecs_per_tick * + max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); +} -+static void auto_calculate_migration_cost(void) { -+ sysctl_sched_migration_cost = sysctl_sched_migration_cost_base + -+ ilog2(num_online_cpus()) * sysctl_sched_migration_cost_step; -+} -+void sched_update_min_base_slice(void) { auto_calculate_base_slice(); } -+void sched_update_migration_cost(void) { auto_calculate_migration_cost(); } ++void sched_update_min_base_slice(void) { update_sysctl(); } +#else // !CONFIG_SCHED_BORE static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); -@@ -221,15 +241,21 @@ static unsigned int get_update_sysctl_factor(void) - - return factor; - } -+#endif // CONFIG_SCHED_BORE - - static void update_sysctl(void) - { -+#ifdef CONFIG_SCHED_BORE -+ auto_calculate_base_slice(); -+ auto_calculate_migration_cost(); -+#else // !CONFIG_SCHED_BORE - unsigned int factor = get_update_sysctl_factor(); - - #define SET_SYSCTL(name) \ - (sysctl_##name = (factor) * normalized_sysctl_##name) +@@ -231,6 +240,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL -+#endif // CONFIG_SCHED_BORE } ++#endif // CONFIG_SCHED_BORE void __init sched_init_granularity(void) -@@ -708,6 +734,9 @@ static s64 entity_lag(u64 avruntime, struct sched_entity *se) + { +@@ -708,6 +718,9 @@ static s64 entity_lag(u64 avruntime, struct sched_entity *se) vlag = avruntime - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); @@ -874,7 +816,7 @@ index a36e37a674e8..bdd7366db711 100644 return clamp(vlag, -limit, limit); } -@@ -909,6 +938,10 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +@@ -909,6 +922,10 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) * until it gets a new slice. See the HACK in set_next_entity(). */ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) @@ -885,7 +827,7 @@ index a36e37a674e8..bdd7366db711 100644 return curr; /* Pick the leftmost entity if it's eligible */ -@@ -967,6 +1000,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +@@ -967,6 +984,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ #ifdef CONFIG_SMP @@ -893,7 +835,7 @@ index a36e37a674e8..bdd7366db711 100644 int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); -@@ -978,6 +1012,7 @@ int sched_update_scaling(void) +@@ -978,6 +996,7 @@ int sched_update_scaling(void) return 0; } @@ -901,7 +843,7 @@ index a36e37a674e8..bdd7366db711 100644 #endif #endif -@@ -1178,6 +1213,10 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -1178,6 +1197,10 @@ static void update_curr(struct cfs_rq *cfs_rq) if (unlikely(delta_exec <= 0)) return; @@ -912,7 +854,7 @@ index a36e37a674e8..bdd7366db711 100644 curr->vruntime += calc_delta_fair(delta_exec, curr); update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); -@@ -3804,7 +3843,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime, +@@ -3804,7 +3827,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime, se->deadline = avruntime + vslice; } @@ -921,7 +863,7 @@ index a36e37a674e8..bdd7366db711 100644 unsigned long weight) { bool curr = cfs_rq->curr == se; -@@ -5212,6 +5251,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5212,6 +5235,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ @@ -931,7 +873,7 @@ index a36e37a674e8..bdd7366db711 100644 if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; -@@ -5282,6 +5324,16 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5282,6 +5308,16 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->vruntime = vruntime - lag; @@ -948,7 +890,7 @@ index a36e37a674e8..bdd7366db711 100644 /* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks -@@ -5391,6 +5443,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +@@ -5391,6 +5427,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -956,7 +898,7 @@ index a36e37a674e8..bdd7366db711 100644 int action = UPDATE_TG; if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) -@@ -5418,6 +5471,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5418,6 +5455,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); update_entity_lag(cfs_rq, se); @@ -968,7 +910,7 @@ index a36e37a674e8..bdd7366db711 100644 if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; -@@ -6869,6 +6927,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6869,6 +6911,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) bool was_sched_idle = sched_idle_rq(rq); util_est_dequeue(&rq->cfs, p); @@ -983,7 +925,7 @@ index a36e37a674e8..bdd7366db711 100644 for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); -@@ -8651,16 +8717,25 @@ static void yield_task_fair(struct rq *rq) +@@ -8651,16 +8701,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ @@ -1009,7 +951,7 @@ index a36e37a674e8..bdd7366db711 100644 /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() -@@ -12725,6 +12800,9 @@ static void task_fork_fair(struct task_struct *p) +@@ -12725,6 +12784,9 @@ static void task_fork_fair(struct task_struct *p) curr = cfs_rq->curr; if (curr) update_curr(cfs_rq); @@ -1019,7 +961,7 @@ index a36e37a674e8..bdd7366db711 100644 place_entity(cfs_rq, se, ENQUEUE_INITIAL); rq_unlock(rq, &rf); } -@@ -12837,6 +12915,10 @@ static void attach_task_cfs_rq(struct task_struct *p) +@@ -12837,6 +12899,10 @@ static void attach_task_cfs_rq(struct task_struct *p) static void switched_from_fair(struct rq *rq, struct task_struct *p) { @@ -1046,37 +988,28 @@ index 143f55df890b..e97b7b68bdd3 100644 /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 48d893de632b..8c3fa2ffa177 100644 +index 48d893de632b..1030725bacd7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -2065,7 +2065,12 @@ static inline void update_sched_domain_debugfs(void) { } +@@ -2065,7 +2065,11 @@ static inline void update_sched_domain_debugfs(void) { } static inline void dirty_sched_domain_sysctl(int cpu) { } #endif +#ifdef CONFIG_SCHED_BORE +extern void sched_update_min_base_slice(void); -+extern void sched_update_migration_cost(void); +#else // !CONFIG_SCHED_BORE extern int sched_update_scaling(void); +#endif // CONFIG_SCHED_BORE static inline const struct cpumask *task_user_cpus(struct task_struct *p) { -@@ -2735,9 +2740,20 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); - #endif - +@@ -2737,7 +2741,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; -+#ifdef CONFIG_SCHED_BORE -+extern const_debug unsigned int sysctl_sched_migration_cost_base; -+extern const_debug unsigned int sysctl_sched_migration_cost_step; -+extern __read_mostly unsigned int sysctl_sched_migration_cost; -+#else // !CONFIG_SCHED_BORE extern const_debug unsigned int sysctl_sched_migration_cost; -+#endif // CONFIG_SCHED_BORE +#ifdef CONFIG_SCHED_BORE -+extern const_debug unsigned int sysctl_sched_min_base_slice; -+extern __read_mostly unsigned int sysctl_sched_base_slice; ++extern unsigned int sysctl_sched_min_base_slice; ++extern __read_mostly uint sysctl_sched_base_slice; +#else // !CONFIG_SCHED_BORE extern unsigned int sysctl_sched_base_slice; +#endif // CONFIG_SCHED_BORE