From 2fd49cc4758d3ce835e4e4f574113d9b7958bc48 Mon Sep 17 00:00:00 2001 From: ferrreo Date: Sat, 16 Mar 2024 10:46:26 +0000 Subject: [PATCH] 6.8.1 stable --- .github/workflows/release.yml | 2 - VERSION | 2 +- config | 116 +- patches/cachyos/0001-bore-cachy.patch | 180 +- patches/cachyos/0001-cachyos-base-all.patch | 336 ++- patches/cachyos/0003-nvidia.patch | 230 ++ patches/cachyos/0004-intel.patch | 2203 +++++++++++++++++++ patches/series | 2 + scripts/source.sh | 5 +- 9 files changed, 2873 insertions(+), 203 deletions(-) create mode 100644 patches/cachyos/0003-nvidia.patch create mode 100644 patches/cachyos/0004-intel.patch diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4c32f4c..52c984d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,8 +14,6 @@ jobs: steps: - uses: actions/checkout@v3 - with: - ref: 6.8RC - name: Import GPG key id: import_gpg diff --git a/VERSION b/VERSION index 1269a95..5f6c086 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -6.8-rc6 +6.8.1 diff --git a/config b/config index 8bd93d6..53b954e 100644 --- a/config +++ b/config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 6.8.0-rc6 Kernel Configuration +# Linux/x86 6.8.1 Kernel Configuration # CONFIG_CC_VERSION_TEXT="gcc (GCC) 13.2.1 20230801" CONFIG_CC_IS_GCC=y @@ -16,10 +16,11 @@ CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT=y +CONFIG_GCC_ASM_GOTO_OUTPUT_WORKAROUND=y CONFIG_TOOLS_SUPPORT_RELR=y CONFIG_CC_HAS_ASM_INLINE=y CONFIG_CC_HAS_NO_PROFILE_FN_ATTR=y -CONFIG_PAHOLE_VERSION=125 +CONFIG_PAHOLE_VERSION=126 CONFIG_IRQ_WORK=y CONFIG_BUILDTIME_TABLE_SORT=y CONFIG_THREAD_INFO_IN_TASK=y @@ -137,6 +138,7 @@ CONFIG_PREEMPT_COUNT=y CONFIG_PREEMPTION=y CONFIG_PREEMPT_DYNAMIC=y CONFIG_SCHED_CORE=y +# CONFIG_SCHED_CLASS_EXT is not set # # CPU/Task time and stats accounting @@ -199,17 +201,16 @@ CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y # CONFIG_UCLAMP_TASK=y CONFIG_UCLAMP_BUCKETS_COUNT=5 -# CONFIG_SCHED_ALT is not set # end of Scheduler features CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y CONFIG_CC_HAS_INT128=y CONFIG_CC_IMPLICIT_FALLTHROUGH="-Wimplicit-fallthrough=5" -CONFIG_GCC11_NO_ARRAY_BOUNDS=y +CONFIG_GCC10_NO_ARRAY_BOUNDS=y +CONFIG_CC_NO_ARRAY_BOUNDS=y CONFIG_GCC_NO_STRINGOP_OVERFLOW=y CONFIG_CC_NO_STRINGOP_OVERFLOW=y -CONFIG_CC_NO_ARRAY_BOUNDS=y CONFIG_ARCH_SUPPORTS_INT128=y CONFIG_NUMA_BALANCING=y CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y @@ -224,6 +225,7 @@ CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y # CONFIG_RT_GROUP_SCHED is not set +# CONFIG_EXT_GROUP_SCHED is not set CONFIG_SCHED_MM_CID=y CONFIG_UCLAMP_TASK_GROUP=y CONFIG_CGROUP_PIDS=y @@ -614,6 +616,7 @@ CONFIG_CPU_IBRS_ENTRY=y CONFIG_CPU_SRSO=y CONFIG_SLS=y # CONFIG_GDS_FORCE_MITIGATION is not set +CONFIG_MITIGATION_RFDS=y CONFIG_ARCH_HAS_ADD_PAGES=y # @@ -1141,7 +1144,7 @@ CONFIG_SWAP=y CONFIG_ZSWAP=y CONFIG_ZSWAP_DEFAULT_ON=y # CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON is not set -# CONFIG_ZSWAP_SHRINKER_DEFAULT_ON is not set +CONFIG_ZSWAP_SHRINKER_DEFAULT_ON=y # CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set # CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set # CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 is not set @@ -1296,8 +1299,8 @@ CONFIG_TLS_DEVICE=y # CONFIG_TLS_TOE is not set CONFIG_XFRM=y CONFIG_XFRM_OFFLOAD=y -CONFIG_XFRM_ALGO=y -CONFIG_XFRM_USER=y +CONFIG_XFRM_ALGO=m +CONFIG_XFRM_USER=m # CONFIG_XFRM_USER_COMPAT is not set 
CONFIG_XFRM_INTERFACE=m CONFIG_XFRM_SUB_POLICY=y @@ -2218,7 +2221,7 @@ CONFIG_LWTUNNEL_BPF=y CONFIG_DST_CACHE=y CONFIG_GRO_CELLS=y CONFIG_SOCK_VALIDATE_XMIT=y -CONFIG_NET_SELFTESTS=y +CONFIG_NET_SELFTESTS=m CONFIG_NET_SOCK_MSG=y CONFIG_NET_DEVLINK=y CONFIG_PAGE_POOL=y @@ -2291,7 +2294,7 @@ CONFIG_PCI_HYPERV_INTERFACE=m # CONFIG_PCIE_DW=y CONFIG_PCIE_DW_HOST=y -CONFIG_PCI_MESON=y +CONFIG_PCI_MESON=m CONFIG_PCIE_DW_PLAT=y CONFIG_PCIE_DW_PLAT_HOST=y # end of DesignWare-based PCIe controllers @@ -3624,10 +3627,10 @@ CONFIG_SKFP=m # CONFIG_HIPPI is not set CONFIG_NET_SB1000=m CONFIG_PHYLINK=m -CONFIG_PHYLIB=y +CONFIG_PHYLIB=m CONFIG_SWPHY=y CONFIG_LED_TRIGGER_PHY=y -CONFIG_FIXED_PHY=y +CONFIG_FIXED_PHY=m CONFIG_SFP=m # @@ -3761,11 +3764,11 @@ CONFIG_MCTP_SERIAL=m CONFIG_MCTP_TRANSPORT_I2C=m # end of MCTP Device Drivers -CONFIG_MDIO_DEVICE=y -CONFIG_MDIO_BUS=y -CONFIG_FWNODE_MDIO=y -CONFIG_ACPI_MDIO=y -CONFIG_MDIO_DEVRES=y +CONFIG_MDIO_DEVICE=m +CONFIG_MDIO_BUS=m +CONFIG_FWNODE_MDIO=m +CONFIG_ACPI_MDIO=m +CONFIG_MDIO_DEVRES=m CONFIG_MDIO_BITBANG=m CONFIG_MDIO_BCM_UNIMAC=m CONFIG_MDIO_CAVIUM=m @@ -4197,7 +4200,7 @@ CONFIG_IEEE802154_HWSIM=m # # Wireless WAN # -CONFIG_WWAN=y +CONFIG_WWAN=m CONFIG_WWAN_DEBUGFS=y CONFIG_WWAN_HWSIM=m CONFIG_MHI_WWAN_CTRL=m @@ -4626,9 +4629,9 @@ CONFIG_SERIAL_8250_DWLIB=y CONFIG_SERIAL_8250_DFL=m CONFIG_SERIAL_8250_DW=m CONFIG_SERIAL_8250_RT288X=y -CONFIG_SERIAL_8250_LPSS=y -CONFIG_SERIAL_8250_MID=y -CONFIG_SERIAL_8250_PERICOM=y +CONFIG_SERIAL_8250_LPSS=m +CONFIG_SERIAL_8250_MID=m +CONFIG_SERIAL_8250_PERICOM=m # # Non-8250 serial port support @@ -4905,7 +4908,7 @@ CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m CONFIG_SPI_DYNAMIC=y # CONFIG_SPMI is not set # CONFIG_HSI is not set -CONFIG_PPS=y +CONFIG_PPS=m # CONFIG_PPS_DEBUG is not set # @@ -4923,8 +4926,8 @@ CONFIG_PPS_CLIENT_GPIO=m # # PTP clock support # -CONFIG_PTP_1588_CLOCK=y -CONFIG_PTP_1588_CLOCK_OPTIONAL=y +CONFIG_PTP_1588_CLOCK=m +CONFIG_PTP_1588_CLOCK_OPTIONAL=m CONFIG_DP83640_PHY=m CONFIG_PTP_1588_CLOCK_INES=m CONFIG_PTP_1588_CLOCK_KVM=m @@ -4961,25 +4964,25 @@ CONFIG_PINCTRL_CS47L92=y # CONFIG_PINCTRL_BAYTRAIL=y CONFIG_PINCTRL_CHERRYVIEW=y -CONFIG_PINCTRL_LYNXPOINT=y +CONFIG_PINCTRL_LYNXPOINT=m CONFIG_PINCTRL_INTEL=y -CONFIG_PINCTRL_INTEL_PLATFORM=y -CONFIG_PINCTRL_ALDERLAKE=y -CONFIG_PINCTRL_BROXTON=y -CONFIG_PINCTRL_CANNONLAKE=y -CONFIG_PINCTRL_CEDARFORK=y -CONFIG_PINCTRL_DENVERTON=y -CONFIG_PINCTRL_ELKHARTLAKE=y -CONFIG_PINCTRL_EMMITSBURG=y -CONFIG_PINCTRL_GEMINILAKE=y -CONFIG_PINCTRL_ICELAKE=y -CONFIG_PINCTRL_JASPERLAKE=y -CONFIG_PINCTRL_LAKEFIELD=y -CONFIG_PINCTRL_LEWISBURG=y -CONFIG_PINCTRL_METEORLAKE=y -CONFIG_PINCTRL_METEORPOINT=y -CONFIG_PINCTRL_SUNRISEPOINT=y -CONFIG_PINCTRL_TIGERLAKE=y +CONFIG_PINCTRL_INTEL_PLATFORM=m +CONFIG_PINCTRL_ALDERLAKE=m +CONFIG_PINCTRL_BROXTON=m +CONFIG_PINCTRL_CANNONLAKE=m +CONFIG_PINCTRL_CEDARFORK=m +CONFIG_PINCTRL_DENVERTON=m +CONFIG_PINCTRL_ELKHARTLAKE=m +CONFIG_PINCTRL_EMMITSBURG=m +CONFIG_PINCTRL_GEMINILAKE=m +CONFIG_PINCTRL_ICELAKE=m +CONFIG_PINCTRL_JASPERLAKE=m +CONFIG_PINCTRL_LAKEFIELD=m +CONFIG_PINCTRL_LEWISBURG=m +CONFIG_PINCTRL_METEORLAKE=m +CONFIG_PINCTRL_METEORPOINT=m +CONFIG_PINCTRL_SUNRISEPOINT=m +CONFIG_PINCTRL_TIGERLAKE=m # end of Intel pinctrl drivers # @@ -5199,7 +5202,7 @@ CONFIG_CHARGER_TWL4030=m CONFIG_CHARGER_LP8727=m CONFIG_CHARGER_LP8788=m CONFIG_CHARGER_GPIO=m -CONFIG_CHARGER_MANAGER=y +CONFIG_CHARGER_MANAGER=m CONFIG_CHARGER_LT3651=m CONFIG_CHARGER_LTC4162L=m CONFIG_CHARGER_MAX14577=m @@ -5754,7 +5757,7 @@ CONFIG_MFD_SYSCON=y CONFIG_MFD_LP3943=m 
CONFIG_MFD_LP8788=y CONFIG_MFD_TI_LMU=m -CONFIG_MFD_PALMAS=y +CONFIG_MFD_PALMAS=m CONFIG_TPS6105X=m CONFIG_TPS65010=m CONFIG_TPS6507X=m @@ -7873,7 +7876,7 @@ CONFIG_HID=y CONFIG_HID_BATTERY_STRENGTH=y CONFIG_HIDRAW=y CONFIG_UHID=m -CONFIG_HID_GENERIC=y +CONFIG_HID_GENERIC=m # # Special HID drivers @@ -8638,6 +8641,7 @@ CONFIG_LEDS_TRIGGER_NETDEV=m CONFIG_LEDS_TRIGGER_PATTERN=m CONFIG_LEDS_TRIGGER_AUDIO=m CONFIG_LEDS_TRIGGER_TTY=m +CONFIG_LEDS_TRIGGER_BLKDEV=m # # Simple LED drivers @@ -8713,7 +8717,7 @@ CONFIG_EDAC_SUPPORT=y CONFIG_EDAC=y CONFIG_EDAC_LEGACY_SYSFS=y # CONFIG_EDAC_DEBUG is not set -CONFIG_EDAC_DECODE_MCE=m +CONFIG_EDAC_DECODE_MCE=y CONFIG_EDAC_GHES=y CONFIG_EDAC_AMD64=m CONFIG_EDAC_E752X=m @@ -8898,7 +8902,7 @@ CONFIG_DW_DMAC=m CONFIG_DW_DMAC_PCI=y CONFIG_DW_EDMA=m CONFIG_DW_EDMA_PCIE=m -CONFIG_HSU_DMA=y +CONFIG_HSU_DMA=m CONFIG_SF_PDMA=m CONFIG_INTEL_LDMA=y @@ -8944,7 +8948,7 @@ CONFIG_VFIO_CONTAINER=y CONFIG_VFIO_IOMMU_TYPE1=m # CONFIG_VFIO_NOIOMMU is not set CONFIG_VFIO_VIRQFD=y -# CONFIG_VFIO_DEBUGFS is not set +CONFIG_VFIO_DEBUGFS=y # # VFIO support for PCI devices @@ -9042,7 +9046,7 @@ CONFIG_SWIOTLB_XEN=y CONFIG_XEN_PCI_STUB=y CONFIG_XEN_PCIDEV_BACKEND=m CONFIG_XEN_PVCALLS_FRONTEND=m -CONFIG_XEN_PVCALLS_BACKEND=y +CONFIG_XEN_PVCALLS_BACKEND=m CONFIG_XEN_SCSI_BACKEND=m CONFIG_XEN_PRIVCMD=m CONFIG_XEN_PRIVCMD_EVENTFD=y @@ -9931,7 +9935,7 @@ CONFIG_AM2315=m CONFIG_DHT11=m CONFIG_HDC100X=m CONFIG_HDC2010=m -CONFIG_HDC3010=m +CONFIG_HDC3020=m CONFIG_HID_SENSOR_HUMIDITY=m CONFIG_HTS221=m CONFIG_HTS221_I2C=m @@ -10229,7 +10233,7 @@ CONFIG_PWM=y CONFIG_PWM_SYSFS=y # CONFIG_PWM_DEBUG is not set CONFIG_PWM_CLK=m -CONFIG_PWM_CRC=y +CONFIG_PWM_CRC=m CONFIG_PWM_CROS_EC=m CONFIG_PWM_DWC_CORE=m CONFIG_PWM_DWC=m @@ -10310,7 +10314,7 @@ CONFIG_ANDROID_BINDER_DEVICES="binder,hwbinder,vndbinder" # CONFIG_ANDROID_BINDER_IPC_SELFTEST is not set # end of Android -CONFIG_LIBNVDIMM=y +CONFIG_LIBNVDIMM=m CONFIG_BLK_DEV_PMEM=m CONFIG_ND_CLAIM=y CONFIG_ND_BTT=m @@ -10318,6 +10322,8 @@ CONFIG_BTT=y CONFIG_ND_PFN=m CONFIG_NVDIMM_PFN=y CONFIG_NVDIMM_DAX=y +CONFIG_NVDIMM_KEYS=y +# CONFIG_NVDIMM_SECURITY_TEST is not set CONFIG_DAX=y CONFIG_DEV_DAX=m CONFIG_DEV_DAX_PMEM=m @@ -10944,7 +10950,7 @@ CONFIG_CRYPTO=y # CONFIG_CRYPTO_ALGAPI=y CONFIG_CRYPTO_ALGAPI2=y -CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_AEAD=m CONFIG_CRYPTO_AEAD2=y CONFIG_CRYPTO_SIG2=y CONFIG_CRYPTO_SKCIPHER=y @@ -11600,7 +11606,7 @@ CONFIG_STACKTRACE=y # # Debug kernel data structures # -CONFIG_DEBUG_LIST=y +# CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_PLIST is not set # CONFIG_DEBUG_SG is not set # CONFIG_DEBUG_NOTIFIERS is not set @@ -11624,7 +11630,7 @@ CONFIG_RCU_EXP_CPU_STALL_TIMEOUT=0 # CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set # CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set -CONFIG_LATENCYTOP=y +# CONFIG_LATENCYTOP is not set # CONFIG_DEBUG_CGROUP_REF is not set CONFIG_USER_STACKTRACE_SUPPORT=y CONFIG_NOP_TRACER=y diff --git a/patches/cachyos/0001-bore-cachy.patch b/patches/cachyos/0001-bore-cachy.patch index 5de21f9..8aeeab5 100644 --- a/patches/cachyos/0001-bore-cachy.patch +++ b/patches/cachyos/0001-bore-cachy.patch @@ -1,24 +1,24 @@ -From 97dcd5da7813021da6111c09488a1ebe75f1d935 Mon Sep 17 00:00:00 2001 +From 1ab81cfa061f454316364a32761ce45a7479e616 Mon Sep 17 00:00:00 2001 From: Piotr Gorski -Date: Mon, 26 Feb 2024 09:09:36 +0100 +Date: Thu, 7 Mar 2024 22:28:47 +0100 Subject: [PATCH] bore-cachy Signed-off-by: Piotr Gorski --- - include/linux/sched.h | 11 ++ + include/linux/sched.h | 12 ++ init/Kconfig | 19 +++ - kernel/sched/core.c 
| 146 +++++++++++++++++++++ - kernel/sched/debug.c | 57 +++++++- - kernel/sched/fair.c | 281 ++++++++++++++++++++++++++++++++++++---- + kernel/sched/core.c | 148 +++++++++++++++++++ + kernel/sched/debug.c | 61 +++++++- + kernel/sched/fair.c | 319 ++++++++++++++++++++++++++++++++++++---- kernel/sched/features.h | 4 + kernel/sched/sched.h | 7 + - 7 files changed, 501 insertions(+), 24 deletions(-) + 7 files changed, 542 insertions(+), 28 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h -index ffe8f618a..314c2c981 100644 +index ffe8f618a..7ac6163f9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -547,6 +547,17 @@ struct sched_entity { +@@ -547,6 +547,18 @@ struct sched_entity { u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; @@ -29,6 +29,7 @@ index ffe8f618a..314c2c981 100644 + u8 burst_penalty; + u8 burst_score; + u32 burst_load; ++ bool on_cfs_rq; + u8 child_burst; + u32 child_burst_cnt; + u64 child_burst_last_cached; @@ -67,10 +68,10 @@ index 47671886d..c99132cf6 100644 bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 9116bcc90..64b663a7b 100644 +index 9116bcc90..43e4311db 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4507,6 +4507,141 @@ int wake_up_state(struct task_struct *p, unsigned int state) +@@ -4507,6 +4507,143 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } @@ -85,6 +86,7 @@ index 9116bcc90..64b663a7b 100644 + init_task.se.curr_burst_penalty = 0; + init_task.se.burst_penalty = 0; + init_task.se.burst_score = 0; ++ init_task.se.on_cfs_rq = false; + init_task.se.child_burst_last_cached = 0; + init_task.se.burst_load = 0; +} @@ -93,6 +95,7 @@ index 9116bcc90..64b663a7b 100644 + p->se.burst_time = 0; + p->se.curr_burst_penalty = 0; + p->se.burst_score = 0; ++ p->se.on_cfs_rq = false; + p->se.child_burst_last_cached = 0; + p->se.burst_load = 0; +} @@ -212,7 +215,7 @@ index 9116bcc90..64b663a7b 100644 /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
-@@ -4523,6 +4658,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4523,6 +4660,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; @@ -222,7 +225,7 @@ index 9116bcc90..64b663a7b 100644 p->se.vlag = 0; p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node); -@@ -4839,6 +4977,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +@@ -4839,6 +4979,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) void sched_post_fork(struct task_struct *p) { @@ -232,20 +235,20 @@ index 9116bcc90..64b663a7b 100644 uclamp_post_fork(p); } -@@ -9910,6 +10051,11 @@ void __init sched_init(void) +@@ -9910,6 +10053,11 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 4.2.4 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 4.5.2 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 8d5d98a58..3f37534f8 100644 +index 8d5d98a58..a565363fd 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -167,7 +167,52 @@ static const struct file_operations sched_feat_fops = { @@ -341,8 +344,19 @@ index 8d5d98a58..3f37534f8 100644 #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif +@@ -1068,6 +1123,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + + P(se.load.weight); + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++ P(se.burst_load); ++ P(se.burst_score); ++#endif // CONFIG_SCHED_BORE + P(se.avg.load_sum); + P(se.avg.runnable_sum); + P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index fc0a9de42..f85eab965 100644 +index fc0a9de42..3ee4e7e70 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -355,7 +369,7 @@ index fc0a9de42..f85eab965 100644 */ #include #include -@@ -64,28 +67,126 @@ +@@ -64,28 +67,128 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * @@ -405,6 +419,8 @@ index fc0a9de42..f85eab965 100644 +u8 __read_mostly sched_burst_penalty_offset = 22; +uint __read_mostly sched_burst_penalty_scale = 1280; +uint __read_mostly sched_burst_cache_lifetime = 60000000; ++u8 __read_mostly sched_vlag_deviation_limit = 11; ++static int __maybe_unused thirty_two = 32; +static int __maybe_unused sixty_four = 64; +static int __maybe_unused maxval_12_bits = 4095; + @@ -450,7 +466,7 @@ index fc0a9de42..f85eab965 100644 + if (sched_burst_score_rounding) penalty += 0x2U; + se->burst_score = penalty >> 2; + -+ if ((se->burst_score != prev_score) && se->burst_load) { ++ if ((se->burst_score != prev_score) && se->on_cfs_rq) { + avg_vruntime_sub(cfs_rq, se); + avg_vruntime_add(cfs_rq, se); + } @@ -493,7 +509,7 @@ index fc0a9de42..f85eab965 100644 int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) -@@ -136,12 +237,8 @@ int __weak arch_asym_cpu_priority(int cpu) +@@ -136,12 +239,8 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ @@ -506,7 +522,7 @@ index fc0a9de42..f85eab965 100644 #ifdef 
CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ -@@ -150,6 +247,78 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +@@ -150,6 +249,87 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { @@ -581,11 +597,20 @@ index fc0a9de42..f85eab965 100644 + .mode = 0644, + .proc_handler = proc_douintvec, + }, ++ { ++ .procname = "sched_vlag_deviation_limit", ++ .data = &sched_vlag_deviation_limit, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &thirty_two, ++ }, +#endif // CONFIG_SCHED_BORE #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", -@@ -208,6 +377,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) +@@ -208,6 +388,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ @@ -599,7 +624,7 @@ index fc0a9de42..f85eab965 100644 static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); -@@ -238,6 +414,7 @@ static void update_sysctl(void) +@@ -238,6 +425,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } @@ -607,7 +632,7 @@ index fc0a9de42..f85eab965 100644 void __init sched_init_granularity(void) { -@@ -311,6 +488,9 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) +@@ -311,6 +499,9 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) if (unlikely(se->load.weight != NICE_0_LOAD)) delta = __calc_delta(delta, NICE_0_LOAD, &se->load); @@ -617,7 +642,7 @@ index fc0a9de42..f85eab965 100644 return delta; } -@@ -637,10 +817,26 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -637,10 +828,26 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * As measured, the max (key * weight) value was ~44 bits for a kernel build. */ @@ -626,11 +651,11 @@ index fc0a9de42..f85eab965 100644 +#else // CONFIG_SCHED_BORE +static unsigned long entity_weight(struct sched_entity *se) { + unsigned long weight = se->load.weight; -+ if (likely(weight && sched_bore)) weight = unscale_slice(weight, se); ++ if (likely(sched_bore)) weight = unscale_slice(weight, se); +#ifdef CONFIG_64BIT -+ weight >>= SCHED_FIXEDPOINT_SHIFT - 5; ++ weight >>= SCHED_FIXEDPOINT_SHIFT - 3; +#endif // CONFIG_64BIT -+ return max(1UL, weight); ++ return weight; +} +#endif // CONFIG_SCHED_BORE + @@ -645,7 +670,7 @@ index fc0a9de42..f85eab965 100644 s64 key = entity_key(cfs_rq, se); cfs_rq->avg_vruntime += key * weight; -@@ -650,7 +846,12 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -650,7 +857,12 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) static void avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -659,7 +684,15 @@ index fc0a9de42..f85eab965 100644 s64 key = entity_key(cfs_rq, se); cfs_rq->avg_vruntime -= key * weight; -@@ -677,7 +878,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) +@@ -670,14 +882,14 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) + * Specifically: avg_runtime() + 0 must result in entity_eligible() := true + * For this to be so, the result of this function must have a left bias. 
+ */ +-u64 avg_vruntime(struct cfs_rq *cfs_rq) ++static u64 avg_key(struct cfs_rq *cfs_rq) + { + struct sched_entity *curr = cfs_rq->curr; + s64 avg = cfs_rq->avg_vruntime; long load = cfs_rq->avg_load; if (curr && curr->on_rq) { @@ -668,7 +701,7 @@ index fc0a9de42..f85eab965 100644 avg += entity_key(cfs_rq, curr) * weight; load += weight; -@@ -687,7 +888,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) +@@ -687,12 +899,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) /* sign flips effective floor / ceil */ if (avg < 0) avg -= (load - 1); @@ -676,8 +709,17 @@ index fc0a9de42..f85eab965 100644 + avg = div64_s64(avg, load); } - return cfs_rq->min_vruntime + avg; -@@ -717,6 +918,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +- return cfs_rq->min_vruntime + avg; ++ return avg; + } + ++u64 avg_vruntime(struct cfs_rq *cfs_rq) { ++ return cfs_rq->min_vruntime + avg_key(cfs_rq); ++} + /* + * lag_i = S - s_i = w_i * (V - v_i) + * +@@ -717,6 +932,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) lag = avg_vruntime(cfs_rq) - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); @@ -687,7 +729,7 @@ index fc0a9de42..f85eab965 100644 se->vlag = clamp(lag, -limit, limit); } -@@ -744,7 +948,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) +@@ -744,7 +962,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) long load = cfs_rq->avg_load; if (curr && curr->on_rq) { @@ -696,7 +738,24 @@ index fc0a9de42..f85eab965 100644 avg += entity_key(cfs_rq, curr) * weight; load += weight; -@@ -968,6 +1172,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +@@ -840,10 +1058,16 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + se->min_vruntime = se->vruntime; + rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + __entity_less, &min_vruntime_cb); ++#ifdef CONFIG_SCHED_BORE ++ se->on_cfs_rq = true; ++#endif // CONFIG_SCHED_BORE + } + + static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { ++#ifdef CONFIG_SCHED_BORE ++ se->on_cfs_rq = false; ++#endif // CONFIG_SCHED_BORE + rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + &min_vruntime_cb); + avg_vruntime_sub(cfs_rq, se); +@@ -968,6 +1192,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ #ifdef CONFIG_SMP @@ -704,7 +763,7 @@ index fc0a9de42..f85eab965 100644 int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); -@@ -979,6 +1184,7 @@ int sched_update_scaling(void) +@@ -979,6 +1204,7 @@ int sched_update_scaling(void) return 0; } @@ -712,7 +771,7 @@ index fc0a9de42..f85eab965 100644 #endif #endif -@@ -1178,7 +1384,13 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -1178,7 +1404,13 @@ static void update_curr(struct cfs_rq *cfs_rq) if (unlikely(delta_exec <= 0)) return; @@ -726,17 +785,28 @@ index fc0a9de42..f85eab965 100644 update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); -@@ -3787,6 +3999,9 @@ static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, - */ - vslice = (s64)(se->deadline - avruntime); - vslice = div_s64(vslice * old_weight, weight); -+#ifdef CONFIG_SCHED_BORE -+ if (unlikely(!sched_bore) || (s64)(avruntime + vslice - se->deadline) < 0) -+#endif // CONFIG_SCHED_BORE - se->deadline = avruntime + vslice; - } +@@ -5170,8 +5402,8 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} + 
static void + place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- u64 vslice, vruntime = avg_vruntime(cfs_rq); +- s64 lag = 0; ++ s64 lag = 0, key = avg_key(cfs_rq); ++ u64 vslice, vruntime = cfs_rq->min_vruntime + key; -@@ -5244,12 +5459,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); +@@ -5184,6 +5416,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + * + * EEVDF: placement strategy #1 / #2 + */ ++#ifdef CONFIG_SCHED_BORE ++ if (unlikely(!sched_bore) || se->vlag) ++#endif // CONFIG_SCHED_BORE + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + struct sched_entity *curr = cfs_rq->curr; + unsigned long load; +@@ -5244,12 +5479,22 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ load = cfs_rq->avg_load; if (curr && curr->on_rq) @@ -745,14 +815,24 @@ index fc0a9de42..f85eab965 100644 - lag *= load + scale_load_down(se->load.weight); + lag *= load + entity_weight(se); ++#if !defined(CONFIG_SCHED_BORE) if (WARN_ON_ONCE(!load)) ++#else // CONFIG_SCHED_BORE ++ if (unlikely(!load)) ++#endif // CONFIG_SCHED_BORE load = 1; - lag = div_s64(lag, load); + lag = div64_s64(lag, load); ++#ifdef CONFIG_SCHED_BORE ++ if (likely(sched_bore)) { ++ s64 limit = vslice << sched_vlag_deviation_limit; ++ lag = clamp(lag, -limit, limit); ++ } ++#endif // CONFIG_SCHED_BORE } se->vruntime = vruntime - lag; -@@ -6816,6 +7031,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6816,6 +7061,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) bool was_sched_idle = sched_idle_rq(rq); util_est_dequeue(&rq->cfs, p); @@ -767,7 +847,7 @@ index fc0a9de42..f85eab965 100644 for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); -@@ -8565,16 +8788,25 @@ static void yield_task_fair(struct rq *rq) +@@ -8565,16 +8818,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? 
*/ @@ -793,7 +873,7 @@ index fc0a9de42..f85eab965 100644 /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() -@@ -12664,6 +12896,9 @@ static void task_fork_fair(struct task_struct *p) +@@ -12664,6 +12926,9 @@ static void task_fork_fair(struct task_struct *p) curr = cfs_rq->curr; if (curr) update_curr(cfs_rq); diff --git a/patches/cachyos/0001-cachyos-base-all.patch b/patches/cachyos/0001-cachyos-base-all.patch index e18294a..148a8a7 100644 --- a/patches/cachyos/0001-cachyos-base-all.patch +++ b/patches/cachyos/0001-cachyos-base-all.patch @@ -1,6 +1,6 @@ -From 83b6cdeff5fe00d3225b6593453ed3782289b0fb Mon Sep 17 00:00:00 2001 +From 8f03bb4df21c5746b9f1c3e399faa3c932737e4f Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 26 Feb 2024 15:46:47 +0100 +Date: Fri, 15 Mar 2024 20:08:47 +0100 Subject: [PATCH 1/7] amd-pstate Signed-off-by: Peter Jung @@ -8,18 +8,20 @@ Signed-off-by: Peter Jung .../admin-guide/kernel-parameters.txt | 5 + Documentation/admin-guide/pm/amd-pstate.rst | 70 ++- arch/x86/Kconfig | 5 +- + arch/x86/include/asm/msr-index.h | 2 + arch/x86/kernel/acpi/cppc.c | 2 +- drivers/acpi/cppc_acpi.c | 17 +- drivers/acpi/processor_driver.c | 6 + + drivers/cpufreq/acpi-cpufreq.c | 2 - drivers/cpufreq/amd-pstate-ut.c | 2 +- - drivers/cpufreq/amd-pstate.c | 440 +++++++++++++++--- + drivers/cpufreq/amd-pstate.c | 501 +++++++++++++++--- include/acpi/cppc_acpi.h | 5 + - include/linux/amd-pstate.h | 31 +- + include/linux/amd-pstate.h | 32 +- include/linux/cpufreq.h | 1 + - 11 files changed, 523 insertions(+), 61 deletions(-) + 13 files changed, 562 insertions(+), 88 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 31b3a25680d0..522530432548 100644 +index 73062d47a462..a493d93e0d2c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -374,6 +374,11 @@ @@ -130,7 +132,7 @@ index 9eb26014d34b..82fbd01da658 100644 =============================================== diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 5edec175b9bf..29d110285438 100644 +index 637e337c332e..de39c296ea3f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1054,8 +1054,9 @@ config SCHED_MC @@ -145,6 +147,19 @@ index 5edec175b9bf..29d110285438 100644 select CPU_FREQ default y help +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index d1b5edaf6c34..bfe139eb75b6 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -744,6 +744,8 @@ + #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) + #define MSR_K7_FID_VID_CTL 0xc0010041 + #define MSR_K7_FID_VID_STATUS 0xc0010042 ++#define MSR_K7_HWCR_CPB_DIS_BIT 25 ++#define MSR_K7_HWCR_CPB_DIS BIT_ULL(MSR_K7_HWCR_CPB_DIS_BIT) + + /* K6 MSRs */ + #define MSR_K6_WHCR 0xc0000082 diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 8d8752b44f11..ff8f25faca3d 100644 --- a/arch/x86/kernel/acpi/cppc.c @@ -218,8 +233,21 @@ index 4bd16b3f0781..67db60eda370 100644 default: acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event); break; +diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c +index 37f1cdf46d29..2fc82831bddd 100644 +--- a/drivers/cpufreq/acpi-cpufreq.c ++++ b/drivers/cpufreq/acpi-cpufreq.c +@@ -50,8 +50,6 @@ enum { + #define AMD_MSR_RANGE (0x7) + #define HYGON_MSR_RANGE (0x7) + +-#define MSR_K7_HWCR_CPB_DIS (1ULL << 25) +- + struct 
acpi_cpufreq_data { + unsigned int resume; + unsigned int cpu_feature; diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c -index f04ae67dda37..c5e2ca02f5ea 100644 +index f04ae67dda37..b3601b0e6dd3 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -226,7 +226,7 @@ static void amd_pstate_ut_check_freq(u32 index) @@ -227,12 +255,12 @@ index f04ae67dda37..c5e2ca02f5ea 100644 } - if (cpudata->boost_supported) { -+ if (amd_pstate_global_params.cpb_supported) { ++ if (amd_pstate_global_params.cpb_boost) { if ((policy->max == cpudata->max_freq) || (policy->max == cpudata->nominal_freq)) amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 1791d37fbc53..91572dbe0cd1 100644 +index 1791d37fbc53..651055df1710 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -37,6 +37,7 @@ @@ -296,63 +324,94 @@ index 1791d37fbc53..91572dbe0cd1 100644 static inline int get_mode_idx_from_str(const char *str, size_t size) { int i; -@@ -296,14 +336,12 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) +@@ -291,16 +331,20 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) + { + u64 cap1; + u32 highest_perf; ++ struct cppc_perf_caps cppc_perf; ++ int ret; + +- int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, ++ ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &cap1); if (ret) return ret; -- + - /* - * TODO: Introduce AMD specific power feature. - * - * CPPC entry doesn't indicate the highest performance in some ASICs. -+ -+ /* Some CPUs have different highest_perf from others, it is safer ++ ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); ++ if (ret) ++ return ret; ++ ++ /* Some CPUs have different highest_perf from others, it is safer + * to read it than to assume some erroneous value, leading to performance issues. */ highest_perf = amd_get_highest_perf(); -- if (highest_perf > AMD_CPPC_HIGHEST_PERF(cap1)) -+ if(highest_perf > AMD_CPPC_HIGHEST_PERF(cap1)) - highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); - - WRITE_ONCE(cpudata->highest_perf, highest_perf); -@@ -311,6 +349,7 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) + if (highest_perf > AMD_CPPC_HIGHEST_PERF(cap1)) +@@ -311,7 +355,11 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); + WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1)); WRITE_ONCE(cpudata->min_limit_perf, AMD_CPPC_LOWEST_PERF(cap1)); ++ WRITE_ONCE(cpudata->lowest_freq, cppc_perf.lowest_freq); ++ WRITE_ONCE(cpudata->nominal_freq, cppc_perf.nominal_freq); ++ return 0; } -@@ -324,8 +363,11 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) + +@@ -319,11 +367,15 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) + { + struct cppc_perf_caps cppc_perf; + u32 highest_perf; ++ int ret; + +- int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); ++ ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); if (ret) return ret; -+ /* Some CPUs have different highest_perf from others, it is safer ++ /* Some CPUs have different highest_perf from others, it is safer + * to read it than to assume some erroneous value, leading to performance issues. 
+ */ highest_perf = amd_get_highest_perf(); -- if (highest_perf > cppc_perf.highest_perf) -+ if(highest_perf > cppc_perf.highest_perf) + if (highest_perf > cppc_perf.highest_perf) highest_perf = cppc_perf.highest_perf; - - WRITE_ONCE(cpudata->highest_perf, highest_perf); -@@ -334,6 +376,7 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) +@@ -334,7 +386,10 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) WRITE_ONCE(cpudata->lowest_nonlinear_perf, cppc_perf.lowest_nonlinear_perf); WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); + WRITE_ONCE(cpudata->prefcore_ranking, cppc_perf.highest_perf); WRITE_ONCE(cpudata->min_limit_perf, cppc_perf.lowest_perf); ++ WRITE_ONCE(cpudata->lowest_freq, cppc_perf.lowest_freq); ++ WRITE_ONCE(cpudata->nominal_freq, cppc_perf.nominal_freq); if (cppc_state == AMD_PSTATE_ACTIVE) -@@ -431,6 +474,7 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, + return 0; +@@ -430,7 +485,10 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) + static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags) { ++ unsigned long max_freq; ++ struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu); u64 prev = READ_ONCE(cpudata->cppc_req_cached); + u32 nominal_perf = READ_ONCE(cpudata->nominal_perf); u64 value = prev; min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf, -@@ -450,6 +494,10 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, +@@ -439,6 +497,9 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, + cpudata->max_limit_perf); + des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); + ++ max_freq = READ_ONCE(cpudata->max_limit_freq); ++ policy->cur = div_u64(des_perf * max_freq, max_perf); ++ + if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) { + min_perf = des_perf; + des_perf = 0; +@@ -450,6 +511,10 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, value &= ~AMD_CPPC_DES_PERF(~0L); value |= AMD_CPPC_DES_PERF(des_perf); @@ -363,7 +422,46 @@ index 1791d37fbc53..91572dbe0cd1 100644 value &= ~AMD_CPPC_MAX_PERF(~0L); value |= AMD_CPPC_MAX_PERF(max_perf); -@@ -570,7 +618,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, +@@ -477,12 +542,19 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy) + + static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) + { +- u32 max_limit_perf, min_limit_perf; ++ u32 max_limit_perf, min_limit_perf, lowest_perf; + struct amd_cpudata *cpudata = policy->driver_data; + + max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); + min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); + ++ lowest_perf = READ_ONCE(cpudata->lowest_perf); ++ if (min_limit_perf < lowest_perf) ++ min_limit_perf = lowest_perf; ++ ++ if (max_limit_perf < min_limit_perf) ++ max_limit_perf = min_limit_perf; ++ + WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); + WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); + WRITE_ONCE(cpudata->max_limit_freq, policy->max); +@@ -553,10 +625,9 @@ static void amd_pstate_adjust_perf(unsigned int cpu, + unsigned long capacity) + { + unsigned long max_perf, min_perf, des_perf, +- cap_perf, lowest_nonlinear_perf, max_freq; ++ cap_perf, lowest_nonlinear_perf; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata = policy->driver_data; +- 
unsigned int target_freq; + + if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) + amd_pstate_update_min_max_limit(policy); +@@ -564,13 +635,12 @@ static void amd_pstate_adjust_perf(unsigned int cpu, + + cap_perf = READ_ONCE(cpudata->highest_perf); + lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); +- max_freq = READ_ONCE(cpudata->max_freq); + + des_perf = cap_perf; if (target_perf < capacity) des_perf = DIV_ROUND_UP(cap_perf * target_perf, capacity); @@ -372,31 +470,45 @@ index 1791d37fbc53..91572dbe0cd1 100644 if (_min_perf < capacity) min_perf = DIV_ROUND_UP(cap_perf * _min_perf, capacity); -@@ -593,13 +641,19 @@ static void amd_pstate_adjust_perf(unsigned int cpu, +@@ -582,8 +652,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu, + max_perf = min_perf; + + des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); +- target_freq = div_u64(des_perf * max_freq, max_perf); +- policy->cur = target_freq; + + amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, + policy->governor->flags); +@@ -592,30 +660,30 @@ static void amd_pstate_adjust_perf(unsigned int cpu, + static int amd_get_min_freq(struct amd_cpudata *cpudata) { - struct cppc_perf_caps cppc_perf; +- struct cppc_perf_caps cppc_perf; + u32 lowest_freq; - int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; - +- int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); +- if (ret) +- return ret; + if (quirks && quirks->lowest_freq) + lowest_freq = quirks->lowest_freq; + else -+ lowest_freq = cppc_perf.lowest_freq; -+ ++ lowest_freq = READ_ONCE(cpudata->lowest_freq); + /* Switch to khz */ - return cppc_perf.lowest_freq * 1000; + return lowest_freq * 1000; } static int amd_get_max_freq(struct amd_cpudata *cpudata) -@@ -612,10 +666,14 @@ static int amd_get_max_freq(struct amd_cpudata *cpudata) - if (ret) - return ret; + { +- struct cppc_perf_caps cppc_perf; + u32 max_perf, max_freq, nominal_freq, nominal_perf; + u64 boost_ratio; +- int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); +- if (ret) +- return ret; +- - nominal_freq = cppc_perf.nominal_freq; + nominal_freq = READ_ONCE(cpudata->nominal_freq); nominal_perf = READ_ONCE(cpudata->nominal_perf); @@ -409,37 +521,47 @@ index 1791d37fbc53..91572dbe0cd1 100644 boost_ratio = div_u64(max_perf << SCHED_CAPACITY_SHIFT, nominal_perf); -@@ -628,13 +686,18 @@ static int amd_get_max_freq(struct amd_cpudata *cpudata) +@@ -627,31 +695,25 @@ static int amd_get_max_freq(struct amd_cpudata *cpudata) + static int amd_get_nominal_freq(struct amd_cpudata *cpudata) { - struct cppc_perf_caps cppc_perf; +- struct cppc_perf_caps cppc_perf; + u32 nominal_freq; - int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; - -- /* Switch to khz */ -- return cppc_perf.nominal_freq * 1000; +- int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); +- if (ret) +- return ret; + if (quirks && quirks->nominal_freq) + nominal_freq = quirks->nominal_freq; + else -+ nominal_freq = cppc_perf.nominal_freq; -+ ++ nominal_freq = READ_ONCE(cpudata->nominal_freq); + +- /* Switch to khz */ +- return cppc_perf.nominal_freq * 1000; + return nominal_freq; } static int amd_get_lowest_nonlinear_freq(struct amd_cpudata *cpudata) -@@ -648,7 +711,7 @@ static int amd_get_lowest_nonlinear_freq(struct amd_cpudata *cpudata) - if (ret) - return ret; + { +- struct cppc_perf_caps cppc_perf; + u32 lowest_nonlinear_freq, lowest_nonlinear_perf, + nominal_freq, nominal_perf; + u64 lowest_nonlinear_ratio; +- int ret = 
cppc_get_perf_caps(cpudata->cpu, &cppc_perf); +- if (ret) +- return ret; +- - nominal_freq = cppc_perf.nominal_freq; + nominal_freq = READ_ONCE(cpudata->nominal_freq); nominal_perf = READ_ONCE(cpudata->nominal_perf); +- +- lowest_nonlinear_perf = cppc_perf.lowest_nonlinear_perf; ++ lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); - lowest_nonlinear_perf = cppc_perf.lowest_nonlinear_perf; -@@ -662,48 +725,164 @@ static int amd_get_lowest_nonlinear_freq(struct amd_cpudata *cpudata) + lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT, + nominal_perf); +@@ -662,48 +724,164 @@ static int amd_get_lowest_nonlinear_freq(struct amd_cpudata *cpudata) return lowest_nonlinear_freq * 1000; } @@ -463,7 +585,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 - policy->cpuinfo.max_freq = cpudata->max_freq; - else - policy->cpuinfo.max_freq = cpudata->nominal_freq; -+ amd_pstate_global_params.cpb_supported = !((boost_val >> 25) & 0x1); ++ amd_pstate_global_params.cpb_supported = !(boost_val & MSR_K7_HWCR_CPB_DIS); + amd_pstate_global_params.cpb_boost = amd_pstate_global_params.cpb_supported; - policy->max = policy->cpuinfo.max_freq; @@ -507,8 +629,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 - nominal_perf = READ_ONCE(cpudata->nominal_perf); + if (boot_cpu_has(X86_FEATURE_CPPC)) { + u64 cap1; - -- if (highest_perf <= nominal_perf) ++ + ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); + if (ret) + return ret; @@ -531,7 +652,8 @@ index 1791d37fbc53..91572dbe0cd1 100644 +{ + int ret, prio; + u32 highest_perf; -+ + +- if (highest_perf <= nominal_perf) + ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf); + if (ret) + return; @@ -628,7 +750,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 } static int amd_pstate_cpu_init(struct cpufreq_policy *policy) -@@ -727,24 +906,30 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) +@@ -727,24 +905,30 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) cpudata->cpu = policy->cpu; @@ -665,7 +787,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 policy->min = min_freq; policy->max = max_freq; -@@ -777,12 +962,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) +@@ -777,12 +961,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) cpudata->min_freq = min_freq; cpudata->max_limit_freq = max_freq; cpudata->min_limit_freq = min_freq; @@ -678,7 +800,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 if (!current_pstate_driver->adjust_perf) current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; -@@ -877,6 +1060,28 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, +@@ -877,6 +1059,28 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, return sysfs_emit(buf, "%u\n", perf); } @@ -707,7 +829,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 static ssize_t show_energy_performance_available_preferences( struct cpufreq_policy *policy, char *buf) { -@@ -1074,18 +1279,125 @@ static ssize_t status_store(struct device *a, struct device_attribute *b, +@@ -1074,18 +1278,125 @@ static ssize_t status_store(struct device *a, struct device_attribute *b, return ret < 0 ? 
ret : count; } @@ -751,7 +873,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 + if (on) + policy->cpuinfo.max_freq = cpudata->max_freq; + else -+ policy->cpuinfo.max_freq = cpudata->nominal_freq; ++ policy->cpuinfo.max_freq = cpudata->nominal_freq * 1000; + + policy->max = policy->cpuinfo.max_freq; + @@ -790,7 +912,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 + + amd_pstate_global_params.cpb_boost = !!new_state; + -+ for_each_possible_cpu(cpu) { ++ for_each_online_cpu(cpu) { + + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata = policy->driver_data; @@ -833,7 +955,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 NULL, }; -@@ -1093,6 +1405,8 @@ static struct freq_attr *amd_pstate_epp_attr[] = { +@@ -1093,6 +1404,8 @@ static struct freq_attr *amd_pstate_epp_attr[] = { &amd_pstate_max_freq, &amd_pstate_lowest_nonlinear_freq, &amd_pstate_highest_perf, @@ -842,7 +964,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 &energy_performance_preference, &energy_performance_available_preferences, NULL, -@@ -1100,6 +1414,8 @@ static struct freq_attr *amd_pstate_epp_attr[] = { +@@ -1100,6 +1413,8 @@ static struct freq_attr *amd_pstate_epp_attr[] = { static struct attribute *pstate_global_attributes[] = { &dev_attr_status.attr, @@ -851,7 +973,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 NULL }; -@@ -1151,17 +1467,23 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) +@@ -1151,17 +1466,23 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) cpudata->cpu = policy->cpu; cpudata->epp_policy = 0; @@ -879,7 +1001,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 ret = -EINVAL; goto free_cpudata1; } -@@ -1174,7 +1496,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) +@@ -1174,7 +1495,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) /* Initial processor data capability frequencies */ cpudata->max_freq = max_freq; cpudata->min_freq = min_freq; @@ -887,7 +1009,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; policy->driver_data = cpudata; -@@ -1205,7 +1526,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) +@@ -1205,7 +1525,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) return ret; WRITE_ONCE(cpudata->cppc_cap1_cached, value); } @@ -895,7 +1017,33 @@ index 1791d37fbc53..91572dbe0cd1 100644 return 0; -@@ -1431,7 +1751,7 @@ static struct cpufreq_driver amd_pstate_driver = { +@@ -1232,6 +1551,12 @@ static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); + min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); + ++ if (min_limit_perf < min_perf) ++ min_limit_perf = min_perf; ++ ++ if (max_limit_perf < min_limit_perf) ++ max_limit_perf = min_limit_perf; ++ + WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); + WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); + +@@ -1294,6 +1619,12 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) + + amd_pstate_epp_update_limit(policy); + ++ /* ++ * policy->cur is never updated with the amd_pstate_epp driver, but it ++ * is used as a stale frequency value. So, keep it within limits. 
++ */ ++ policy->cur = policy->min; ++ + return 0; + } + +@@ -1431,7 +1762,7 @@ static struct cpufreq_driver amd_pstate_driver = { .exit = amd_pstate_cpu_exit, .suspend = amd_pstate_cpu_suspend, .resume = amd_pstate_cpu_resume, @@ -904,7 +1052,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 .name = "amd-pstate", .attr = amd_pstate_attr, }; -@@ -1446,6 +1766,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { +@@ -1446,6 +1777,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { .online = amd_pstate_epp_cpu_online, .suspend = amd_pstate_epp_suspend, .resume = amd_pstate_epp_resume, @@ -912,7 +1060,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 .name = "amd-pstate-epp", .attr = amd_pstate_epp_attr, }; -@@ -1486,6 +1807,11 @@ static int __init amd_pstate_init(void) +@@ -1486,6 +1818,11 @@ static int __init amd_pstate_init(void) if (cpufreq_get_current_driver()) return -EEXIST; @@ -924,7 +1072,7 @@ index 1791d37fbc53..91572dbe0cd1 100644 switch (cppc_state) { case AMD_PSTATE_UNDEFINED: /* Disable on the following configs by default: -@@ -1567,7 +1893,17 @@ static int __init amd_pstate_param(char *str) +@@ -1567,7 +1904,17 @@ static int __init amd_pstate_param(char *str) return amd_pstate_set_driver(mode_idx); } @@ -966,7 +1114,7 @@ index 3a0995f8bce8..930b6afba6f4 100644 { return -ENOTSUPP; diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h -index 6ad02ad9c7b4..f6e2c9825700 100644 +index 6ad02ad9c7b4..e89cf1249715 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -39,11 +39,16 @@ struct amd_aperf_mperf { @@ -1005,7 +1153,13 @@ index 6ad02ad9c7b4..f6e2c9825700 100644 u32 min_limit_perf; u32 max_limit_perf; u32 min_limit_freq; -@@ -84,7 +92,7 @@ struct amd_cpudata { +@@ -79,12 +87,13 @@ struct amd_cpudata { + u32 min_freq; + u32 nominal_freq; + u32 lowest_nonlinear_freq; ++ u32 lowest_freq; + + struct amd_aperf_mperf cur; struct amd_aperf_mperf prev; u64 freq; @@ -1014,7 +1168,7 @@ index 6ad02ad9c7b4..f6e2c9825700 100644 /* EPP feature related attributes*/ s16 epp_policy; -@@ -114,4 +122,23 @@ static const char * const amd_pstate_mode_string[] = { +@@ -114,4 +123,23 @@ static const char * const amd_pstate_mode_string[] = { [AMD_PSTATE_GUIDED] = "guided", NULL, }; @@ -1053,7 +1207,7 @@ index afda5f24d3dd..9bebeec24abb 100644 -- 2.44.0 -From 73dd80071220cff0908e4e8561b5c0a815e8520d Mon Sep 17 00:00:00 2001 +From 93aefd5f98b793e9447e64dcbaa69221102e304a Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 26 Feb 2024 15:46:58 +0100 Subject: [PATCH 2/7] bbr3 @@ -4439,7 +4593,7 @@ index d1ad20ce1c8c..ef74f33c7905 100644 -- 2.44.0 -From 173737dc7aacb08dc475afa58212800f7a34b240 Mon Sep 17 00:00:00 2001 +From fb681aa9768aa30b3b17152a221868238394dd64 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 26 Feb 2024 15:47:11 +0100 Subject: [PATCH 3/7] block @@ -4928,7 +5082,7 @@ index f958e79277b8..1b0de4fc3958 100644 -- 2.44.0 -From 4b43d78e522b63355e09a4fb91365a1e11891a01 Mon Sep 17 00:00:00 2001 +From 4f371ea8a1f8a47e624592a91f9e961080aec2eb Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 26 Feb 2024 15:47:21 +0100 Subject: [PATCH 4/7] cachy @@ -4984,10 +5138,10 @@ Signed-off-by: Peter Jung create mode 100644 drivers/platform/x86/steamdeck.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 522530432548..65b1952a783b 100644 +index a493d93e0d2c..8d6a2ce37f8f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ 
-4375,6 +4375,15 @@ +@@ -4396,6 +4396,15 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. @@ -5004,7 +5158,7 @@ index 522530432548..65b1952a783b 100644 Safety option to keep boot IRQs enabled. This should never be necessary. diff --git a/Makefile b/Makefile -index 6cdb5717bfe0..dc9adf866df1 100644 +index 95b320ada47c..0b7d42037c3e 100644 --- a/Makefile +++ b/Makefile @@ -808,9 +808,164 @@ endif # need-config @@ -8151,7 +8305,7 @@ index 6030a8235617..60b7fe5fa74a 100644 { return &init_user_ns; diff --git a/init/Kconfig b/init/Kconfig -index 8426d59cc634..47671886d579 100644 +index bee58f7468c3..9ea39297f149 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -132,6 +132,10 @@ config THREAD_INFO_IN_TASK @@ -8394,7 +8548,7 @@ index ffc3a2ba3a8c..0e440573033c 100644 # diff --git a/mm/compaction.c b/mm/compaction.c -index 4add68d40e8d..b692129f63f4 100644 +index b961db601df4..91d627e8a93d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1830,7 +1830,11 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE @@ -8454,7 +8608,7 @@ index 3f255534986a..01b3e5cb8da1 100644 EXPORT_SYMBOL_GPL(dirty_writeback_interval); diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 150d4f23b010..d5ec35e0b3a2 100644 +index a663202045dc..7c48b114331b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -287,7 +287,11 @@ const char * const migratetype_names[MIGRATE_TYPES] = { @@ -8557,7 +8711,7 @@ index bd5183dfd879..3a410f53a07c 100644 /* diff --git a/mm/vmscan.c b/mm/vmscan.c -index 4f9c854ce6cc..fd1d9b4194e3 100644 +index 4255619a1a31..5a3fbaf34158 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -185,7 +185,11 @@ struct scan_control { @@ -8587,7 +8741,7 @@ index 4f9c854ce6cc..fd1d9b4194e3 100644 -- 2.44.0 -From 9b3faef691a9c9a202c27e5285fcd67d8a95564c Mon Sep 17 00:00:00 2001 +From 516559b0e31629dafbe60212d041e63af1b12c1c Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 26 Feb 2024 15:47:43 +0100 Subject: [PATCH 5/7] fixes @@ -8622,7 +8776,7 @@ index a5af0edd3eb8..0731bc203aa9 100644 -- 2.44.0 -From 6adc19960a6a214361b1099a732af82e9edb6b62 Mon Sep 17 00:00:00 2001 +From e01d8909a6a6d90eb2ff29871d79f4e9359638ca Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 26 Feb 2024 15:48:00 +0100 Subject: [PATCH 6/7] ksm @@ -9062,7 +9216,7 @@ index faad00cce269..c7c9eb656468 100644 -- 2.44.0 -From b66054cf4e9ef095844e6d3a673214a8088f500c Mon Sep 17 00:00:00 2001 +From 0634ad09765970da5be85d61cb4b8b4b38adb3c0 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 1 Feb 2024 16:54:48 +0100 Subject: [PATCH 7/7] zstd diff --git a/patches/cachyos/0003-nvidia.patch b/patches/cachyos/0003-nvidia.patch new file mode 100644 index 0000000..ce7fb7f --- /dev/null +++ b/patches/cachyos/0003-nvidia.patch @@ -0,0 +1,230 @@ +From d2db737a5be989688a7a5d805b7f299d0203d228 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 29 Jan 2024 15:09:44 +0100 +Subject: [PATCH] NVIDIA: Fixup GPL issue + +Signed-off-by: Peter Jung +--- + kernel/rcu/tree_plugin.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h +index 41021080ad25..72474d8ec180 100644 +--- a/kernel/rcu/tree_plugin.h ++++ b/kernel/rcu/tree_plugin.h +@@ -406,7 +406,7 @@ void __rcu_read_lock(void) + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true); + barrier(); /* critical section after entry code. 
*/ + } +-EXPORT_SYMBOL_GPL(__rcu_read_lock); ++EXPORT_SYMBOL(__rcu_read_lock); + + /* + * Preemptible RCU implementation for rcu_read_unlock(). +@@ -431,7 +431,7 @@ void __rcu_read_unlock(void) + WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX); + } + } +-EXPORT_SYMBOL_GPL(__rcu_read_unlock); ++EXPORT_SYMBOL(__rcu_read_unlock); + + /* + * Advance a ->blkd_tasks-list pointer to the next entry, instead +-- +2.43.0 + +--- a/kernel/nvidia-drm/nvidia-drm-drv.c ++++ b/kernel/nvidia-drm/nvidia-drm-drv.c +@@ -480,6 +480,22 @@ static int nv_drm_load(struct drm_device *dev, unsigned long flags) + return -ENODEV; + } + ++#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) ++ /* ++ * If fbdev is enabled, take modeset ownership now before other DRM clients ++ * can take master (and thus NVKMS ownership). ++ */ ++ if (nv_drm_fbdev_module_param) { ++ if (!nvKms->grabOwnership(pDevice)) { ++ nvKms->freeDevice(pDevice); ++ NV_DRM_DEV_LOG_ERR(nv_dev, "Failed to grab NVKMS modeset ownership"); ++ return -EBUSY; ++ } ++ ++ nv_dev->hasFramebufferConsole = NV_TRUE; ++ } ++#endif ++ + mutex_lock(&nv_dev->lock); + + /* Set NvKmsKapiDevice */ +@@ -590,6 +606,15 @@ static void __nv_drm_unload(struct drm_device *dev) + return; + } + ++ /* Release modeset ownership if fbdev is enabled */ ++ ++#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) ++ if (nv_dev->hasFramebufferConsole) { ++ drm_atomic_helper_shutdown(dev); ++ nvKms->releaseOwnership(nv_dev->pDevice); ++ } ++#endif ++ + cancel_delayed_work_sync(&nv_dev->hotplug_event_work); + mutex_lock(&nv_dev->lock); + +@@ -1768,14 +1793,7 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info) + } + + #if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +- if (nv_drm_fbdev_module_param && +- drm_core_check_feature(dev, DRIVER_MODESET)) { +- +- if (!nvKms->grabOwnership(nv_dev->pDevice)) { +- NV_DRM_DEV_LOG_ERR(nv_dev, "Failed to grab NVKMS modeset ownership"); +- goto failed_grab_ownership; +- } +- ++ if (nv_dev->hasFramebufferConsole) { + if (bus_is_pci) { + struct pci_dev *pdev = to_pci_dev(device); + +@@ -1786,8 +1804,6 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info) + #endif + } + drm_fbdev_generic_setup(dev, 32); +- +- nv_dev->hasFramebufferConsole = NV_TRUE; + } + #endif /* defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) */ + +@@ -1798,12 +1814,6 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info) + + return; /* Success */ + +-#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +-failed_grab_ownership: +- +- drm_dev_unregister(dev); +-#endif +- + failed_drm_register: + + nv_drm_dev_free(dev); +@@ -1870,12 +1880,6 @@ void nv_drm_remove_devices(void) + struct nv_drm_device *next = dev_list->next; + struct drm_device *dev = dev_list->dev; + +-#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +- if (dev_list->hasFramebufferConsole) { +- drm_atomic_helper_shutdown(dev); +- nvKms->releaseOwnership(dev_list->pDevice); +- } +-#endif + drm_dev_unregister(dev); + nv_drm_dev_free(dev); + +From d82eb6c87ee2e05b6bbd35f703a41e68b3adc3a7 Mon Sep 17 00:00:00 2001 +From: Aaron Plattner +Date: Tue, 26 Dec 2023 11:58:46 -0800 +Subject: [PATCH] nvidia-drm: Use a workqueue to defer calling + drm_kms_helper_hotplug_event + +--- + kernel/nvidia-drm/nvidia-drm-drv.c | 24 ++++++++++++++++++++++++ + kernel/nvidia-drm/nvidia-drm-encoder.c | 4 ++-- + kernel/nvidia-drm/nvidia-drm-priv.h | 1 + + 3 files changed, 27 insertions(+), 2 deletions(-) + +diff --git kernel/nvidia-drm/nvidia-drm-drv.c kernel/nvidia-drm/nvidia-drm-drv.c +index e0ddb6c..9f7424d 100644 +--- kernel/nvidia-drm/nvidia-drm-drv.c 
++++ kernel/nvidia-drm/nvidia-drm-drv.c +@@ -74,6 +74,7 @@ + #endif + + #include ++#include + + /* + * Commit fcd70cd36b9b ("drm: Split out drm_probe_helper.h") +@@ -405,6 +406,27 @@ static int nv_drm_create_properties(struct nv_drm_device *nv_dev) + return 0; + } + ++#if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE) ++/* ++ * We can't just call drm_kms_helper_hotplug_event directly because ++ * fbdev_generic may attempt to set a mode from inside the hotplug event ++ * handler. Because kapi event handling runs on nvkms_kthread_q, this blocks ++ * other event processing including the flip completion notifier expected by ++ * nv_drm_atomic_commit. ++ * ++ * Defer hotplug event handling to a work item so that nvkms_kthread_q can ++ * continue processing events while a DRM modeset is in progress. ++ */ ++static void nv_drm_handle_hotplug_event(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct nv_drm_device *nv_dev = ++ container_of(dwork, struct nv_drm_device, hotplug_event_work); ++ ++ drm_kms_helper_hotplug_event(nv_dev->dev); ++} ++#endif ++ + static int nv_drm_load(struct drm_device *dev, unsigned long flags) + { + #if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE) +@@ -540,6 +562,7 @@ static int nv_drm_load(struct drm_device *dev, unsigned long flags) + + /* Enable event handling */ + ++ INIT_DELAYED_WORK(&nv_dev->hotplug_event_work, nv_drm_handle_hotplug_event); + atomic_set(&nv_dev->enable_event_handling, true); + + init_waitqueue_head(&nv_dev->flip_event_wq); +@@ -567,6 +590,7 @@ static void __nv_drm_unload(struct drm_device *dev) + return; + } + ++ cancel_delayed_work_sync(&nv_dev->hotplug_event_work); + mutex_lock(&nv_dev->lock); + + WARN_ON(nv_dev->subOwnershipGranted); +diff --git kernel/nvidia-drm/nvidia-drm-encoder.c kernel/nvidia-drm/nvidia-drm-encoder.c +index b5ef5a2..7c0c119 100644 +--- kernel/nvidia-drm/nvidia-drm-encoder.c ++++ kernel/nvidia-drm/nvidia-drm-encoder.c +@@ -300,7 +300,7 @@ void nv_drm_handle_display_change(struct nv_drm_device *nv_dev, + + nv_drm_connector_mark_connection_status_dirty(nv_encoder->nv_connector); + +- drm_kms_helper_hotplug_event(dev); ++ schedule_delayed_work(&nv_dev->hotplug_event_work, 0); + } + + void nv_drm_handle_dynamic_display_connected(struct nv_drm_device *nv_dev, +@@ -347,6 +347,6 @@ void nv_drm_handle_dynamic_display_connected(struct nv_drm_device *nv_dev, + drm_reinit_primary_mode_group(dev); + #endif + +- drm_kms_helper_hotplug_event(dev); ++ schedule_delayed_work(&nv_dev->hotplug_event_work, 0); + } + #endif +diff --git kernel/nvidia-drm/nvidia-drm-priv.h kernel/nvidia-drm/nvidia-drm-priv.h +index 253155f..c9ce727 100644 +--- kernel/nvidia-drm/nvidia-drm-priv.h ++++ kernel/nvidia-drm/nvidia-drm-priv.h +@@ -126,6 +126,7 @@ struct nv_drm_device { + NvU64 modifiers[6 /* block linear */ + 1 /* linear */ + 1 /* terminator */]; + #endif + ++ struct delayed_work hotplug_event_work; + atomic_t enable_event_handling; + + /** +-- +2.43.0 \ No newline at end of file diff --git a/patches/cachyos/0004-intel.patch b/patches/cachyos/0004-intel.patch new file mode 100644 index 0000000..87da0d3 --- /dev/null +++ b/patches/cachyos/0004-intel.patch @@ -0,0 +1,2203 @@ +From a06ef5a36a19553f48d73428311b241839d53b9c Mon Sep 17 00:00:00 2001 +From: Laio Oriel Seman +Date: Fri, 8 Mar 2024 11:30:24 -0300 +Subject: [PATCH 1/2] ITD + +--- + MAINTAINERS | 1 + + arch/x86/include/asm/cpufeatures.h | 2 + + arch/x86/include/asm/disabled-features.h | 8 +- + arch/x86/include/asm/hfi.h | 85 +++++ + arch/x86/include/asm/hreset.h | 
30 ++ + arch/x86/include/asm/msr-index.h | 12 + + arch/x86/include/asm/topology.h | 15 + + arch/x86/kernel/Makefile | 2 + + arch/x86/kernel/cpu/common.c | 33 +- + arch/x86/kernel/cpu/cpuid-deps.c | 1 + + arch/x86/kernel/process_32.c | 3 + + arch/x86/kernel/process_64.c | 3 + + arch/x86/kernel/sched_ipcc.c | 93 +++++ + drivers/thermal/intel/Kconfig | 1 + + drivers/thermal/intel/intel_hfi.c | 411 ++++++++++++++++++----- + drivers/thermal/thermal_netlink.c | 62 +++- + drivers/thermal/thermal_netlink.h | 26 ++ + include/linux/sched.h | 24 +- + include/linux/sched/topology.h | 6 + + init/Kconfig | 12 + + kernel/sched/core.c | 10 +- + kernel/sched/fair.c | 318 +++++++++++++++++- + kernel/sched/sched.h | 66 ++++ + kernel/sched/topology.c | 9 + + kernel/time/timer.c | 2 +- + 25 files changed, 1127 insertions(+), 108 deletions(-) + create mode 100644 arch/x86/include/asm/hfi.h + create mode 100644 arch/x86/include/asm/hreset.h + create mode 100644 arch/x86/kernel/sched_ipcc.c + +diff --git a/MAINTAINERS b/MAINTAINERS +index 88b28f85587..9bb09b30526 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -21791,6 +21791,7 @@ L: linux-pm@vger.kernel.org + S: Supported + Q: https://patchwork.kernel.org/project/linux-pm/list/ + T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git thermal ++F: arch/x86/include/asm/hfi.h + F: Documentation/ABI/testing/sysfs-class-thermal + F: Documentation/admin-guide/thermal/ + F: Documentation/devicetree/bindings/thermal/ +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 2b62cdd8dd1..31b1cea6847 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -326,6 +326,7 @@ + #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ + #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ + #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ ++#define X86_FEATURE_HRESET (12*32+22) /* Hardware history reset instruction */ + #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ + #define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ + +@@ -360,6 +361,7 @@ + #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ + #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + #define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ ++#define X86_FEATURE_ITD (14*32+23) /* Intel Thread Director */ + + /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ + #define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h +index 702d93fdd10..f4aa34cfd20 100644 +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -117,6 +117,12 @@ + #define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) + #endif + ++#ifdef CONFIG_IPC_CLASSES ++# define DISABLE_ITD 0 ++#else ++# define DISABLE_ITD (1 << (X86_FEATURE_ITD & 31)) ++#endif ++ + /* + * Make sure to add features to the correct mask + */ +@@ -135,7 +141,7 @@ + DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) + #define DISABLED_MASK12 (DISABLE_LAM) + #define DISABLED_MASK13 0 +-#define DISABLED_MASK14 0 ++#define DISABLED_MASK14 (DISABLE_ITD) + #define DISABLED_MASK15 0 + #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ + DISABLE_ENQCMD) +diff --git a/arch/x86/include/asm/hfi.h b/arch/x86/include/asm/hfi.h +new file mode 100644 +index 00000000000..b7fda3e0e8c +--- /dev/null ++++ b/arch/x86/include/asm/hfi.h +@@ -0,0 +1,85 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_X86_HFI_H ++#define _ASM_X86_HFI_H ++ ++/* CPUID detection and enumeration definitions for HFI */ ++ ++union hfi_capabilities { ++ struct { ++ u8 performance:1; ++ u8 energy_efficiency:1; ++ u8 __reserved:6; ++ } split; ++ u8 bits; ++}; ++ ++union cpuid6_edx { ++ struct { ++ union hfi_capabilities capabilities; ++ u32 table_pages:4; ++ u32 __reserved:4; ++ s32 index:16; ++ } split; ++ u32 full; ++}; ++ ++union cpuid6_ecx { ++ struct { ++ u32 dont_care0:8; ++ u32 nr_classes:8; ++ u32 dont_care1:16; ++ } split; ++ u32 full; ++}; ++ ++/** ++ * struct hfi_hdr - Header of the HFI table ++ * @perf_updated: Hardware updated performance capabilities ++ * @ee_updated: Hardware updated energy efficiency capabilities ++ * ++ * Properties of the data in an HFI table. There exists one header per each ++ * HFI class. ++ */ ++struct hfi_hdr { ++ u8 perf_updated; ++ u8 ee_updated; ++} __packed; ++ ++/** ++ * struct hfi_table - Representation of an HFI table ++ * @base_addr: Base address of the local copy of the HFI table ++ * @timestamp: Timestamp of the last update of the local table. ++ * Located at the base of the local table. 
++ * @hdr: Base address of the header of the local table ++ * @data: Base address of the data of the local table ++ */ ++struct hfi_table { ++ union { ++ void *base_addr; ++ u64 *timestamp; ++ }; ++ void *hdr; ++ void *data; ++}; ++ ++/** ++ * struct hfi_features - Supported HFI features ++ * @nr_classes: Number of classes supported ++ * @nr_table_pages: Size of the HFI table in 4KB pages ++ * @cpu_stride: Stride size to locate the capability data of a logical ++ * processor within the table (i.e., row stride) ++ * @class_stride: Stride size to locate a class within the capability ++ * data of a logical processor or the HFI table header ++ * @hdr_size: Size of the table header ++ * ++ * Parameters and supported features that are common to all HFI instances ++ */ ++struct hfi_features { ++ unsigned int nr_classes; ++ size_t nr_table_pages; ++ unsigned int cpu_stride; ++ unsigned int class_stride; ++ unsigned int hdr_size; ++}; ++ ++#endif /* _ASM_X86_HFI_H */ +diff --git a/arch/x86/include/asm/hreset.h b/arch/x86/include/asm/hreset.h +new file mode 100644 +index 00000000000..d68ca2fb864 +--- /dev/null ++++ b/arch/x86/include/asm/hreset.h +@@ -0,0 +1,30 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_X86_HRESET_H ++ ++/** ++ * HRESET - History reset. Available since binutils v2.36. ++ * ++ * Request the processor to reset the history of task classification on the ++ * current logical processor. The history components to be ++ * reset are specified in %eax. Only bits specified in CPUID(0x20).EBX ++ * and enabled in the IA32_HRESET_ENABLE MSR can be selected. ++ * ++ * The assembly code looks like: ++ * ++ * hreset %eax ++ * ++ * The corresponding machine code looks like: ++ * ++ * F3 0F 3A F0 ModRM Imm ++ * ++ * The value of ModRM is 0xc0 to specify %eax register addressing. ++ * The ignored immediate operand is set to 0. ++ * ++ * The instruction is documented in the Intel SDM. 
++ */ ++ ++#define __ASM_HRESET ".byte 0xf3, 0xf, 0x3a, 0xf0, 0xc0, 0x0" ++ ++void reset_hardware_history(void); ++ ++#endif /* _ASM_X86_HRESET_H */ +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index f1bd7b91b3c..f334c19b028 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -1143,7 +1143,19 @@ + + /* Hardware Feedback Interface */ + #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 ++#define HW_FEEDBACK_PTR_VALID BIT_ULL(0) ++#define HW_FEEDBACK_PTR_RESERVED_MASK GENMASK_ULL(11, 1) ++ + #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 ++#define MSR_IA32_HW_FEEDBACK_THREAD_CONFIG 0x17d4 ++#define MSR_IA32_HW_FEEDBACK_CHAR 0x17d2 ++ ++/* Hardware History Reset */ ++#define MSR_IA32_HW_HRESET_ENABLE 0x17da ++ ++#define HW_FEEDBACK_CONFIG_HFI_ENABLE BIT_ULL(0) ++#define HW_FEEDBACK_CONFIG_ITD_ENABLE BIT_ULL(1) ++#define HW_FEEDBACK_THREAD_CONFIG_ENABLE BIT_ULL(0) + + /* x2APIC locked status */ + #define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index 5f87f6b9b09..29fc06efcb6 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -235,4 +235,19 @@ void init_freq_invariance_cppc(void); + #define arch_init_invariance_cppc init_freq_invariance_cppc + #endif + ++#ifdef CONFIG_INTEL_HFI_THERMAL ++int intel_hfi_read_classid(u8 *classid); ++unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu); ++#else ++static inline int intel_hfi_read_classid(u8 *classid) { return -ENODEV; } ++static inline unsigned long ++intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu) { return -ENODEV; } ++#endif ++ ++#ifdef CONFIG_IPC_CLASSES ++void intel_update_ipcc(struct task_struct *curr); ++#define arch_update_ipcc intel_update_ipcc ++#define arch_get_ipcc_score intel_hfi_get_ipcc_score ++#endif ++ + #endif /* _ASM_X86_TOPOLOGY_H */ +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index 0000325ab98..9bc7319175d 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -150,6 +150,8 @@ obj-$(CONFIG_X86_CET) += cet.o + + obj-$(CONFIG_X86_USER_SHADOW_STACK) += shstk.o + ++obj-$(CONFIG_IPC_CLASSES) += sched_ipcc.o ++ + ### + # 64 bit specific files + ifeq ($(CONFIG_X86_64),y) +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index fbc4e60d027..99ebd403fe4 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -57,6 +57,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -381,6 +382,35 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) + cr4_clear_bits(X86_CR4_UMIP); + } + ++static u32 hardware_history_features __ro_after_init; ++ ++ ++void reset_hardware_history(void) ++{ ++ asm_inline volatile (ALTERNATIVE("", __ASM_HRESET, X86_FEATURE_HRESET) ++ : : "a" (hardware_history_features) : "memory"); ++} ++ ++EXPORT_SYMBOL(reset_hardware_history); ++ ++static __always_inline void setup_hreset(struct cpuinfo_x86 *c) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_HRESET)) ++ return; ++ ++ /* ++ * Use on all CPUs the hardware history features that the boot ++ * CPU supports. ++ */ ++ if (c == &boot_cpu_data) ++ hardware_history_features = cpuid_ebx(0x20); ++ ++ if (!hardware_history_features) ++ return; ++ ++ wrmsrl(MSR_IA32_HW_HRESET_ENABLE, hardware_history_features); ++} ++ + /* These bits should not change their value after CPU init is finished. 
*/ + static const unsigned long cr4_pinned_mask = + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | +@@ -1872,10 +1902,11 @@ static void identify_cpu(struct cpuinfo_x86 *c) + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + +- /* Set up SMEP/SMAP/UMIP */ ++ /* Set up SMEP/SMAP/UMIP/HRESET */ + setup_smep(c); + setup_smap(c); + setup_umip(c); ++ setup_hreset(c); + + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { +diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c +index e462c1d3800..db62700cdac 100644 +--- a/arch/x86/kernel/cpu/cpuid-deps.c ++++ b/arch/x86/kernel/cpu/cpuid-deps.c +@@ -81,6 +81,7 @@ static const struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, + { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, ++ { X86_FEATURE_ITD, X86_FEATURE_HFI }, + { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, + {} + }; +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 708c87b88cc..7353bb119e7 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -52,6 +52,7 @@ + #include + #include + #include ++#include + #include + + #include "process.h" +@@ -214,6 +215,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + /* Load the Intel cache allocation PQR MSR. */ + resctrl_sched_in(next_p); + ++ reset_hardware_history(); ++ + return prev_p; + } + +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 33b268747bb..202a6735c09 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -54,6 +54,7 @@ + #include + #include + #include ++#include + #include + #include + #ifdef CONFIG_IA32_EMULATION +@@ -661,6 +662,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + /* Load the Intel cache allocation PQR MSR. */ + resctrl_sched_in(next_p); + ++ reset_hardware_history(); ++ + return prev_p; + } + +diff --git a/arch/x86/kernel/sched_ipcc.c b/arch/x86/kernel/sched_ipcc.c +new file mode 100644 +index 00000000000..dd73fc8be49 +--- /dev/null ++++ b/arch/x86/kernel/sched_ipcc.c +@@ -0,0 +1,93 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * Intel support for scheduler IPC classes ++ * ++ * Copyright (c) 2023, Intel Corporation. ++ * ++ * Author: Ricardo Neri ++ * ++ * On hybrid processors, the architecture differences between types of CPUs ++ * lead to different number of retired instructions per cycle (IPC). IPCs may ++ * differ further by classes of instructions. ++ * ++ * The scheduler assigns an IPC class to every task with arch_update_ipcc() ++ * from data that hardware provides. Implement this interface for x86. ++ * ++ * See kernel/sched/sched.h for details. ++ */ ++ ++#include ++ ++#include ++#include ++ ++#define CLASS_DEBOUNCER_SKIPS 4 ++ ++/** ++ * debounce_and_update_class() - Process and update a task's classification ++ * ++ * @p: The task of which the classification will be updated ++ * @new_ipcc: The new IPC classification ++ * ++ * Update the classification of @p with the new value that hardware provides. ++ * Only update the classification of @p if it has been the same during ++ * CLASS_DEBOUNCER_SKIPS consecutive ticks. ++ */ ++static void debounce_and_update_class(struct task_struct *p, u8 new_ipcc) ++{ ++ u16 debounce_skip; ++ ++ /* The class of @p changed. Only restart the debounce counter. 
*/ ++ if (p->ipcc_tmp != new_ipcc) { ++ p->ipcc_cntr = 1; ++ goto out; ++ } ++ ++ /* ++ * The class of @p did not change. Update it if it has been the same ++ * for CLASS_DEBOUNCER_SKIPS user ticks. ++ */ ++ debounce_skip = p->ipcc_cntr + 1; ++ if (debounce_skip < CLASS_DEBOUNCER_SKIPS) ++ p->ipcc_cntr++; ++ else ++ p->ipcc = new_ipcc; ++ ++out: ++ p->ipcc_tmp = new_ipcc; ++} ++ ++static bool classification_is_accurate(u8 hfi_class, bool smt_siblings_idle) ++{ ++ switch (boot_cpu_data.x86_model) { ++ case INTEL_FAM6_ALDERLAKE: ++ case INTEL_FAM6_ALDERLAKE_L: ++ case INTEL_FAM6_RAPTORLAKE: ++ case INTEL_FAM6_RAPTORLAKE_P: ++ case INTEL_FAM6_RAPTORLAKE_S: ++ if (hfi_class == 3 || hfi_class == 2 || smt_siblings_idle) ++ return true; ++ ++ return false; ++ ++ default: ++ return false; ++ } ++} ++ ++void intel_update_ipcc(struct task_struct *curr) ++{ ++ u8 hfi_class; ++ bool idle; ++ ++ if (intel_hfi_read_classid(&hfi_class)) ++ return; ++ ++ /* ++ * 0 is a valid classification for Intel Thread Director. A scheduler ++ * IPCC class of 0 means that the task is unclassified. Adjust. ++ */ ++ idle = sched_smt_siblings_idle(task_cpu(curr)); ++ if (classification_is_accurate(hfi_class, idle)) ++ debounce_and_update_class(curr, hfi_class + 1); ++} +diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig +index b43953b5539..03da183ff99 100644 +--- a/drivers/thermal/intel/Kconfig ++++ b/drivers/thermal/intel/Kconfig +@@ -109,6 +109,7 @@ config INTEL_HFI_THERMAL + depends on CPU_SUP_INTEL + depends on X86_THERMAL_VECTOR + select THERMAL_NETLINK ++ select IPC_CLASSES + help + Select this option to enable the Hardware Feedback Interface. If + selected, hardware provides guidance to the operating system on +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index 3b04c6ec4fc..b791906914b 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -30,9 +30,12 @@ + #include + #include + #include ++#include + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -41,6 +44,7 @@ + #include + #include + ++#include + #include + + #include "intel_hfi.h" +@@ -48,32 +52,20 @@ + + #include "../thermal_netlink.h" + +-/* Hardware Feedback Interface MSR configuration bits */ +-#define HW_FEEDBACK_PTR_VALID_BIT BIT(0) +-#define HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT BIT(0) + + /* CPUID detection and enumeration definitions for HFI */ + + #define CPUID_HFI_LEAF 6 + +-union hfi_capabilities { ++union hfi_thread_feedback_char_msr { + struct { +- u8 performance:1; +- u8 energy_efficiency:1; +- u8 __reserved:6; ++ u64 classid : 8; ++ u64 __reserved : 55; ++ u64 valid : 1; + } split; +- u8 bits; ++ u64 full; + }; + +-union cpuid6_edx { +- struct { +- union hfi_capabilities capabilities; +- u32 table_pages:4; +- u32 __reserved:4; +- s32 index:16; +- } split; +- u32 full; +-}; + + /** + * struct hfi_cpu_data - HFI capabilities per CPU +@@ -81,32 +73,17 @@ union cpuid6_edx { + * @ee_cap: Energy efficiency capability + * + * Capabilities of a logical processor in the HFI table. These capabilities are +- * unitless. ++ * unitless and specific to each HFI class. + */ + struct hfi_cpu_data { + u8 perf_cap; + u8 ee_cap; + } __packed; + +-/** +- * struct hfi_hdr - Header of the HFI table +- * @perf_updated: Hardware updated performance capabilities +- * @ee_updated: Hardware updated energy efficiency capabilities +- * +- * Properties of the data in an HFI table. 
+- */ +-struct hfi_hdr { +- u8 perf_updated; +- u8 ee_updated; +-} __packed; + + /** + * struct hfi_instance - Representation of an HFI instance (i.e., a table) +- * @local_table: Base of the local copy of the HFI table +- * @timestamp: Timestamp of the last update of the local table. +- * Located at the base of the local table. +- * @hdr: Base address of the header of the local table +- * @data: Base address of the data of the local table ++ * @local_table: Local copy of HFI table for this instance + * @cpus: CPUs represented in this HFI table instance + * @hw_table: Pointer to the HFI table of this instance + * @update_work: Delayed work to process HFI updates +@@ -116,12 +93,7 @@ struct hfi_hdr { + * A set of parameters to parse and navigate a specific HFI table. + */ + struct hfi_instance { +- union { +- void *local_table; +- u64 *timestamp; +- }; +- void *hdr; +- void *data; ++ struct hfi_table local_table; + cpumask_var_t cpus; + void *hw_table; + struct delayed_work update_work; +@@ -129,20 +101,6 @@ struct hfi_instance { + raw_spinlock_t event_lock; + }; + +-/** +- * struct hfi_features - Supported HFI features +- * @nr_table_pages: Size of the HFI table in 4KB pages +- * @cpu_stride: Stride size to locate the capability data of a logical +- * processor within the table (i.e., row stride) +- * @hdr_size: Size of the table header +- * +- * Parameters and supported features that are common to all HFI instances +- */ +-struct hfi_features { +- size_t nr_table_pages; +- unsigned int cpu_stride; +- unsigned int hdr_size; +-}; + + /** + * struct hfi_cpu_info - Per-CPU attributes to consume HFI data +@@ -159,6 +117,7 @@ struct hfi_cpu_info { + static DEFINE_PER_CPU(struct hfi_cpu_info, hfi_cpu_info) = { .index = -1 }; + + static int max_hfi_instances; ++static int hfi_clients_nr; + static struct hfi_instance *hfi_instances; + + static struct hfi_features hfi_features; +@@ -168,6 +127,139 @@ static struct workqueue_struct *hfi_updates_wq; + #define HFI_UPDATE_INTERVAL HZ + #define HFI_MAX_THERM_NOTIFY_COUNT 16 + ++/* ++ * A task may be unclassified if it has been recently created, spend most of ++ * its lifetime sleeping, or hardware has not provided a classification. ++ * ++ * Most tasks will be classified as scheduler's IPC class 1 (HFI class 0) ++ * eventually. Meanwhile, the scheduler will place classes of tasks with higher ++ * IPC scores on higher-performance CPUs. ++ * ++ * IPC class 1 is a reasonable choice. It matches the performance capability ++ * of the legacy, classless, HFI table. ++ */ ++#define HFI_UNCLASSIFIED_DEFAULT 1 ++ ++/* A cache of the HFI perf capabilities for lockless access. */ ++static int __percpu *hfi_ipcc_scores; ++/* Sequence counter for hfi_ipcc_scores */ ++static seqcount_t hfi_ipcc_seqcount = SEQCNT_ZERO(hfi_ipcc_seqcount); ++ ++static int alloc_hfi_ipcc_scores(void) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_ITD)) ++ return 0; ++ ++ hfi_ipcc_scores = __alloc_percpu(sizeof(*hfi_ipcc_scores) * ++ hfi_features.nr_classes, ++ sizeof(*hfi_ipcc_scores)); ++ ++ return hfi_ipcc_scores ? 0 : -ENOMEM; ++} ++ ++unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu) ++{ ++ int *scores, score; ++ unsigned long seq; ++ ++ scores = per_cpu_ptr(hfi_ipcc_scores, cpu); ++ if (!scores) ++ return -ENODEV; ++ ++ if (cpu < 0 || cpu >= nr_cpu_ids) ++ return -EINVAL; ++ ++ if (ipcc == IPC_CLASS_UNCLASSIFIED) ++ ipcc = HFI_UNCLASSIFIED_DEFAULT; ++ ++ /* ++ * Scheduler IPC classes start at 1. HFI classes start at 0. ++ * See note intel_hfi_update_ipcc(). 
++ */ ++ if (ipcc >= hfi_features.nr_classes + 1) ++ return -EINVAL; ++ ++ /* ++ * The seqcount implies load-acquire semantics to order loads with ++ * lockless stores of the write side in set_hfi_ipcc_score(). It ++ * also implies a compiler barrier. ++ */ ++ do { ++ seq = read_seqcount_begin(&hfi_ipcc_seqcount); ++ /* @ipcc is never 0. */ ++ score = scores[ipcc - 1]; ++ } while (read_seqcount_retry(&hfi_ipcc_seqcount, seq)); ++ ++ return score; ++} ++ ++static void set_hfi_ipcc_scores(struct hfi_instance *hfi_instance) ++{ ++ int cpu; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_ITD)) ++ return; ++ ++ /* ++ * Serialize with writes to the HFI table. It also protects the write ++ * loop against seqcount readers running in interrupt context. ++ */ ++ raw_spin_lock_irq(&hfi_instance->table_lock); ++ /* ++ * The seqcount implies store-release semantics to order stores with ++ * lockless loads from the seqcount read side in ++ * intel_hfi_get_ipcc_score(). It also implies a compiler barrier. ++ */ ++ write_seqcount_begin(&hfi_ipcc_seqcount); ++ for_each_cpu(cpu, hfi_instance->cpus) { ++ int c, *scores; ++ s16 index; ++ ++ index = per_cpu(hfi_cpu_info, cpu).index; ++ scores = per_cpu_ptr(hfi_ipcc_scores, cpu); ++ ++ for (c = 0; c < hfi_features.nr_classes; c++) { ++ struct hfi_cpu_data *caps; ++ ++ caps = hfi_instance->local_table.data + ++ index * hfi_features.cpu_stride + ++ c * hfi_features.class_stride; ++ scores[c] = caps->perf_cap; ++ } ++ } ++ ++ write_seqcount_end(&hfi_ipcc_seqcount); ++ raw_spin_unlock_irq(&hfi_instance->table_lock); ++} ++ ++/** ++ * intel_hfi_read_classid() - Read the currrent classid ++ * @classid: Variable to which the classid will be written. ++ * ++ * Read the classification that Intel Thread Director has produced when this ++ * function is called. Thread classification must be enabled before calling ++ * this function. ++ * ++ * Return: 0 if the produced classification is valid. Error otherwise. ++ */ ++int intel_hfi_read_classid(u8 *classid) ++{ ++ union hfi_thread_feedback_char_msr msr; ++ ++ /* We should not be here if ITD is not supported. */ ++ if (!cpu_feature_enabled(X86_FEATURE_ITD)) { ++ pr_warn_once("task classification requested but not supported!"); ++ return -ENODEV; ++ } ++ ++ rdmsrl(MSR_IA32_HW_FEEDBACK_CHAR, msr.full); ++ if (!msr.split.valid) ++ return -EINVAL; ++ ++ *classid = msr.split.classid; ++ return 0; ++} ++ + static void get_hfi_caps(struct hfi_instance *hfi_instance, + struct thermal_genl_cpu_caps *cpu_caps) + { +@@ -179,7 +271,7 @@ static void get_hfi_caps(struct hfi_instance *hfi_instance, + s16 index; + + index = per_cpu(hfi_cpu_info, cpu).index; +- caps = hfi_instance->data + index * hfi_features.cpu_stride; ++ caps = hfi_instance->local_table.data + index * hfi_features.cpu_stride; + cpu_caps[i].cpu = cpu; + + /* +@@ -235,6 +327,8 @@ static void update_capabilities(struct hfi_instance *hfi_instance) + thermal_genl_cpu_capability_event(cpu_count, &cpu_caps[i]); + + kfree(cpu_caps); ++ ++ set_hfi_ipcc_scores(hfi_instance); + out: + mutex_unlock(&hfi_instance_lock); + } +@@ -296,7 +390,7 @@ void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) + * where a lagging CPU entered the locked region. 
+ */ + new_timestamp = *(u64 *)hfi_instance->hw_table; +- if (*hfi_instance->timestamp == new_timestamp) { ++ if (*hfi_instance->local_table.timestamp == new_timestamp) { + thermal_clear_package_intr_status(PACKAGE_LEVEL, PACKAGE_THERM_STATUS_HFI_UPDATED); + raw_spin_unlock(&hfi_instance->event_lock); + return; +@@ -308,7 +402,7 @@ void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) + * Copy the updated table into our local copy. This includes the new + * timestamp. + */ +- memcpy(hfi_instance->local_table, hfi_instance->hw_table, ++ memcpy(hfi_instance->local_table.base_addr, hfi_instance->hw_table, + hfi_features.nr_table_pages << PAGE_SHIFT); + + /* +@@ -337,17 +431,18 @@ static void init_hfi_cpu_index(struct hfi_cpu_info *info) + } + + /* +- * The format of the HFI table depends on the number of capabilities that the +- * hardware supports. Keep a data structure to navigate the table. ++ * The format of the HFI table depends on the number of capabilities and classes ++ * that the hardware supports. Keep a data structure to navigate the table. + */ + static void init_hfi_instance(struct hfi_instance *hfi_instance) + { + /* The HFI header is below the time-stamp. */ +- hfi_instance->hdr = hfi_instance->local_table + +- sizeof(*hfi_instance->timestamp); ++ hfi_instance->local_table.hdr = hfi_instance->local_table.base_addr + ++ sizeof(*hfi_instance->local_table.timestamp); + + /* The HFI data starts below the header. */ +- hfi_instance->data = hfi_instance->hdr + hfi_features.hdr_size; ++ hfi_instance->local_table.data = hfi_instance->local_table.hdr + ++ hfi_features.hdr_size; + } + + /* Caller must hold hfi_instance_lock. */ +@@ -356,8 +451,13 @@ static void hfi_enable(void) + u64 msr_val; + + rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); +- msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT; ++ msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE; ++ ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) ++ msr_val |= HW_FEEDBACK_CONFIG_ITD_ENABLE; ++ + wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); ++ + } + + static void hfi_set_hw_table(struct hfi_instance *hfi_instance) +@@ -366,7 +466,7 @@ static void hfi_set_hw_table(struct hfi_instance *hfi_instance) + u64 msr_val; + + hw_table_pa = virt_to_phys(hfi_instance->hw_table); +- msr_val = hw_table_pa | HW_FEEDBACK_PTR_VALID_BIT; ++ msr_val = hw_table_pa | HW_FEEDBACK_PTR_VALID; + wrmsrl(MSR_IA32_HW_FEEDBACK_PTR, msr_val); + } + +@@ -377,7 +477,11 @@ static void hfi_disable(void) + int i; + + rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); +- msr_val &= ~HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT; ++ msr_val &= ~HW_FEEDBACK_CONFIG_HFI_ENABLE; ++ ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) ++ msr_val &= ~HW_FEEDBACK_CONFIG_ITD_ENABLE; ++ + wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); + + /* +@@ -396,6 +500,30 @@ static void hfi_disable(void) + } + } + ++static void hfi_enable_itd_classification(void) ++{ ++ u64 msr_val; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_ITD)) ++ return; ++ ++ rdmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val); ++ msr_val |= HW_FEEDBACK_THREAD_CONFIG_ENABLE; ++ wrmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val); ++} ++ ++static void hfi_disable_itd_classification(void) ++{ ++ u64 msr_val; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_ITD)) ++ return; ++ ++ rdmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val); ++ msr_val &= ~HW_FEEDBACK_THREAD_CONFIG_ENABLE; ++ wrmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val); ++} ++ + /** + * intel_hfi_online() - Enable HFI on @cpu + * @cpu: CPU in which the HFI will be enabled +@@ -436,6 +564,8 @@ void 
intel_hfi_online(unsigned int cpu) + + init_hfi_cpu_index(info); + ++ hfi_enable_itd_classification(); ++ + /* + * Now check if the HFI instance of the package/die of @cpu has been + * initialized (by checking its header). In such case, all we have to +@@ -443,7 +573,7 @@ void intel_hfi_online(unsigned int cpu) + * if needed. + */ + mutex_lock(&hfi_instance_lock); +- if (hfi_instance->hdr) ++ if (hfi_instance->local_table.hdr) + goto enable; + + /* +@@ -463,9 +593,9 @@ void intel_hfi_online(unsigned int cpu) + * Allocate memory to keep a local copy of the table that + * hardware generates. + */ +- hfi_instance->local_table = kzalloc(hfi_features.nr_table_pages << PAGE_SHIFT, +- GFP_KERNEL); +- if (!hfi_instance->local_table) ++ hfi_instance->local_table.base_addr = kzalloc(hfi_features.nr_table_pages << PAGE_SHIFT, ++ GFP_KERNEL); ++ if (!hfi_instance->local_table.base_addr) + goto free_hw_table; + + init_hfi_instance(hfi_instance); +@@ -477,11 +607,23 @@ void intel_hfi_online(unsigned int cpu) + enable: + cpumask_set_cpu(cpu, hfi_instance->cpus); + +- /* Enable this HFI instance if this is its first online CPU. */ +- if (cpumask_weight(hfi_instance->cpus) == 1) { ++ /* ++ * Enable this HFI instance if this is its first online CPU and ++ * there are user-space clients of thermal events. ++ */ ++ if (cpumask_weight(hfi_instance->cpus) == 1 && hfi_clients_nr > 0) { + hfi_set_hw_table(hfi_instance); + hfi_enable(); + } ++ /* ++ * We have all we need to support IPC classes. Task classification is ++ * now working. ++ * ++ * All class scores are zero until after the first HFI update. That is ++ * OK. The scheduler queries these scores at every load balance. ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) ++ sched_enable_ipc_classes(); + + unlock: + mutex_unlock(&hfi_instance_lock); +@@ -516,9 +658,11 @@ void intel_hfi_offline(unsigned int cpu) + if (!hfi_instance) + return; + +- if (!hfi_instance->hdr) ++ if (!hfi_instance->local_table.hdr) + return; + ++ hfi_disable_itd_classification(); ++ + mutex_lock(&hfi_instance_lock); + cpumask_clear_cpu(cpu, hfi_instance->cpus); + +@@ -557,44 +701,133 @@ static __init int hfi_parse_features(void) + /* The number of 4KB pages required by the table */ + hfi_features.nr_table_pages = edx.split.table_pages + 1; + ++ /* ++ * Capability fields of an HFI class are grouped together. Classes are ++ * contiguous in memory. Hence, use the number of supported features to ++ * locate a specific class. ++ */ ++ hfi_features.class_stride = nr_capabilities; ++ ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) { ++ union cpuid6_ecx ecx; ++ ++ ecx.full = cpuid_ecx(CPUID_HFI_LEAF); ++ hfi_features.nr_classes = ecx.split.nr_classes; ++ } else { ++ hfi_features.nr_classes = 1; ++ } ++ + /* + * The header contains change indications for each supported feature. + * The size of the table header is rounded up to be a multiple of 8 + * bytes. + */ +- hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities, 8) * 8; ++ hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities * ++ hfi_features.nr_classes, 8) * 8; + + /* + * Data of each logical processor is also rounded up to be a multiple + * of 8 bytes. + */ +- hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities, 8) * 8; ++ hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities * ++ hfi_features.nr_classes, 8) * 8; + + return 0; + } + +-static void hfi_do_enable(void) ++/* ++ * If concurrency is not prevented by other means, the HFI enable/disable ++ * routines must be called under hfi_instance_lock." 
++ */ ++static void hfi_enable_instance(void *ptr) ++{ ++ hfi_set_hw_table(ptr); ++ hfi_enable(); ++} ++ ++static void hfi_disable_instance(void *ptr) ++{ ++ hfi_disable(); ++} ++ ++static void hfi_syscore_resume(void) + { + /* This code runs only on the boot CPU. */ + struct hfi_cpu_info *info = &per_cpu(hfi_cpu_info, 0); + struct hfi_instance *hfi_instance = info->hfi_instance; + + /* No locking needed. There is no concurrency with CPU online. */ +- hfi_set_hw_table(hfi_instance); +- hfi_enable(); ++ if (hfi_clients_nr > 0) { ++ hfi_set_hw_table(hfi_instance); ++ hfi_enable_instance(hfi_instance); ++ hfi_enable_itd_classification(); ++ } + } + +-static int hfi_do_disable(void) ++static int hfi_syscore_suspend(void) + { + /* No locking needed. There is no concurrency with CPU offline. */ ++ ++ hfi_disable_itd_classification(); ++ + hfi_disable(); + + return 0; + } + + static struct syscore_ops hfi_pm_ops = { +- .resume = hfi_do_enable, +- .suspend = hfi_do_disable, ++ .resume = hfi_syscore_resume, ++ .suspend = hfi_syscore_suspend, ++}; ++ ++static int hfi_thermal_notify(struct notifier_block *nb, unsigned long state, ++ void *_notify) ++{ ++ struct thermal_genl_notify *notify = _notify; ++ struct hfi_instance *hfi_instance; ++ smp_call_func_t func = NULL; ++ unsigned int cpu; ++ int i; ++ ++ if (notify->mcgrp != THERMAL_GENL_EVENT_GROUP) ++ return NOTIFY_DONE; ++ ++ if (state != THERMAL_NOTIFY_BIND && state != THERMAL_NOTIFY_UNBIND) ++ return NOTIFY_DONE; ++ ++ mutex_lock(&hfi_instance_lock); ++ ++ switch (state) { ++ case THERMAL_NOTIFY_BIND: ++ if (++hfi_clients_nr == 1) ++ func = hfi_enable_instance; ++ break; ++ case THERMAL_NOTIFY_UNBIND: ++ if (--hfi_clients_nr == 0) ++ func = hfi_disable_instance; ++ break; ++ } ++ ++ if (!func) ++ goto out; ++ ++ for (i = 0; i < max_hfi_instances; i++) { ++ hfi_instance = &hfi_instances[i]; ++ if (cpumask_empty(hfi_instance->cpus)) ++ continue; ++ ++ cpu = cpumask_any(hfi_instance->cpus); ++ smp_call_function_single(cpu, func, hfi_instance, true); ++ } ++ ++out: ++ mutex_unlock(&hfi_instance_lock); ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block hfi_thermal_nb = { ++ .notifier_call = hfi_thermal_notify, + }; + + void __init intel_hfi_init(void) +@@ -628,10 +861,28 @@ void __init intel_hfi_init(void) + if (!hfi_updates_wq) + goto err_nomem; + ++ /* ++ * Both thermal core and Intel HFI can not be build as modules. ++ * As kernel build-in drivers they are initialized before user-space ++ * starts, hence we can not miss BIND/UNBIND events when applications ++ * add/remove thermal multicast group to/from a netlink socket. 
++ */ ++ if (thermal_genl_register_notifier(&hfi_thermal_nb)) ++ goto err_nl_notif; ++ + register_syscore_ops(&hfi_pm_ops); + ++ if (alloc_hfi_ipcc_scores()) ++ goto err_ipcc; ++ + return; + ++err_nl_notif: ++ destroy_workqueue(hfi_updates_wq); ++ ++err_ipcc: ++ destroy_workqueue(hfi_updates_wq); ++ + err_nomem: + for (j = 0; j < i; ++j) { + hfi_instance = &hfi_instances[j]; +diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c +index 76a231a2965..bef14ce69ec 100644 +--- a/drivers/thermal/thermal_netlink.c ++++ b/drivers/thermal/thermal_netlink.c +@@ -7,17 +7,13 @@ + * Generic netlink for thermal management framework + */ + #include ++#include + #include + #include + #include + + #include "thermal_core.h" + +-enum thermal_genl_multicast_groups { +- THERMAL_GENL_SAMPLING_GROUP = 0, +- THERMAL_GENL_EVENT_GROUP = 1, +-}; +- + static const struct genl_multicast_group thermal_genl_mcgrps[] = { + [THERMAL_GENL_SAMPLING_GROUP] = { .name = THERMAL_GENL_SAMPLING_GROUP_NAME, }, + [THERMAL_GENL_EVENT_GROUP] = { .name = THERMAL_GENL_EVENT_GROUP_NAME, }, +@@ -74,11 +70,12 @@ struct param { + + typedef int (*cb_t)(struct param *); + +-static struct genl_family thermal_gnl_family; ++static struct genl_family thermal_genl_family; ++static BLOCKING_NOTIFIER_HEAD(thermal_genl_chain); + + static int thermal_group_has_listeners(enum thermal_genl_multicast_groups group) + { +- return genl_has_listeners(&thermal_gnl_family, &init_net, group); ++ return genl_has_listeners(&thermal_genl_family, &init_net, group); + } + + /************************** Sampling encoding *******************************/ +@@ -95,7 +92,7 @@ int thermal_genl_sampling_temp(int id, int temp) + if (!skb) + return -ENOMEM; + +- hdr = genlmsg_put(skb, 0, 0, &thermal_gnl_family, 0, ++ hdr = genlmsg_put(skb, 0, 0, &thermal_genl_family, 0, + THERMAL_GENL_SAMPLING_TEMP); + if (!hdr) + goto out_free; +@@ -108,7 +105,7 @@ int thermal_genl_sampling_temp(int id, int temp) + + genlmsg_end(skb, hdr); + +- genlmsg_multicast(&thermal_gnl_family, skb, 0, THERMAL_GENL_SAMPLING_GROUP, GFP_KERNEL); ++ genlmsg_multicast(&thermal_genl_family, skb, 0, THERMAL_GENL_SAMPLING_GROUP, GFP_KERNEL); + + return 0; + out_cancel: +@@ -282,7 +279,7 @@ static int thermal_genl_send_event(enum thermal_genl_event event, + return -ENOMEM; + p->msg = msg; + +- hdr = genlmsg_put(msg, 0, 0, &thermal_gnl_family, 0, event); ++ hdr = genlmsg_put(msg, 0, 0, &thermal_genl_family, 0, event); + if (!hdr) + goto out_free_msg; + +@@ -292,7 +289,7 @@ static int thermal_genl_send_event(enum thermal_genl_event event, + + genlmsg_end(msg, hdr); + +- genlmsg_multicast(&thermal_gnl_family, msg, 0, THERMAL_GENL_EVENT_GROUP, GFP_KERNEL); ++ genlmsg_multicast(&thermal_genl_family, msg, 0, THERMAL_GENL_EVENT_GROUP, GFP_KERNEL); + + return 0; + +@@ -593,7 +590,7 @@ static int thermal_genl_cmd_dumpit(struct sk_buff *skb, + int ret; + void *hdr; + +- hdr = genlmsg_put(skb, 0, 0, &thermal_gnl_family, 0, cmd); ++ hdr = genlmsg_put(skb, 0, 0, &thermal_genl_family, 0, cmd); + if (!hdr) + return -EMSGSIZE; + +@@ -625,7 +622,7 @@ static int thermal_genl_cmd_doit(struct sk_buff *skb, + return -ENOMEM; + p.msg = msg; + +- hdr = genlmsg_put_reply(msg, info, &thermal_gnl_family, 0, cmd); ++ hdr = genlmsg_put_reply(msg, info, &thermal_genl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + +@@ -645,6 +642,27 @@ static int thermal_genl_cmd_doit(struct sk_buff *skb, + return ret; + } + ++static int thermal_genl_bind(int mcgrp) ++{ ++ struct thermal_genl_notify n = { .mcgrp = mcgrp }; 
++ ++ if (WARN_ON_ONCE(mcgrp > THERMAL_GENL_MAX_GROUP)) ++ return -EINVAL; ++ ++ blocking_notifier_call_chain(&thermal_genl_chain, THERMAL_NOTIFY_BIND, &n); ++ return 0; ++} ++ ++static void thermal_genl_unbind(int mcgrp) ++{ ++ struct thermal_genl_notify n = { .mcgrp = mcgrp }; ++ ++ if (WARN_ON_ONCE(mcgrp > THERMAL_GENL_MAX_GROUP)) ++ return; ++ ++ blocking_notifier_call_chain(&thermal_genl_chain, THERMAL_NOTIFY_UNBIND, &n); ++} ++ + static const struct genl_small_ops thermal_genl_ops[] = { + { + .cmd = THERMAL_GENL_CMD_TZ_GET_ID, +@@ -673,12 +691,14 @@ static const struct genl_small_ops thermal_genl_ops[] = { + }, + }; + +-static struct genl_family thermal_gnl_family __ro_after_init = { ++static struct genl_family thermal_genl_family __ro_after_init = { + .hdrsize = 0, + .name = THERMAL_GENL_FAMILY_NAME, + .version = THERMAL_GENL_VERSION, + .maxattr = THERMAL_GENL_ATTR_MAX, + .policy = thermal_genl_policy, ++ .bind = thermal_genl_bind, ++ .unbind = thermal_genl_unbind, + .small_ops = thermal_genl_ops, + .n_small_ops = ARRAY_SIZE(thermal_genl_ops), + .resv_start_op = THERMAL_GENL_CMD_CDEV_GET + 1, +@@ -686,12 +706,22 @@ static struct genl_family thermal_gnl_family __ro_after_init = { + .n_mcgrps = ARRAY_SIZE(thermal_genl_mcgrps), + }; + ++int thermal_genl_register_notifier(struct notifier_block *nb) ++{ ++ return blocking_notifier_chain_register(&thermal_genl_chain, nb); ++} ++ ++int thermal_genl_unregister_notifier(struct notifier_block *nb) ++{ ++ return blocking_notifier_chain_unregister(&thermal_genl_chain, nb); ++} ++ + int __init thermal_netlink_init(void) + { +- return genl_register_family(&thermal_gnl_family); ++ return genl_register_family(&thermal_genl_family); + } + + void __init thermal_netlink_exit(void) + { +- genl_unregister_family(&thermal_gnl_family); ++ genl_unregister_family(&thermal_genl_family); + } +diff --git a/drivers/thermal/thermal_netlink.h b/drivers/thermal/thermal_netlink.h +index 93a927e144d..e01221e8816 100644 +--- a/drivers/thermal/thermal_netlink.h ++++ b/drivers/thermal/thermal_netlink.h +@@ -10,6 +10,19 @@ struct thermal_genl_cpu_caps { + int efficiency; + }; + ++enum thermal_genl_multicast_groups { ++ THERMAL_GENL_SAMPLING_GROUP = 0, ++ THERMAL_GENL_EVENT_GROUP = 1, ++ THERMAL_GENL_MAX_GROUP = THERMAL_GENL_EVENT_GROUP, ++}; ++ ++#define THERMAL_NOTIFY_BIND 0 ++#define THERMAL_NOTIFY_UNBIND 1 ++ ++struct thermal_genl_notify { ++ int mcgrp; ++}; ++ + struct thermal_zone_device; + struct thermal_trip; + struct thermal_cooling_device; +@@ -18,6 +31,9 @@ struct thermal_cooling_device; + #ifdef CONFIG_THERMAL_NETLINK + int __init thermal_netlink_init(void); + void __init thermal_netlink_exit(void); ++int thermal_genl_register_notifier(struct notifier_block *nb); ++int thermal_genl_unregister_notifier(struct notifier_block *nb); ++ + int thermal_notify_tz_create(const struct thermal_zone_device *tz); + int thermal_notify_tz_delete(const struct thermal_zone_device *tz); + int thermal_notify_tz_enable(const struct thermal_zone_device *tz); +@@ -48,6 +64,16 @@ static inline int thermal_notify_tz_create(const struct thermal_zone_device *tz) + return 0; + } + ++static inline int thermal_genl_register_notifier(struct notifier_block *nb) ++{ ++ return 0; ++} ++ ++static inline int thermal_genl_unregister_notifier(struct notifier_block *nb) ++{ ++ return 0; ++} ++ + static inline int thermal_notify_tz_delete(const struct thermal_zone_device *tz) + { + return 0; +diff --git a/include/linux/sched.h b/include/linux/sched.h +index ffe8f618ab8..8d458554bae 100644 +--- 
a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -137,6 +137,8 @@ struct user_event_mm; + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) + ++#define IPC_CLASS_UNCLASSIFIED 0 ++ + #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) + + #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) +@@ -301,7 +303,7 @@ enum { + TASK_COMM_LEN = 16, + }; + +-extern void scheduler_tick(void); ++extern void scheduler_tick(bool user_tick); + + #define MAX_SCHEDULE_TIMEOUT LONG_MAX + +@@ -1547,6 +1549,24 @@ struct task_struct { + struct user_event_mm *user_event_mm; + #endif + ++#ifdef CONFIG_IPC_CLASSES ++ /* ++ * A hardware-defined classification of task that reflects but is ++ * not identical to the number of instructions per cycle. ++ */ ++ unsigned int ipcc : 9; ++ /* ++ * A candidate classification that arch-specific implementations ++ * qualify for correctness. ++ */ ++ unsigned int ipcc_tmp : 9; ++ /* ++ * Counter to filter out transient candidate classifications ++ * of a task. ++ */ ++ unsigned int ipcc_cntr : 14; ++#endif ++ + /* + * New fields for task_struct should be added above here, so that + * they are included in the randomized portion of task_struct. +@@ -2183,4 +2203,6 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } + + extern void sched_set_stop_task(int cpu, struct task_struct *stop); + ++extern bool sched_smt_siblings_idle(int cpu); ++ + #endif +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index a6e04b4a21d..f32fce3fc8e 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -292,4 +292,10 @@ static inline int task_node(const struct task_struct *p) + return cpu_to_node(task_cpu(p)); + } + ++#ifdef CONFIG_IPC_CLASSES ++extern void sched_enable_ipc_classes(void); ++#else ++static inline void sched_enable_ipc_classes(void) { } ++#endif ++ + #endif /* _LINUX_SCHED_TOPOLOGY_H */ +diff --git a/init/Kconfig b/init/Kconfig +index bee58f7468c..3447c10cbdd 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -849,6 +849,18 @@ config UCLAMP_BUCKETS_COUNT + + If in doubt, use the default value. + ++config IPC_CLASSES ++ bool "IPC classes of tasks" ++ depends on SMP ++ help ++ If selected, each task is assigned a classification value that ++ reflects the type of instructions that the task executes. This ++ classification reflects but is not equal to the number of ++ instructions retired per cycle. ++ ++ The scheduler uses the classification value to improve the placement ++ of tasks. ++ + endmenu + + # +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 9116bcc9034..5e07149813c 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4515,6 +4515,11 @@ int wake_up_state(struct task_struct *p, unsigned int state) + */ + static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + { ++#ifdef CONFIG_IPC_CLASSES ++ p->ipcc = IPC_CLASS_UNCLASSIFIED; ++ p->ipcc_tmp = IPC_CLASS_UNCLASSIFIED; ++ p->ipcc_cntr = 0; ++#endif + p->on_rq = 0; + + p->se.on_rq = 0; +@@ -5653,7 +5658,7 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. 
+ */ +-void scheduler_tick(void) ++void scheduler_tick(bool user_tick) + { + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); +@@ -5665,6 +5670,9 @@ void scheduler_tick(void) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + arch_scale_freq_tick(); + ++ if (sched_ipcc_enabled() && user_tick) ++ arch_update_ipcc(curr); ++ + sched_clock_tick(); + + rq_lock(rq, &rf); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 533547e3c90..38e0acfefb0 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1305,7 +1305,14 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) + * Scheduling class queueing methods: + */ + +-static inline bool is_core_idle(int cpu) ++/** ++ * sched_smt_siblings_idle - Check whether SMT siblings of a CPU are idle ++ * @cpu: The CPU to check ++ * ++ * Returns true if all the SMT siblings of @cpu are idle or @cpu does not have ++ * SMT siblings. The idle state of @cpu is not considered. ++ */ ++bool sched_smt_siblings_idle(int cpu) + { + #ifdef CONFIG_SCHED_SMT + int sibling; +@@ -2008,7 +2015,7 @@ static inline int numa_idle_core(int idle_core, int cpu) + * Prefer cores instead of packing HT siblings + * and triggering future load balancing. + */ +- if (is_core_idle(cpu)) ++ if (sched_smt_siblings_idle(cpu)) + idle_core = cpu; + + return idle_core; +@@ -9449,6 +9456,13 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_IPC_CLASSES ++ unsigned long min_score; /* Min(score(rq->curr->ipcc)) */ ++ unsigned short min_ipcc; /* Class of the task with the minimum IPCC score in the rq */ ++ unsigned long sum_score; /* Sum(score(rq->curr->ipcc)) */ ++ long ipcc_score_after; /* Prospective IPCC score after load balancing */ ++ unsigned long ipcc_score_before; /* IPCC score before load balancing */ ++#endif + }; + + /* +@@ -9727,6 +9741,248 @@ group_type group_classify(unsigned int imbalance_pct, + return group_has_spare; + } + ++#ifdef CONFIG_IPC_CLASSES ++static void init_rq_ipcc_stats(struct sg_lb_stats *sgs) ++{ ++ /* All IPCC stats have been set to zero in update_sg_lb_stats(). */ ++ sgs->min_score = ULONG_MAX; ++} ++ ++static int rq_last_task_ipcc(int dst_cpu, struct rq *rq, unsigned short *ipcc) ++{ ++ struct list_head *tasks = &rq->cfs_tasks; ++ struct task_struct *p; ++ struct rq_flags rf; ++ int ret = -EINVAL; ++ ++ rq_lock_irqsave(rq, &rf); ++ if (list_empty(tasks)) ++ goto out; ++ ++ p = list_last_entry(tasks, struct task_struct, se.group_node); ++ if (p->flags & PF_EXITING || is_idle_task(p) || ++ !cpumask_test_cpu(dst_cpu, p->cpus_ptr)) ++ goto out; ++ ++ ret = 0; ++ *ipcc = p->ipcc; ++out: ++ rq_unlock(rq, &rf); ++ return ret; ++} ++ ++/* Called only if cpu_of(@rq) is not idle and has tasks running. */ ++static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, ++ struct rq *rq) ++{ ++ unsigned short ipcc; ++ unsigned long score; ++ ++ if (!sched_ipcc_enabled()) ++ return; ++ ++ if (rq_last_task_ipcc(dst_cpu, rq, &ipcc)) ++ return; ++ ++ score = arch_get_ipcc_score(ipcc, cpu_of(rq)); ++ ++ /* ++ * Ignore tasks with invalid scores. When finding the busiest group, we ++ * prefer those with higher sum_score. This group will not be selected. 
++ */ ++ if (IS_ERR_VALUE(score)) ++ return; ++ ++ sgs->sum_score += score; ++ ++ if (score < sgs->min_score) { ++ sgs->min_score = score; ++ sgs->min_ipcc = ipcc; ++ } ++} ++ ++static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, ++ struct sched_group *sg, ++ struct lb_env *env) ++{ ++ unsigned long score_on_dst_cpu, before; ++ int busy_cpus; ++ long after; ++ ++ if (!sched_ipcc_enabled()) ++ return; ++ ++ /* ++ * IPCC scores are only useful during idle load balancing. For now, ++ * only asym_packing uses IPCC scores. ++ */ ++ if (!(env->sd->flags & SD_ASYM_PACKING) || ++ env->idle == CPU_NOT_IDLE) ++ return; ++ ++ /* ++ * IPCC scores are used to break ties only between these types of ++ * groups. ++ */ ++ if (sgs->group_type != group_fully_busy && ++ sgs->group_type != group_asym_packing) ++ return; ++ ++ busy_cpus = sgs->group_weight - sgs->idle_cpus; ++ ++ /* No busy CPUs in the group. No tasks to move. */ ++ if (!busy_cpus) ++ return; ++ ++ score_on_dst_cpu = arch_get_ipcc_score(sgs->min_ipcc, env->dst_cpu); ++ ++ /* ++ * Do not use IPC scores. sgs::ipcc_score_{after, before} will be zero ++ * and not used. ++ */ ++ if (IS_ERR_VALUE(score_on_dst_cpu)) ++ return; ++ ++ before = sgs->sum_score; ++ after = before - sgs->min_score; ++ ++ /* SMT siblings share throughput. */ ++ if (busy_cpus > 1 && sg->flags & SD_SHARE_CPUCAPACITY) { ++ before /= busy_cpus; ++ /* One sibling will become idle after load balance. */ ++ after /= busy_cpus - 1; ++ } ++ ++ sgs->ipcc_score_after = after + score_on_dst_cpu; ++ sgs->ipcc_score_before = before; ++} ++ ++/** ++ * sched_asym_ipcc_prefer - Select a sched group based on its IPCC score ++ * @a: Load balancing statistics of a sched group ++ * @b: Load balancing statistics of a second sched group ++ * ++ * Returns: true if @a has a higher IPCC score than @b after load balance. ++ * False otherwise. ++ */ ++static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a, ++ struct sg_lb_stats *b) ++{ ++ if (!sched_ipcc_enabled()) ++ return false; ++ ++ /* @a increases overall throughput after load balance. */ ++ if (a->ipcc_score_after > b->ipcc_score_after) ++ return true; ++ ++ /* ++ * If @a and @b yield the same overall throughput, pick @a if ++ * its current throughput is lower than that of @b. ++ */ ++ if (a->ipcc_score_after == b->ipcc_score_after) ++ return a->ipcc_score_before < b->ipcc_score_before; ++ ++ return false; ++} ++ ++/** ++ * sched_asym_ipcc_pick - Select a sched group based on its IPCC score ++ * @a: A scheduling group ++ * @b: A second scheduling group ++ * @a_stats: Load balancing statistics of @a ++ * @b_stats: Load balancing statistics of @b ++ * ++ * Returns: true if @a has the same priority and @a has tasks with IPC classes ++ * that yield higher overall throughput after load balance. False otherwise. ++ */ ++static bool sched_asym_ipcc_pick(struct sched_group *a, ++ struct sched_group *b, ++ struct sg_lb_stats *a_stats, ++ struct sg_lb_stats *b_stats) ++{ ++ /* ++ * Only use the class-specific preference selection if both sched ++ * groups have the same priority. ++ */ ++ if (arch_asym_cpu_priority(a->asym_prefer_cpu) != ++ arch_asym_cpu_priority(b->asym_prefer_cpu)) ++ return false; ++ ++ return sched_asym_ipcc_prefer(a_stats, b_stats); ++} ++ ++/** ++ * ipcc_score_delta - Get the IPCC score delta wrt the load balance's dst_cpu ++ * @rq: A runqueue ++ * @env: Load balancing environment ++ * ++ * Returns: The IPCC score delta that the last task enqueued in @rq would get ++ * if placed in the destination CPU of @env. 
LONG_MIN to indicate that the ++ * delta should not be used. ++ */ ++static long ipcc_score_delta(struct rq *rq, struct lb_env *env) ++{ ++ unsigned long score_src, score_dst; ++ unsigned short ipcc; ++ ++ if (!sched_ipcc_enabled()) ++ return LONG_MIN; ++ ++ /* Only asym_packing uses IPCC scores at the moment. */ ++ if (!(env->sd->flags & SD_ASYM_PACKING)) ++ return LONG_MIN; ++ ++ if (rq_last_task_ipcc(env->dst_cpu, rq, &ipcc)) ++ return LONG_MIN; ++ ++ score_dst = arch_get_ipcc_score(ipcc, env->dst_cpu); ++ if (IS_ERR_VALUE(score_dst)) ++ return LONG_MIN; ++ ++ score_src = arch_get_ipcc_score(ipcc, cpu_of(rq)); ++ if (IS_ERR_VALUE(score_src)) ++ return LONG_MIN; ++ ++ return score_dst - score_src; ++} ++ ++#else /* CONFIG_IPC_CLASSES */ ++static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, ++ struct rq *rq) ++{ ++} ++ ++static void init_rq_ipcc_stats(struct sg_lb_stats *sgs) ++{ ++} ++ ++static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, ++ struct sched_group *sg, ++ struct lb_env *env) ++{ ++} ++ ++static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a, ++ struct sg_lb_stats *b) ++{ ++ return false; ++} ++ ++static bool sched_asym_ipcc_pick(struct sched_group *a, ++ struct sched_group *b, ++ struct sg_lb_stats *a_stats, ++ struct sg_lb_stats *b_stats) ++{ ++ return false; ++} ++ ++static long ipcc_score_delta(struct rq *rq, struct lb_env *env) ++{ ++ return LONG_MIN; ++} ++ ++#endif /* CONFIG_IPC_CLASSES */ ++ + /** + * sched_use_asym_prio - Check whether asym_packing priority must be used + * @sd: The scheduling domain of the load balancing +@@ -9743,7 +9999,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) + if (!sched_smt_active()) + return true; + +- return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); ++ return sd->flags & SD_SHARE_CPUCAPACITY || sched_smt_siblings_idle(cpu); + } + + /** +@@ -9882,6 +10138,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + int i, nr_running, local_group; + + memset(sgs, 0, sizeof(*sgs)); ++ init_rq_ipcc_stats(sgs); + + local_group = group == sds->local; + +@@ -9931,6 +10188,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (sgs->group_misfit_task_load < load) + sgs->group_misfit_task_load = load; + } ++ ++ update_sg_lb_ipcc_stats(env->dst_cpu, sgs, rq); + } + + sgs->group_capacity = group->sgc->capacity; +@@ -9950,6 +10209,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ if (!local_group) ++ update_sg_lb_stats_scores(sgs, group, env); ++ + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +@@ -10021,6 +10283,16 @@ static bool update_sd_pick_busiest(struct lb_env *env, + /* Prefer to move from lowest priority CPU's work */ + if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) + return false; ++ ++ /* ++ * Unlike other callers of sched_asym_prefer(), here both @sg ++ * and @sds::busiest have tasks running. When they have equal ++ * priority, their IPC class scores can be used to select a ++ * better busiest. 
++ */ ++ if (sched_asym_ipcc_pick(sds->busiest, sg, &sds->busiest_stat, sgs)) ++ return false; ++ + break; + + case group_misfit_task: +@@ -10061,10 +10333,21 @@ static bool update_sd_pick_busiest(struct lb_env *env, + if (sgs->avg_load == busiest->avg_load) { + /* + * SMT sched groups need more help than non-SMT groups. +- * If @sg happens to also be SMT, either choice is good. + */ +- if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) +- return false; ++ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) { ++ if (!(sg->flags & SD_SHARE_CPUCAPACITY)) ++ return false; ++ ++ /* ++ * Between two SMT groups, use IPCC scores to pick the ++ * one that would improve throughput the most (only ++ * asym_packing uses IPCC scores for now). ++ */ ++ if (sched_ipcc_enabled() && ++ env->sd->flags & SD_ASYM_PACKING && ++ sched_asym_ipcc_prefer(busiest, sgs)) ++ return false; ++ } + } + + break; +@@ -10981,6 +11264,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, + { + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; ++ long busiest_ipcc_delta = LONG_MIN; + unsigned int busiest_nr = 0; + int i; + +@@ -11097,6 +11381,26 @@ static struct rq *find_busiest_queue(struct lb_env *env, + if (busiest_nr < nr_running) { + busiest_nr = nr_running; + busiest = rq; ++ ++ /* ++ * Remember the IPCC score of the busiest ++ * runqueue. We may need it to break a tie with ++ * other queues with equal nr_running. ++ */ ++ busiest_ipcc_delta = ipcc_score_delta(busiest, env); ++ /* ++ * For ties, select @rq if doing would give its last ++ * queued task a bigger IPC boost when migrated to ++ * dst_cpu. ++ */ ++ } else if (busiest_nr == nr_running) { ++ long delta = ipcc_score_delta(rq, env); ++ ++ if (busiest_ipcc_delta < delta) { ++ busiest_ipcc_delta = delta; ++ busiest_nr = nr_running; ++ busiest = rq; ++ } + } + break; + +@@ -11228,7 +11532,7 @@ static int should_we_balance(struct lb_env *env) + * balancing cores, but remember the first idle SMT CPU for + * later consideration. Find CPU on an idle core first. + */ +- if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { ++ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !sched_smt_siblings_idle(cpu)) { + if (idle_smt == -1) + idle_smt = cpu; + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 001fe047bd5..b741fca335b 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2622,6 +2622,72 @@ void arch_scale_freq_tick(void) + } + #endif + ++#ifdef CONFIG_IPC_CLASSES ++DECLARE_STATIC_KEY_FALSE(sched_ipcc); ++ ++static inline bool sched_ipcc_enabled(void) ++{ ++ return static_branch_unlikely(&sched_ipcc); ++} ++ ++#ifndef arch_update_ipcc ++/** ++ * arch_update_ipcc() - Update the IPC class of the current task ++ * @curr: The current task ++ * ++ * Request that the IPC classification of @curr is updated. ++ * ++ * Returns: none ++ */ ++static __always_inline ++void arch_update_ipcc(struct task_struct *curr) ++{ ++} ++#endif ++ ++#ifndef arch_get_ipcc_score ++ ++#define SCHED_IPCC_SCORE_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) ++/** ++ * arch_get_ipcc_score() - Get the IPC score of a class of task ++ * @ipcc: The IPC class ++ * @cpu: A CPU number ++ * ++ * The IPC performance scores reflects (but it is not identical to) the number ++ * of instructions retired per cycle for a given IPC class. It is a linear and ++ * abstract metric. Higher scores reflect better performance. 
++ * ++ * The IPC score can be normalized with respect to the class, i, with the ++ * highest IPC score on the CPU, c, with highest performance: ++ * ++ * IPC(i, c) ++ * ------------------------------------ * SCHED_IPCC_SCORE_SCALE ++ * max(IPC(i, c) : (i, c)) ++ * ++ * Scheduling schemes that want to use the IPC score along with other ++ * normalized metrics for scheduling (e.g., CPU capacity) may need to normalize ++ * it. ++ * ++ * Other scheduling schemes (e.g., asym_packing) do not need normalization. ++ * ++ * Returns the performance score of an IPC class, @ipcc, when running on @cpu. ++ * Error when either @ipcc or @cpu are invalid. ++ */ ++static __always_inline ++unsigned long arch_get_ipcc_score(unsigned short ipcc, int cpu) ++{ ++ return SCHED_IPCC_SCORE_SCALE; ++} ++#endif ++#else /* CONFIG_IPC_CLASSES */ ++ ++#define arch_get_ipcc_score(ipcc, cpu) (-EINVAL) ++#define arch_update_ipcc(curr) ++ ++static inline bool sched_ipcc_enabled(void) { return false; } ++ ++#endif /* CONFIG_IPC_CLASSES */ ++ + #ifndef arch_scale_freq_capacity + /** + * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 10d1391e741..da49c3c5162 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -677,6 +677,15 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_cluster_active); + ++#ifdef CONFIG_IPC_CLASSES ++DEFINE_STATIC_KEY_FALSE(sched_ipcc); ++ ++void sched_enable_ipc_classes(void) ++{ ++ static_branch_enable_cpuslocked(&sched_ipcc); ++} ++#endif ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 352b161113c..f739cd5912d 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -2089,7 +2089,7 @@ void update_process_times(int user_tick) + if (in_irq()) + irq_work_tick(); + #endif +- scheduler_tick(); ++ scheduler_tick(user_tick); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(); + } +-- +2.44.0 + + +From 6ac91be34077c54e9f7459098aff5b9d183de7f8 Mon Sep 17 00:00:00 2001 +From: Stanislaw Gruszka +Date: Mon, 12 Feb 2024 17:16:13 +0100 +Subject: [PATCH 2/2] genetlink: Add per family bind/unbind callbacks + +Add genetlink family bind()/unbind() callbacks when adding/removing +multicast group to/from netlink client socket via setsockopt() or +bind() syscall. + +They can be used to track if consumers of netlink multicast messages +emerge or disappear. Thus, a client implementing callbacks, can now +send events only when there are active consumers, preventing unnecessary +work when none exist. 
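As a rough usage sketch of the new callbacks (an editorial illustration, not part of this patch: the "example" family name, the single multicast group at index 0, and the listener counter are all hypothetical), a family could count bound listeners and skip building notifications while nobody is subscribed:

    #include <linux/atomic.h>
    #include <net/genetlink.h>

    /* Hypothetical family with one multicast group at index 0. */
    static atomic_t example_listeners = ATOMIC_INIT(0);

    static int example_bind(int mcgrp)
    {
            /* @mcgrp is the group index within this family. */
            if (mcgrp == 0)
                    atomic_inc(&example_listeners);
            return 0;
    }

    static void example_unbind(int mcgrp)
    {
            if (mcgrp == 0)
                    atomic_dec(&example_listeners);
    }

    static struct genl_family example_family = {
            .name    = "example",
            .version = 1,
            .bind    = example_bind,
            .unbind  = example_unbind,
            /* .module, .ops, .mcgrps, etc. omitted for brevity */
    };

The event path would then check atomic_read(&example_listeners) and return early when it is zero, which is the "send events only when there are active consumers" behaviour described above.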
+ +Suggested-by: Jakub Kicinski +Signed-off-by: Stanislaw Gruszka +Reviewed-by: Jiri Pirko +Link: https://lore.kernel.org/r/20240212161615.161935-2-stanislaw.gruszka@linux.intel.com +Signed-off-by: Jakub Kicinski +--- + include/net/genetlink.h | 4 ++++ + net/netlink/genetlink.c | 30 ++++++++++++++++++++++++++++++ + 2 files changed, 34 insertions(+) + +diff --git a/include/net/genetlink.h b/include/net/genetlink.h +index e6146912940..ecadba836ae 100644 +--- a/include/net/genetlink.h ++++ b/include/net/genetlink.h +@@ -41,6 +41,8 @@ struct genl_info; + * do additional, common, filtering and return an error + * @post_doit: called after an operation's doit callback, it may + * undo operations done by pre_doit, for example release locks ++ * @bind: called when family multicast group is added to a netlink socket ++ * @unbind: called when family multicast group is removed from a netlink socket + * @module: pointer to the owning module (set to THIS_MODULE) + * @mcgrps: multicast groups used by this family + * @n_mcgrps: number of multicast groups +@@ -84,6 +86,8 @@ struct genl_family { + void (*post_doit)(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); ++ int (*bind)(int mcgrp); ++ void (*unbind)(int mcgrp); + const struct genl_ops * ops; + const struct genl_small_ops *small_ops; + const struct genl_split_ops *split_ops; +diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c +index 8c7af02f845..50ec599a5cf 100644 +--- a/net/netlink/genetlink.c ++++ b/net/netlink/genetlink.c +@@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group) + !ns_capable(net->user_ns, CAP_SYS_ADMIN)) + ret = -EPERM; + ++ if (family->bind) ++ family->bind(i); ++ + break; + } + +@@ -1843,12 +1846,39 @@ static int genl_bind(struct net *net, int group) + return ret; + } + ++static void genl_unbind(struct net *net, int group) ++{ ++ const struct genl_family *family; ++ unsigned int id; ++ ++ down_read(&cb_lock); ++ ++ idr_for_each_entry(&genl_fam_idr, family, id) { ++ int i; ++ ++ if (family->n_mcgrps == 0) ++ continue; ++ ++ i = group - family->mcgrp_offset; ++ if (i < 0 || i >= family->n_mcgrps) ++ continue; ++ ++ if (family->unbind) ++ family->unbind(i); ++ ++ break; ++ } ++ ++ up_read(&cb_lock); ++} ++ + static int __net_init genl_pernet_init(struct net *net) + { + struct netlink_kernel_cfg cfg = { + .input = genl_rcv, + .flags = NL_CFG_F_NONROOT_RECV, + .bind = genl_bind, ++ .unbind = genl_unbind, + .release = genl_release, + }; + +-- +2.44.0 + +From 68a15ef01803c252261ebb47d86dfc1f2c68ae1e Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 6 Oct 2023 15:58:56 -0700 +Subject: [PATCH] sched/fair: Don't force smt balancing when CPU has spare + capacity + +Currently group_smt_balance is picked whenever there are more +than two tasks on a core with two SMT. However, the utilization +of those tasks may be low and do not warrant a task +migration to a CPU of lower priority. + +Adjust sched group clssification and sibling_imbalance() +to reflect this consideration. Use sibling_imbalance() to +compute imbalance in calculate_imbalance() for the group_smt_balance +case. 
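To make the effect of the hunks below easier to follow, here is a condensed restatement (illustration only, not a separate implementation) of the tail of group_classify() after this change, with the earlier group_overloaded and group_imbalanced checks omitted: an SMT group is reported as group_smt_balance only when it also lacks spare capacity, so lightly loaded SMT groups now fall through to group_has_spare.

    /* Condensed sketch of the reordered checks in group_classify(). */
    if (sgs->group_asym_packing)
            return group_asym_packing;

    if (sgs->group_misfit_task_load)
            return group_misfit_task;

    if (!group_has_capacity(imbalance_pct, sgs))
            return sgs->group_smt_balance ? group_smt_balance
                                          : group_fully_busy;

    return group_has_spare;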
+ +Signed-off-by: Tim Chen + +--- + kernel/sched/fair.c | 23 +++++++++++------------ + 1 file changed, 11 insertions(+), 12 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index ef7490c4b8b4..7dd7c2d2367a 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9460,14 +9460,15 @@ group_type group_classify(unsigned int imbalance_pct, + if (sgs->group_asym_packing) + return group_asym_packing; + +- if (sgs->group_smt_balance) +- return group_smt_balance; +- + if (sgs->group_misfit_task_load) + return group_misfit_task; + +- if (!group_has_capacity(imbalance_pct, sgs)) +- return group_fully_busy; ++ if (!group_has_capacity(imbalance_pct, sgs)) { ++ if (sgs->group_smt_balance) ++ return group_smt_balance; ++ else ++ return group_fully_busy; ++ } + + return group_has_spare; + } +@@ -9573,6 +9574,11 @@ static inline long sibling_imbalance(struct lb_env *env, + if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) + return 0; + ++ /* Do not pull tasks off preferred group with spare capacity */ ++ if (busiest->group_type == group_has_spare && ++ sched_asym_prefer(sds->busiest->asym_prefer_cpu, env->dst_cpu)) ++ return 0; ++ + ncores_busiest = sds->busiest->cores; + ncores_local = sds->local->cores; + +@@ -10411,13 +10417,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + +- if (busiest->group_type == group_smt_balance) { +- /* Reduce number of tasks sharing CPU capacity */ +- env->migration_type = migrate_task; +- env->imbalance = 1; +- return; +- } +- + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +-- +2.32.0 \ No newline at end of file diff --git a/patches/series b/patches/series index e61b72f..50989b1 100644 --- a/patches/series +++ b/patches/series @@ -1,6 +1,8 @@ cachyos/0001-cachyos-base-all.patch cachyos/0001-bore-cachy.patch cachyos/0002-ntsync.patch +cachyos/0003-nvidia.patch +cachyos/0004-intel.patch nobara/0001-Allow-to-set-custom-USB-pollrate-for-specific-device.patch nobara/0001-Revert-PCI-Add-a-REBAR-size-quirk-for-Sapphire-RX-56.patch nobara/0001-Revert-nvme-pci-drop-redundant-pci_enable_pcie_error.patch diff --git a/scripts/source.sh b/scripts/source.sh index 9f46d4b..481f9af 100755 --- a/scripts/source.sh +++ b/scripts/source.sh @@ -2,10 +2,7 @@ echo "Pika Kernel - Getting source" -#wget -nv https://cdn.kernel.org/pub/linux/kernel/v"$(echo $(cat ./VERSION) | cut -f1 -d".")".x/linux-"$(cat ./VERSION)".tar.gz -#tar -xf ./linux-"$(cat ./VERSION)".tar.gz - -wget -nv https://git.kernel.org/torvalds/t/linux-6.8-rc6.tar.gz +wget -nv https://cdn.kernel.org/pub/linux/kernel/v"$(echo $(cat ./VERSION) | cut -f1 -d".")".x/linux-"$(cat ./VERSION)".tar.gz tar -xf ./linux-"$(cat ./VERSION)".tar.gz cd linux-"$(cat ./VERSION)"