From a06ef5a36a19553f48d73428311b241839d53b9c Mon Sep 17 00:00:00 2001 From: Laio Oriel Seman Date: Fri, 8 Mar 2024 11:30:24 -0300 Subject: [PATCH 1/2] ITD --- MAINTAINERS | 1 + arch/x86/include/asm/cpufeatures.h | 2 + arch/x86/include/asm/disabled-features.h | 8 +- arch/x86/include/asm/hfi.h | 85 +++++ arch/x86/include/asm/hreset.h | 30 ++ arch/x86/include/asm/msr-index.h | 12 + arch/x86/include/asm/topology.h | 15 + arch/x86/kernel/Makefile | 2 + arch/x86/kernel/cpu/common.c | 33 +- arch/x86/kernel/cpu/cpuid-deps.c | 1 + arch/x86/kernel/process_32.c | 3 + arch/x86/kernel/process_64.c | 3 + arch/x86/kernel/sched_ipcc.c | 93 +++++ drivers/thermal/intel/Kconfig | 1 + drivers/thermal/intel/intel_hfi.c | 411 ++++++++++++++++++----- drivers/thermal/thermal_netlink.c | 62 +++- drivers/thermal/thermal_netlink.h | 26 ++ include/linux/sched.h | 24 +- include/linux/sched/topology.h | 6 + init/Kconfig | 12 + kernel/sched/core.c | 10 +- kernel/sched/fair.c | 318 +++++++++++++++++- kernel/sched/sched.h | 66 ++++ kernel/sched/topology.c | 9 + kernel/time/timer.c | 2 +- 25 files changed, 1127 insertions(+), 108 deletions(-) create mode 100644 arch/x86/include/asm/hfi.h create mode 100644 arch/x86/include/asm/hreset.h create mode 100644 arch/x86/kernel/sched_ipcc.c diff --git a/MAINTAINERS b/MAINTAINERS index 88b28f85587..9bb09b30526 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21791,6 +21791,7 @@ L: linux-pm@vger.kernel.org S: Supported Q: https://patchwork.kernel.org/project/linux-pm/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git thermal +F: arch/x86/include/asm/hfi.h F: Documentation/ABI/testing/sysfs-class-thermal F: Documentation/admin-guide/thermal/ F: Documentation/devicetree/bindings/thermal/ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2b62cdd8dd1..31b1cea6847 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -326,6 +326,7 @@ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ +#define X86_FEATURE_HRESET (12*32+22) /* Hardware history reset instruction */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ @@ -360,6 +361,7 @@ #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ #define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ +#define X86_FEATURE_ITD (14*32+23) /* Intel Thread Director */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ #define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 702d93fdd10..f4aa34cfd20 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -117,6 +117,12 @@ #define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) #endif +#ifdef CONFIG_IPC_CLASSES +# define DISABLE_ITD 0 +#else +# define DISABLE_ITD (1 << (X86_FEATURE_ITD & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -135,7 +141,7 @@ DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) #define DISABLED_MASK12 (DISABLE_LAM) #define DISABLED_MASK13 0 -#define DISABLED_MASK14 0 +#define DISABLED_MASK14 (DISABLE_ITD) #define DISABLED_MASK15 0 #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ DISABLE_ENQCMD) diff --git a/arch/x86/include/asm/hfi.h b/arch/x86/include/asm/hfi.h new file mode 100644 index 00000000000..b7fda3e0e8c --- /dev/null +++ b/arch/x86/include/asm/hfi.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_HFI_H +#define _ASM_X86_HFI_H + +/* CPUID detection and enumeration definitions for HFI */ + +union hfi_capabilities { + struct { + u8 performance:1; + u8 energy_efficiency:1; + u8 __reserved:6; + } split; + u8 bits; +}; + +union cpuid6_edx { + struct { + union hfi_capabilities capabilities; + u32 table_pages:4; + u32 __reserved:4; + s32 index:16; + } split; + u32 full; +}; + +union cpuid6_ecx { + struct { + u32 dont_care0:8; + u32 nr_classes:8; + u32 dont_care1:16; + } split; + u32 full; +}; + +/** + * struct hfi_hdr - Header of the HFI table + * @perf_updated: Hardware updated performance capabilities + * @ee_updated: Hardware updated energy efficiency capabilities + * + * Properties of the data in an HFI table. There exists one header per each + * HFI class. + */ +struct hfi_hdr { + u8 perf_updated; + u8 ee_updated; +} __packed; + +/** + * struct hfi_table - Representation of an HFI table + * @base_addr: Base address of the local copy of the HFI table + * @timestamp: Timestamp of the last update of the local table. + * Located at the base of the local table. 
+ * @hdr: Base address of the header of the local table + * @data: Base address of the data of the local table + */ +struct hfi_table { + union { + void *base_addr; + u64 *timestamp; + }; + void *hdr; + void *data; +}; + +/** + * struct hfi_features - Supported HFI features + * @nr_classes: Number of classes supported + * @nr_table_pages: Size of the HFI table in 4KB pages + * @cpu_stride: Stride size to locate the capability data of a logical + * processor within the table (i.e., row stride) + * @class_stride: Stride size to locate a class within the capability + * data of a logical processor or the HFI table header + * @hdr_size: Size of the table header + * + * Parameters and supported features that are common to all HFI instances + */ +struct hfi_features { + unsigned int nr_classes; + size_t nr_table_pages; + unsigned int cpu_stride; + unsigned int class_stride; + unsigned int hdr_size; +}; + +#endif /* _ASM_X86_HFI_H */ diff --git a/arch/x86/include/asm/hreset.h b/arch/x86/include/asm/hreset.h new file mode 100644 index 00000000000..d68ca2fb864 --- /dev/null +++ b/arch/x86/include/asm/hreset.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_HRESET_H + +/** + * HRESET - History reset. Available since binutils v2.36. + * + * Request the processor to reset the history of task classification on the + * current logical processor. The history components to be + * reset are specified in %eax. Only bits specified in CPUID(0x20).EBX + * and enabled in the IA32_HRESET_ENABLE MSR can be selected. + * + * The assembly code looks like: + * + * hreset %eax + * + * The corresponding machine code looks like: + * + * F3 0F 3A F0 ModRM Imm + * + * The value of ModRM is 0xc0 to specify %eax register addressing. + * The ignored immediate operand is set to 0. + * + * The instruction is documented in the Intel SDM. 
+ */ + +#define __ASM_HRESET ".byte 0xf3, 0xf, 0x3a, 0xf0, 0xc0, 0x0" + +void reset_hardware_history(void); + +#endif /* _ASM_X86_HRESET_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f1bd7b91b3c..f334c19b028 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1143,7 +1143,19 @@ /* Hardware Feedback Interface */ #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 +#define HW_FEEDBACK_PTR_VALID BIT_ULL(0) +#define HW_FEEDBACK_PTR_RESERVED_MASK GENMASK_ULL(11, 1) + #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 +#define MSR_IA32_HW_FEEDBACK_THREAD_CONFIG 0x17d4 +#define MSR_IA32_HW_FEEDBACK_CHAR 0x17d2 + +/* Hardware History Reset */ +#define MSR_IA32_HW_HRESET_ENABLE 0x17da + +#define HW_FEEDBACK_CONFIG_HFI_ENABLE BIT_ULL(0) +#define HW_FEEDBACK_CONFIG_ITD_ENABLE BIT_ULL(1) +#define HW_FEEDBACK_THREAD_CONFIG_ENABLE BIT_ULL(0) /* x2APIC locked status */ #define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 5f87f6b9b09..29fc06efcb6 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -235,4 +235,19 @@ void init_freq_invariance_cppc(void); #define arch_init_invariance_cppc init_freq_invariance_cppc #endif +#ifdef CONFIG_INTEL_HFI_THERMAL +int intel_hfi_read_classid(u8 *classid); +unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu); +#else +static inline int intel_hfi_read_classid(u8 *classid) { return -ENODEV; } +static inline unsigned long +intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu) { return -ENODEV; } +#endif + +#ifdef CONFIG_IPC_CLASSES +void intel_update_ipcc(struct task_struct *curr); +#define arch_update_ipcc intel_update_ipcc +#define arch_get_ipcc_score intel_hfi_get_ipcc_score +#endif + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0000325ab98..9bc7319175d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -150,6 +150,8 @@ obj-$(CONFIG_X86_CET) += cet.o obj-$(CONFIG_X86_USER_SHADOW_STACK) += shstk.o +obj-$(CONFIG_IPC_CLASSES) += sched_ipcc.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fbc4e60d027..99ebd403fe4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -381,6 +382,35 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) cr4_clear_bits(X86_CR4_UMIP); } +static u32 hardware_history_features __ro_after_init; + + +void reset_hardware_history(void) +{ + asm_inline volatile (ALTERNATIVE("", __ASM_HRESET, X86_FEATURE_HRESET) + : : "a" (hardware_history_features) : "memory"); +} + +EXPORT_SYMBOL(reset_hardware_history); + +static __always_inline void setup_hreset(struct cpuinfo_x86 *c) +{ + if (!cpu_feature_enabled(X86_FEATURE_HRESET)) + return; + + /* + * Use on all CPUs the hardware history features that the boot + * CPU supports. + */ + if (c == &boot_cpu_data) + hardware_history_features = cpuid_ebx(0x20); + + if (!hardware_history_features) + return; + + wrmsrl(MSR_IA32_HW_HRESET_ENABLE, hardware_history_features); +} + /* These bits should not change their value after CPU init is finished. 
*/ static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | @@ -1872,10 +1902,11 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); - /* Set up SMEP/SMAP/UMIP */ + /* Set up SMEP/SMAP/UMIP/HRESET */ setup_smep(c); setup_smap(c); setup_umip(c); + setup_hreset(c); /* Enable FSGSBASE instructions if available. */ if (cpu_has(c, X86_FEATURE_FSGSBASE)) { diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index e462c1d3800..db62700cdac 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -81,6 +81,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_ITD, X86_FEATURE_HFI }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, {} }; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 708c87b88cc..7353bb119e7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include "process.h" @@ -214,6 +215,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(next_p); + reset_hardware_history(); + return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 33b268747bb..202a6735c09 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_IA32_EMULATION @@ -661,6 +662,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(next_p); + reset_hardware_history(); + return prev_p; } diff --git a/arch/x86/kernel/sched_ipcc.c b/arch/x86/kernel/sched_ipcc.c new file mode 100644 index 00000000000..dd73fc8be49 --- /dev/null +++ b/arch/x86/kernel/sched_ipcc.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Intel support for scheduler IPC classes + * + * Copyright (c) 2023, Intel Corporation. + * + * Author: Ricardo Neri + * + * On hybrid processors, the architecture differences between types of CPUs + * lead to different number of retired instructions per cycle (IPC). IPCs may + * differ further by classes of instructions. + * + * The scheduler assigns an IPC class to every task with arch_update_ipcc() + * from data that hardware provides. Implement this interface for x86. + * + * See kernel/sched/sched.h for details. + */ + +#include + +#include +#include + +#define CLASS_DEBOUNCER_SKIPS 4 + +/** + * debounce_and_update_class() - Process and update a task's classification + * + * @p: The task of which the classification will be updated + * @new_ipcc: The new IPC classification + * + * Update the classification of @p with the new value that hardware provides. + * Only update the classification of @p if it has been the same during + * CLASS_DEBOUNCER_SKIPS consecutive ticks. + */ +static void debounce_and_update_class(struct task_struct *p, u8 new_ipcc) +{ + u16 debounce_skip; + + /* The class of @p changed. Only restart the debounce counter. */ + if (p->ipcc_tmp != new_ipcc) { + p->ipcc_cntr = 1; + goto out; + } + + /* + * The class of @p did not change. Update it if it has been the same + * for CLASS_DEBOUNCER_SKIPS user ticks. 
+ */ + debounce_skip = p->ipcc_cntr + 1; + if (debounce_skip < CLASS_DEBOUNCER_SKIPS) + p->ipcc_cntr++; + else + p->ipcc = new_ipcc; + +out: + p->ipcc_tmp = new_ipcc; +} + +static bool classification_is_accurate(u8 hfi_class, bool smt_siblings_idle) +{ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_ALDERLAKE: + case INTEL_FAM6_ALDERLAKE_L: + case INTEL_FAM6_RAPTORLAKE: + case INTEL_FAM6_RAPTORLAKE_P: + case INTEL_FAM6_RAPTORLAKE_S: + if (hfi_class == 3 || hfi_class == 2 || smt_siblings_idle) + return true; + + return false; + + default: + return false; + } +} + +void intel_update_ipcc(struct task_struct *curr) +{ + u8 hfi_class; + bool idle; + + if (intel_hfi_read_classid(&hfi_class)) + return; + + /* + * 0 is a valid classification for Intel Thread Director. A scheduler + * IPCC class of 0 means that the task is unclassified. Adjust. + */ + idle = sched_smt_siblings_idle(task_cpu(curr)); + if (classification_is_accurate(hfi_class, idle)) + debounce_and_update_class(curr, hfi_class + 1); +} diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig index b43953b5539..03da183ff99 100644 --- a/drivers/thermal/intel/Kconfig +++ b/drivers/thermal/intel/Kconfig @@ -109,6 +109,7 @@ config INTEL_HFI_THERMAL depends on CPU_SUP_INTEL depends on X86_THERMAL_VECTOR select THERMAL_NETLINK + select IPC_CLASSES help Select this option to enable the Hardware Feedback Interface. If selected, hardware provides guidance to the operating system on diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c index 3b04c6ec4fc..b791906914b 100644 --- a/drivers/thermal/intel/intel_hfi.c +++ b/drivers/thermal/intel/intel_hfi.c @@ -30,9 +30,12 @@ #include #include #include +#include #include #include #include +#include +#include #include #include #include @@ -41,6 +44,7 @@ #include #include +#include #include #include "intel_hfi.h" @@ -48,32 +52,20 @@ #include "../thermal_netlink.h" -/* Hardware Feedback Interface MSR configuration bits */ -#define HW_FEEDBACK_PTR_VALID_BIT BIT(0) -#define HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT BIT(0) /* CPUID detection and enumeration definitions for HFI */ #define CPUID_HFI_LEAF 6 -union hfi_capabilities { +union hfi_thread_feedback_char_msr { struct { - u8 performance:1; - u8 energy_efficiency:1; - u8 __reserved:6; + u64 classid : 8; + u64 __reserved : 55; + u64 valid : 1; } split; - u8 bits; + u64 full; }; -union cpuid6_edx { - struct { - union hfi_capabilities capabilities; - u32 table_pages:4; - u32 __reserved:4; - s32 index:16; - } split; - u32 full; -}; /** * struct hfi_cpu_data - HFI capabilities per CPU @@ -81,32 +73,17 @@ union cpuid6_edx { * @ee_cap: Energy efficiency capability * * Capabilities of a logical processor in the HFI table. These capabilities are - * unitless. + * unitless and specific to each HFI class. */ struct hfi_cpu_data { u8 perf_cap; u8 ee_cap; } __packed; -/** - * struct hfi_hdr - Header of the HFI table - * @perf_updated: Hardware updated performance capabilities - * @ee_updated: Hardware updated energy efficiency capabilities - * - * Properties of the data in an HFI table. - */ -struct hfi_hdr { - u8 perf_updated; - u8 ee_updated; -} __packed; /** * struct hfi_instance - Representation of an HFI instance (i.e., a table) - * @local_table: Base of the local copy of the HFI table - * @timestamp: Timestamp of the last update of the local table. - * Located at the base of the local table. 
- * @hdr: Base address of the header of the local table - * @data: Base address of the data of the local table + * @local_table: Local copy of HFI table for this instance * @cpus: CPUs represented in this HFI table instance * @hw_table: Pointer to the HFI table of this instance * @update_work: Delayed work to process HFI updates @@ -116,12 +93,7 @@ struct hfi_hdr { * A set of parameters to parse and navigate a specific HFI table. */ struct hfi_instance { - union { - void *local_table; - u64 *timestamp; - }; - void *hdr; - void *data; + struct hfi_table local_table; cpumask_var_t cpus; void *hw_table; struct delayed_work update_work; @@ -129,20 +101,6 @@ struct hfi_instance { raw_spinlock_t event_lock; }; -/** - * struct hfi_features - Supported HFI features - * @nr_table_pages: Size of the HFI table in 4KB pages - * @cpu_stride: Stride size to locate the capability data of a logical - * processor within the table (i.e., row stride) - * @hdr_size: Size of the table header - * - * Parameters and supported features that are common to all HFI instances - */ -struct hfi_features { - size_t nr_table_pages; - unsigned int cpu_stride; - unsigned int hdr_size; -}; /** * struct hfi_cpu_info - Per-CPU attributes to consume HFI data @@ -159,6 +117,7 @@ struct hfi_cpu_info { static DEFINE_PER_CPU(struct hfi_cpu_info, hfi_cpu_info) = { .index = -1 }; static int max_hfi_instances; +static int hfi_clients_nr; static struct hfi_instance *hfi_instances; static struct hfi_features hfi_features; @@ -168,6 +127,139 @@ static struct workqueue_struct *hfi_updates_wq; #define HFI_UPDATE_INTERVAL HZ #define HFI_MAX_THERM_NOTIFY_COUNT 16 +/* + * A task may be unclassified if it has been recently created, spend most of + * its lifetime sleeping, or hardware has not provided a classification. + * + * Most tasks will be classified as scheduler's IPC class 1 (HFI class 0) + * eventually. Meanwhile, the scheduler will place classes of tasks with higher + * IPC scores on higher-performance CPUs. + * + * IPC class 1 is a reasonable choice. It matches the performance capability + * of the legacy, classless, HFI table. + */ +#define HFI_UNCLASSIFIED_DEFAULT 1 + +/* A cache of the HFI perf capabilities for lockless access. */ +static int __percpu *hfi_ipcc_scores; +/* Sequence counter for hfi_ipcc_scores */ +static seqcount_t hfi_ipcc_seqcount = SEQCNT_ZERO(hfi_ipcc_seqcount); + +static int alloc_hfi_ipcc_scores(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_ITD)) + return 0; + + hfi_ipcc_scores = __alloc_percpu(sizeof(*hfi_ipcc_scores) * + hfi_features.nr_classes, + sizeof(*hfi_ipcc_scores)); + + return hfi_ipcc_scores ? 0 : -ENOMEM; +} + +unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu) +{ + int *scores, score; + unsigned long seq; + + scores = per_cpu_ptr(hfi_ipcc_scores, cpu); + if (!scores) + return -ENODEV; + + if (cpu < 0 || cpu >= nr_cpu_ids) + return -EINVAL; + + if (ipcc == IPC_CLASS_UNCLASSIFIED) + ipcc = HFI_UNCLASSIFIED_DEFAULT; + + /* + * Scheduler IPC classes start at 1. HFI classes start at 0. + * See note intel_hfi_update_ipcc(). + */ + if (ipcc >= hfi_features.nr_classes + 1) + return -EINVAL; + + /* + * The seqcount implies load-acquire semantics to order loads with + * lockless stores of the write side in set_hfi_ipcc_score(). It + * also implies a compiler barrier. + */ + do { + seq = read_seqcount_begin(&hfi_ipcc_seqcount); + /* @ipcc is never 0. 
*/ + score = scores[ipcc - 1]; + } while (read_seqcount_retry(&hfi_ipcc_seqcount, seq)); + + return score; +} + +static void set_hfi_ipcc_scores(struct hfi_instance *hfi_instance) +{ + int cpu; + + if (!cpu_feature_enabled(X86_FEATURE_ITD)) + return; + + /* + * Serialize with writes to the HFI table. It also protects the write + * loop against seqcount readers running in interrupt context. + */ + raw_spin_lock_irq(&hfi_instance->table_lock); + /* + * The seqcount implies store-release semantics to order stores with + * lockless loads from the seqcount read side in + * intel_hfi_get_ipcc_score(). It also implies a compiler barrier. + */ + write_seqcount_begin(&hfi_ipcc_seqcount); + for_each_cpu(cpu, hfi_instance->cpus) { + int c, *scores; + s16 index; + + index = per_cpu(hfi_cpu_info, cpu).index; + scores = per_cpu_ptr(hfi_ipcc_scores, cpu); + + for (c = 0; c < hfi_features.nr_classes; c++) { + struct hfi_cpu_data *caps; + + caps = hfi_instance->local_table.data + + index * hfi_features.cpu_stride + + c * hfi_features.class_stride; + scores[c] = caps->perf_cap; + } + } + + write_seqcount_end(&hfi_ipcc_seqcount); + raw_spin_unlock_irq(&hfi_instance->table_lock); +} + +/** + * intel_hfi_read_classid() - Read the currrent classid + * @classid: Variable to which the classid will be written. + * + * Read the classification that Intel Thread Director has produced when this + * function is called. Thread classification must be enabled before calling + * this function. + * + * Return: 0 if the produced classification is valid. Error otherwise. + */ +int intel_hfi_read_classid(u8 *classid) +{ + union hfi_thread_feedback_char_msr msr; + + /* We should not be here if ITD is not supported. */ + if (!cpu_feature_enabled(X86_FEATURE_ITD)) { + pr_warn_once("task classification requested but not supported!"); + return -ENODEV; + } + + rdmsrl(MSR_IA32_HW_FEEDBACK_CHAR, msr.full); + if (!msr.split.valid) + return -EINVAL; + + *classid = msr.split.classid; + return 0; +} + static void get_hfi_caps(struct hfi_instance *hfi_instance, struct thermal_genl_cpu_caps *cpu_caps) { @@ -179,7 +271,7 @@ static void get_hfi_caps(struct hfi_instance *hfi_instance, s16 index; index = per_cpu(hfi_cpu_info, cpu).index; - caps = hfi_instance->data + index * hfi_features.cpu_stride; + caps = hfi_instance->local_table.data + index * hfi_features.cpu_stride; cpu_caps[i].cpu = cpu; /* @@ -235,6 +327,8 @@ static void update_capabilities(struct hfi_instance *hfi_instance) thermal_genl_cpu_capability_event(cpu_count, &cpu_caps[i]); kfree(cpu_caps); + + set_hfi_ipcc_scores(hfi_instance); out: mutex_unlock(&hfi_instance_lock); } @@ -296,7 +390,7 @@ void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) * where a lagging CPU entered the locked region. */ new_timestamp = *(u64 *)hfi_instance->hw_table; - if (*hfi_instance->timestamp == new_timestamp) { + if (*hfi_instance->local_table.timestamp == new_timestamp) { thermal_clear_package_intr_status(PACKAGE_LEVEL, PACKAGE_THERM_STATUS_HFI_UPDATED); raw_spin_unlock(&hfi_instance->event_lock); return; @@ -308,7 +402,7 @@ void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) * Copy the updated table into our local copy. This includes the new * timestamp. 
*/ - memcpy(hfi_instance->local_table, hfi_instance->hw_table, + memcpy(hfi_instance->local_table.base_addr, hfi_instance->hw_table, hfi_features.nr_table_pages << PAGE_SHIFT); /* @@ -337,17 +431,18 @@ static void init_hfi_cpu_index(struct hfi_cpu_info *info) } /* - * The format of the HFI table depends on the number of capabilities that the - * hardware supports. Keep a data structure to navigate the table. + * The format of the HFI table depends on the number of capabilities and classes + * that the hardware supports. Keep a data structure to navigate the table. */ static void init_hfi_instance(struct hfi_instance *hfi_instance) { /* The HFI header is below the time-stamp. */ - hfi_instance->hdr = hfi_instance->local_table + - sizeof(*hfi_instance->timestamp); + hfi_instance->local_table.hdr = hfi_instance->local_table.base_addr + + sizeof(*hfi_instance->local_table.timestamp); /* The HFI data starts below the header. */ - hfi_instance->data = hfi_instance->hdr + hfi_features.hdr_size; + hfi_instance->local_table.data = hfi_instance->local_table.hdr + + hfi_features.hdr_size; } /* Caller must hold hfi_instance_lock. */ @@ -356,8 +451,13 @@ static void hfi_enable(void) u64 msr_val; rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); - msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT; + msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE; + + if (cpu_feature_enabled(X86_FEATURE_ITD)) + msr_val |= HW_FEEDBACK_CONFIG_ITD_ENABLE; + wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); + } static void hfi_set_hw_table(struct hfi_instance *hfi_instance) @@ -366,7 +466,7 @@ static void hfi_set_hw_table(struct hfi_instance *hfi_instance) u64 msr_val; hw_table_pa = virt_to_phys(hfi_instance->hw_table); - msr_val = hw_table_pa | HW_FEEDBACK_PTR_VALID_BIT; + msr_val = hw_table_pa | HW_FEEDBACK_PTR_VALID; wrmsrl(MSR_IA32_HW_FEEDBACK_PTR, msr_val); } @@ -377,7 +477,11 @@ static void hfi_disable(void) int i; rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); - msr_val &= ~HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT; + msr_val &= ~HW_FEEDBACK_CONFIG_HFI_ENABLE; + + if (cpu_feature_enabled(X86_FEATURE_ITD)) + msr_val &= ~HW_FEEDBACK_CONFIG_ITD_ENABLE; + wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); /* @@ -396,6 +500,30 @@ static void hfi_disable(void) } } +static void hfi_enable_itd_classification(void) +{ + u64 msr_val; + + if (!cpu_feature_enabled(X86_FEATURE_ITD)) + return; + + rdmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val); + msr_val |= HW_FEEDBACK_THREAD_CONFIG_ENABLE; + wrmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val); +} + +static void hfi_disable_itd_classification(void) +{ + u64 msr_val; + + if (!cpu_feature_enabled(X86_FEATURE_ITD)) + return; + + rdmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val); + msr_val &= ~HW_FEEDBACK_THREAD_CONFIG_ENABLE; + wrmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val); +} + /** * intel_hfi_online() - Enable HFI on @cpu * @cpu: CPU in which the HFI will be enabled @@ -436,6 +564,8 @@ void intel_hfi_online(unsigned int cpu) init_hfi_cpu_index(info); + hfi_enable_itd_classification(); + /* * Now check if the HFI instance of the package/die of @cpu has been * initialized (by checking its header). In such case, all we have to @@ -443,7 +573,7 @@ void intel_hfi_online(unsigned int cpu) * if needed. */ mutex_lock(&hfi_instance_lock); - if (hfi_instance->hdr) + if (hfi_instance->local_table.hdr) goto enable; /* @@ -463,9 +593,9 @@ void intel_hfi_online(unsigned int cpu) * Allocate memory to keep a local copy of the table that * hardware generates. 
*/ - hfi_instance->local_table = kzalloc(hfi_features.nr_table_pages << PAGE_SHIFT, - GFP_KERNEL); - if (!hfi_instance->local_table) + hfi_instance->local_table.base_addr = kzalloc(hfi_features.nr_table_pages << PAGE_SHIFT, + GFP_KERNEL); + if (!hfi_instance->local_table.base_addr) goto free_hw_table; init_hfi_instance(hfi_instance); @@ -477,11 +607,23 @@ void intel_hfi_online(unsigned int cpu) enable: cpumask_set_cpu(cpu, hfi_instance->cpus); - /* Enable this HFI instance if this is its first online CPU. */ - if (cpumask_weight(hfi_instance->cpus) == 1) { + /* + * Enable this HFI instance if this is its first online CPU and + * there are user-space clients of thermal events. + */ + if (cpumask_weight(hfi_instance->cpus) == 1 && hfi_clients_nr > 0) { hfi_set_hw_table(hfi_instance); hfi_enable(); } + /* + * We have all we need to support IPC classes. Task classification is + * now working. + * + * All class scores are zero until after the first HFI update. That is + * OK. The scheduler queries these scores at every load balance. + */ + if (cpu_feature_enabled(X86_FEATURE_ITD)) + sched_enable_ipc_classes(); unlock: mutex_unlock(&hfi_instance_lock); @@ -516,9 +658,11 @@ void intel_hfi_offline(unsigned int cpu) if (!hfi_instance) return; - if (!hfi_instance->hdr) + if (!hfi_instance->local_table.hdr) return; + hfi_disable_itd_classification(); + mutex_lock(&hfi_instance_lock); cpumask_clear_cpu(cpu, hfi_instance->cpus); @@ -557,44 +701,133 @@ static __init int hfi_parse_features(void) /* The number of 4KB pages required by the table */ hfi_features.nr_table_pages = edx.split.table_pages + 1; + /* + * Capability fields of an HFI class are grouped together. Classes are + * contiguous in memory. Hence, use the number of supported features to + * locate a specific class. + */ + hfi_features.class_stride = nr_capabilities; + + if (cpu_feature_enabled(X86_FEATURE_ITD)) { + union cpuid6_ecx ecx; + + ecx.full = cpuid_ecx(CPUID_HFI_LEAF); + hfi_features.nr_classes = ecx.split.nr_classes; + } else { + hfi_features.nr_classes = 1; + } + /* * The header contains change indications for each supported feature. * The size of the table header is rounded up to be a multiple of 8 * bytes. */ - hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities, 8) * 8; + hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities * + hfi_features.nr_classes, 8) * 8; /* * Data of each logical processor is also rounded up to be a multiple * of 8 bytes. */ - hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities, 8) * 8; + hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities * + hfi_features.nr_classes, 8) * 8; return 0; } -static void hfi_do_enable(void) +/* + * If concurrency is not prevented by other means, the HFI enable/disable + * routines must be called under hfi_instance_lock." + */ +static void hfi_enable_instance(void *ptr) +{ + hfi_set_hw_table(ptr); + hfi_enable(); +} + +static void hfi_disable_instance(void *ptr) +{ + hfi_disable(); +} + +static void hfi_syscore_resume(void) { /* This code runs only on the boot CPU. */ struct hfi_cpu_info *info = &per_cpu(hfi_cpu_info, 0); struct hfi_instance *hfi_instance = info->hfi_instance; /* No locking needed. There is no concurrency with CPU online. */ - hfi_set_hw_table(hfi_instance); - hfi_enable(); + if (hfi_clients_nr > 0) { + hfi_set_hw_table(hfi_instance); + hfi_enable_instance(hfi_instance); + hfi_enable_itd_classification(); + } } -static int hfi_do_disable(void) +static int hfi_syscore_suspend(void) { /* No locking needed. There is no concurrency with CPU offline. 
*/ + + hfi_disable_itd_classification(); + hfi_disable(); return 0; } static struct syscore_ops hfi_pm_ops = { - .resume = hfi_do_enable, - .suspend = hfi_do_disable, + .resume = hfi_syscore_resume, + .suspend = hfi_syscore_suspend, +}; + +static int hfi_thermal_notify(struct notifier_block *nb, unsigned long state, + void *_notify) +{ + struct thermal_genl_notify *notify = _notify; + struct hfi_instance *hfi_instance; + smp_call_func_t func = NULL; + unsigned int cpu; + int i; + + if (notify->mcgrp != THERMAL_GENL_EVENT_GROUP) + return NOTIFY_DONE; + + if (state != THERMAL_NOTIFY_BIND && state != THERMAL_NOTIFY_UNBIND) + return NOTIFY_DONE; + + mutex_lock(&hfi_instance_lock); + + switch (state) { + case THERMAL_NOTIFY_BIND: + if (++hfi_clients_nr == 1) + func = hfi_enable_instance; + break; + case THERMAL_NOTIFY_UNBIND: + if (--hfi_clients_nr == 0) + func = hfi_disable_instance; + break; + } + + if (!func) + goto out; + + for (i = 0; i < max_hfi_instances; i++) { + hfi_instance = &hfi_instances[i]; + if (cpumask_empty(hfi_instance->cpus)) + continue; + + cpu = cpumask_any(hfi_instance->cpus); + smp_call_function_single(cpu, func, hfi_instance, true); + } + +out: + mutex_unlock(&hfi_instance_lock); + + return NOTIFY_OK; +} + +static struct notifier_block hfi_thermal_nb = { + .notifier_call = hfi_thermal_notify, }; void __init intel_hfi_init(void) @@ -628,10 +861,28 @@ void __init intel_hfi_init(void) if (!hfi_updates_wq) goto err_nomem; + /* + * Both thermal core and Intel HFI can not be build as modules. + * As kernel build-in drivers they are initialized before user-space + * starts, hence we can not miss BIND/UNBIND events when applications + * add/remove thermal multicast group to/from a netlink socket. + */ + if (thermal_genl_register_notifier(&hfi_thermal_nb)) + goto err_nl_notif; + register_syscore_ops(&hfi_pm_ops); + if (alloc_hfi_ipcc_scores()) + goto err_ipcc; + return; +err_nl_notif: + destroy_workqueue(hfi_updates_wq); + +err_ipcc: + destroy_workqueue(hfi_updates_wq); + err_nomem: for (j = 0; j < i; ++j) { hfi_instance = &hfi_instances[j]; diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c index 76a231a2965..bef14ce69ec 100644 --- a/drivers/thermal/thermal_netlink.c +++ b/drivers/thermal/thermal_netlink.c @@ -7,17 +7,13 @@ * Generic netlink for thermal management framework */ #include +#include #include #include #include #include "thermal_core.h" -enum thermal_genl_multicast_groups { - THERMAL_GENL_SAMPLING_GROUP = 0, - THERMAL_GENL_EVENT_GROUP = 1, -}; - static const struct genl_multicast_group thermal_genl_mcgrps[] = { [THERMAL_GENL_SAMPLING_GROUP] = { .name = THERMAL_GENL_SAMPLING_GROUP_NAME, }, [THERMAL_GENL_EVENT_GROUP] = { .name = THERMAL_GENL_EVENT_GROUP_NAME, }, @@ -74,11 +70,12 @@ struct param { typedef int (*cb_t)(struct param *); -static struct genl_family thermal_gnl_family; +static struct genl_family thermal_genl_family; +static BLOCKING_NOTIFIER_HEAD(thermal_genl_chain); static int thermal_group_has_listeners(enum thermal_genl_multicast_groups group) { - return genl_has_listeners(&thermal_gnl_family, &init_net, group); + return genl_has_listeners(&thermal_genl_family, &init_net, group); } /************************** Sampling encoding *******************************/ @@ -95,7 +92,7 @@ int thermal_genl_sampling_temp(int id, int temp) if (!skb) return -ENOMEM; - hdr = genlmsg_put(skb, 0, 0, &thermal_gnl_family, 0, + hdr = genlmsg_put(skb, 0, 0, &thermal_genl_family, 0, THERMAL_GENL_SAMPLING_TEMP); if (!hdr) goto out_free; @@ -108,7 
+105,7 @@ int thermal_genl_sampling_temp(int id, int temp) genlmsg_end(skb, hdr); - genlmsg_multicast(&thermal_gnl_family, skb, 0, THERMAL_GENL_SAMPLING_GROUP, GFP_KERNEL); + genlmsg_multicast(&thermal_genl_family, skb, 0, THERMAL_GENL_SAMPLING_GROUP, GFP_KERNEL); return 0; out_cancel: @@ -282,7 +279,7 @@ static int thermal_genl_send_event(enum thermal_genl_event event, return -ENOMEM; p->msg = msg; - hdr = genlmsg_put(msg, 0, 0, &thermal_gnl_family, 0, event); + hdr = genlmsg_put(msg, 0, 0, &thermal_genl_family, 0, event); if (!hdr) goto out_free_msg; @@ -292,7 +289,7 @@ static int thermal_genl_send_event(enum thermal_genl_event event, genlmsg_end(msg, hdr); - genlmsg_multicast(&thermal_gnl_family, msg, 0, THERMAL_GENL_EVENT_GROUP, GFP_KERNEL); + genlmsg_multicast(&thermal_genl_family, msg, 0, THERMAL_GENL_EVENT_GROUP, GFP_KERNEL); return 0; @@ -593,7 +590,7 @@ static int thermal_genl_cmd_dumpit(struct sk_buff *skb, int ret; void *hdr; - hdr = genlmsg_put(skb, 0, 0, &thermal_gnl_family, 0, cmd); + hdr = genlmsg_put(skb, 0, 0, &thermal_genl_family, 0, cmd); if (!hdr) return -EMSGSIZE; @@ -625,7 +622,7 @@ static int thermal_genl_cmd_doit(struct sk_buff *skb, return -ENOMEM; p.msg = msg; - hdr = genlmsg_put_reply(msg, info, &thermal_gnl_family, 0, cmd); + hdr = genlmsg_put_reply(msg, info, &thermal_genl_family, 0, cmd); if (!hdr) goto out_free_msg; @@ -645,6 +642,27 @@ static int thermal_genl_cmd_doit(struct sk_buff *skb, return ret; } +static int thermal_genl_bind(int mcgrp) +{ + struct thermal_genl_notify n = { .mcgrp = mcgrp }; + + if (WARN_ON_ONCE(mcgrp > THERMAL_GENL_MAX_GROUP)) + return -EINVAL; + + blocking_notifier_call_chain(&thermal_genl_chain, THERMAL_NOTIFY_BIND, &n); + return 0; +} + +static void thermal_genl_unbind(int mcgrp) +{ + struct thermal_genl_notify n = { .mcgrp = mcgrp }; + + if (WARN_ON_ONCE(mcgrp > THERMAL_GENL_MAX_GROUP)) + return; + + blocking_notifier_call_chain(&thermal_genl_chain, THERMAL_NOTIFY_UNBIND, &n); +} + static const struct genl_small_ops thermal_genl_ops[] = { { .cmd = THERMAL_GENL_CMD_TZ_GET_ID, @@ -673,12 +691,14 @@ static const struct genl_small_ops thermal_genl_ops[] = { }, }; -static struct genl_family thermal_gnl_family __ro_after_init = { +static struct genl_family thermal_genl_family __ro_after_init = { .hdrsize = 0, .name = THERMAL_GENL_FAMILY_NAME, .version = THERMAL_GENL_VERSION, .maxattr = THERMAL_GENL_ATTR_MAX, .policy = thermal_genl_policy, + .bind = thermal_genl_bind, + .unbind = thermal_genl_unbind, .small_ops = thermal_genl_ops, .n_small_ops = ARRAY_SIZE(thermal_genl_ops), .resv_start_op = THERMAL_GENL_CMD_CDEV_GET + 1, @@ -686,12 +706,22 @@ static struct genl_family thermal_gnl_family __ro_after_init = { .n_mcgrps = ARRAY_SIZE(thermal_genl_mcgrps), }; +int thermal_genl_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&thermal_genl_chain, nb); +} + +int thermal_genl_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&thermal_genl_chain, nb); +} + int __init thermal_netlink_init(void) { - return genl_register_family(&thermal_gnl_family); + return genl_register_family(&thermal_genl_family); } void __init thermal_netlink_exit(void) { - genl_unregister_family(&thermal_gnl_family); + genl_unregister_family(&thermal_genl_family); } diff --git a/drivers/thermal/thermal_netlink.h b/drivers/thermal/thermal_netlink.h index 93a927e144d..e01221e8816 100644 --- a/drivers/thermal/thermal_netlink.h +++ b/drivers/thermal/thermal_netlink.h @@ -10,6 +10,19 @@ struct 
thermal_genl_cpu_caps { int efficiency; }; +enum thermal_genl_multicast_groups { + THERMAL_GENL_SAMPLING_GROUP = 0, + THERMAL_GENL_EVENT_GROUP = 1, + THERMAL_GENL_MAX_GROUP = THERMAL_GENL_EVENT_GROUP, +}; + +#define THERMAL_NOTIFY_BIND 0 +#define THERMAL_NOTIFY_UNBIND 1 + +struct thermal_genl_notify { + int mcgrp; +}; + struct thermal_zone_device; struct thermal_trip; struct thermal_cooling_device; @@ -18,6 +31,9 @@ struct thermal_cooling_device; #ifdef CONFIG_THERMAL_NETLINK int __init thermal_netlink_init(void); void __init thermal_netlink_exit(void); +int thermal_genl_register_notifier(struct notifier_block *nb); +int thermal_genl_unregister_notifier(struct notifier_block *nb); + int thermal_notify_tz_create(const struct thermal_zone_device *tz); int thermal_notify_tz_delete(const struct thermal_zone_device *tz); int thermal_notify_tz_enable(const struct thermal_zone_device *tz); @@ -48,6 +64,16 @@ static inline int thermal_notify_tz_create(const struct thermal_zone_device *tz) return 0; } +static inline int thermal_genl_register_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int thermal_genl_unregister_notifier(struct notifier_block *nb) +{ + return 0; +} + static inline int thermal_notify_tz_delete(const struct thermal_zone_device *tz) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab8..8d458554bae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ struct user_event_mm; __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) +#define IPC_CLASS_UNCLASSIFIED 0 + #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) @@ -301,7 +303,7 @@ enum { TASK_COMM_LEN = 16, }; -extern void scheduler_tick(void); +extern void scheduler_tick(bool user_tick); #define MAX_SCHEDULE_TIMEOUT LONG_MAX @@ -1547,6 +1549,24 @@ struct task_struct { struct user_event_mm *user_event_mm; #endif +#ifdef CONFIG_IPC_CLASSES + /* + * A hardware-defined classification of task that reflects but is + * not identical to the number of instructions per cycle. + */ + unsigned int ipcc : 9; + /* + * A candidate classification that arch-specific implementations + * qualify for correctness. + */ + unsigned int ipcc_tmp : 9; + /* + * Counter to filter out transient candidate classifications + * of a task. + */ + unsigned int ipcc_cntr : 14; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. @@ -2183,4 +2203,6 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } extern void sched_set_stop_task(int cpu, struct task_struct *stop); +extern bool sched_smt_siblings_idle(int cpu); + #endif diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a6e04b4a21d..f32fce3fc8e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -292,4 +292,10 @@ static inline int task_node(const struct task_struct *p) return cpu_to_node(task_cpu(p)); } +#ifdef CONFIG_IPC_CLASSES +extern void sched_enable_ipc_classes(void); +#else +static inline void sched_enable_ipc_classes(void) { } +#endif + #endif /* _LINUX_SCHED_TOPOLOGY_H */ diff --git a/init/Kconfig b/init/Kconfig index bee58f7468c..3447c10cbdd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -849,6 +849,18 @@ config UCLAMP_BUCKETS_COUNT If in doubt, use the default value. 
+config IPC_CLASSES + bool "IPC classes of tasks" + depends on SMP + help + If selected, each task is assigned a classification value that + reflects the type of instructions that the task executes. This + classification reflects but is not equal to the number of + instructions retired per cycle. + + The scheduler uses the classification value to improve the placement + of tasks. + endmenu # diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9116bcc9034..5e07149813c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4515,6 +4515,11 @@ int wake_up_state(struct task_struct *p, unsigned int state) */ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { +#ifdef CONFIG_IPC_CLASSES + p->ipcc = IPC_CLASS_UNCLASSIFIED; + p->ipcc_tmp = IPC_CLASS_UNCLASSIFIED; + p->ipcc_cntr = 0; +#endif p->on_rq = 0; p->se.on_rq = 0; @@ -5653,7 +5658,7 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. */ -void scheduler_tick(void) +void scheduler_tick(bool user_tick) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); @@ -5665,6 +5670,9 @@ void scheduler_tick(void) if (housekeeping_cpu(cpu, HK_TYPE_TICK)) arch_scale_freq_tick(); + if (sched_ipcc_enabled() && user_tick) + arch_update_ipcc(curr); + sched_clock_tick(); rq_lock(rq, &rf); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 533547e3c90..38e0acfefb0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1305,7 +1305,14 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -static inline bool is_core_idle(int cpu) +/** + * sched_smt_siblings_idle - Check whether SMT siblings of a CPU are idle + * @cpu: The CPU to check + * + * Returns true if all the SMT siblings of @cpu are idle or @cpu does not have + * SMT siblings. The idle state of @cpu is not considered. + */ +bool sched_smt_siblings_idle(int cpu) { #ifdef CONFIG_SCHED_SMT int sibling; @@ -2008,7 +2015,7 @@ static inline int numa_idle_core(int idle_core, int cpu) * Prefer cores instead of packing HT siblings * and triggering future load balancing. */ - if (is_core_idle(cpu)) + if (sched_smt_siblings_idle(cpu)) idle_core = cpu; return idle_core; @@ -9449,6 +9456,13 @@ struct sg_lb_stats { unsigned int nr_numa_running; unsigned int nr_preferred_running; #endif +#ifdef CONFIG_IPC_CLASSES + unsigned long min_score; /* Min(score(rq->curr->ipcc)) */ + unsigned short min_ipcc; /* Class of the task with the minimum IPCC score in the rq */ + unsigned long sum_score; /* Sum(score(rq->curr->ipcc)) */ + long ipcc_score_after; /* Prospective IPCC score after load balancing */ + unsigned long ipcc_score_before; /* IPCC score before load balancing */ +#endif }; /* @@ -9727,6 +9741,248 @@ group_type group_classify(unsigned int imbalance_pct, return group_has_spare; } +#ifdef CONFIG_IPC_CLASSES +static void init_rq_ipcc_stats(struct sg_lb_stats *sgs) +{ + /* All IPCC stats have been set to zero in update_sg_lb_stats(). 
*/ + sgs->min_score = ULONG_MAX; +} + +static int rq_last_task_ipcc(int dst_cpu, struct rq *rq, unsigned short *ipcc) +{ + struct list_head *tasks = &rq->cfs_tasks; + struct task_struct *p; + struct rq_flags rf; + int ret = -EINVAL; + + rq_lock_irqsave(rq, &rf); + if (list_empty(tasks)) + goto out; + + p = list_last_entry(tasks, struct task_struct, se.group_node); + if (p->flags & PF_EXITING || is_idle_task(p) || + !cpumask_test_cpu(dst_cpu, p->cpus_ptr)) + goto out; + + ret = 0; + *ipcc = p->ipcc; +out: + rq_unlock(rq, &rf); + return ret; +} + +/* Called only if cpu_of(@rq) is not idle and has tasks running. */ +static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + struct rq *rq) +{ + unsigned short ipcc; + unsigned long score; + + if (!sched_ipcc_enabled()) + return; + + if (rq_last_task_ipcc(dst_cpu, rq, &ipcc)) + return; + + score = arch_get_ipcc_score(ipcc, cpu_of(rq)); + + /* + * Ignore tasks with invalid scores. When finding the busiest group, we + * prefer those with higher sum_score. This group will not be selected. + */ + if (IS_ERR_VALUE(score)) + return; + + sgs->sum_score += score; + + if (score < sgs->min_score) { + sgs->min_score = score; + sgs->min_ipcc = ipcc; + } +} + +static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, + struct sched_group *sg, + struct lb_env *env) +{ + unsigned long score_on_dst_cpu, before; + int busy_cpus; + long after; + + if (!sched_ipcc_enabled()) + return; + + /* + * IPCC scores are only useful during idle load balancing. For now, + * only asym_packing uses IPCC scores. + */ + if (!(env->sd->flags & SD_ASYM_PACKING) || + env->idle == CPU_NOT_IDLE) + return; + + /* + * IPCC scores are used to break ties only between these types of + * groups. + */ + if (sgs->group_type != group_fully_busy && + sgs->group_type != group_asym_packing) + return; + + busy_cpus = sgs->group_weight - sgs->idle_cpus; + + /* No busy CPUs in the group. No tasks to move. */ + if (!busy_cpus) + return; + + score_on_dst_cpu = arch_get_ipcc_score(sgs->min_ipcc, env->dst_cpu); + + /* + * Do not use IPC scores. sgs::ipcc_score_{after, before} will be zero + * and not used. + */ + if (IS_ERR_VALUE(score_on_dst_cpu)) + return; + + before = sgs->sum_score; + after = before - sgs->min_score; + + /* SMT siblings share throughput. */ + if (busy_cpus > 1 && sg->flags & SD_SHARE_CPUCAPACITY) { + before /= busy_cpus; + /* One sibling will become idle after load balance. */ + after /= busy_cpus - 1; + } + + sgs->ipcc_score_after = after + score_on_dst_cpu; + sgs->ipcc_score_before = before; +} + +/** + * sched_asym_ipcc_prefer - Select a sched group based on its IPCC score + * @a: Load balancing statistics of a sched group + * @b: Load balancing statistics of a second sched group + * + * Returns: true if @a has a higher IPCC score than @b after load balance. + * False otherwise. + */ +static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a, + struct sg_lb_stats *b) +{ + if (!sched_ipcc_enabled()) + return false; + + /* @a increases overall throughput after load balance. */ + if (a->ipcc_score_after > b->ipcc_score_after) + return true; + + /* + * If @a and @b yield the same overall throughput, pick @a if + * its current throughput is lower than that of @b. 
+ */ + if (a->ipcc_score_after == b->ipcc_score_after) + return a->ipcc_score_before < b->ipcc_score_before; + + return false; +} + +/** + * sched_asym_ipcc_pick - Select a sched group based on its IPCC score + * @a: A scheduling group + * @b: A second scheduling group + * @a_stats: Load balancing statistics of @a + * @b_stats: Load balancing statistics of @b + * + * Returns: true if @a has the same priority and @a has tasks with IPC classes + * that yield higher overall throughput after load balance. False otherwise. + */ +static bool sched_asym_ipcc_pick(struct sched_group *a, + struct sched_group *b, + struct sg_lb_stats *a_stats, + struct sg_lb_stats *b_stats) +{ + /* + * Only use the class-specific preference selection if both sched + * groups have the same priority. + */ + if (arch_asym_cpu_priority(a->asym_prefer_cpu) != + arch_asym_cpu_priority(b->asym_prefer_cpu)) + return false; + + return sched_asym_ipcc_prefer(a_stats, b_stats); +} + +/** + * ipcc_score_delta - Get the IPCC score delta wrt the load balance's dst_cpu + * @rq: A runqueue + * @env: Load balancing environment + * + * Returns: The IPCC score delta that the last task enqueued in @rq would get + * if placed in the destination CPU of @env. LONG_MIN to indicate that the + * delta should not be used. + */ +static long ipcc_score_delta(struct rq *rq, struct lb_env *env) +{ + unsigned long score_src, score_dst; + unsigned short ipcc; + + if (!sched_ipcc_enabled()) + return LONG_MIN; + + /* Only asym_packing uses IPCC scores at the moment. */ + if (!(env->sd->flags & SD_ASYM_PACKING)) + return LONG_MIN; + + if (rq_last_task_ipcc(env->dst_cpu, rq, &ipcc)) + return LONG_MIN; + + score_dst = arch_get_ipcc_score(ipcc, env->dst_cpu); + if (IS_ERR_VALUE(score_dst)) + return LONG_MIN; + + score_src = arch_get_ipcc_score(ipcc, cpu_of(rq)); + if (IS_ERR_VALUE(score_src)) + return LONG_MIN; + + return score_dst - score_src; +} + +#else /* CONFIG_IPC_CLASSES */ +static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + struct rq *rq) +{ +} + +static void init_rq_ipcc_stats(struct sg_lb_stats *sgs) +{ +} + +static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, + struct sched_group *sg, + struct lb_env *env) +{ +} + +static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a, + struct sg_lb_stats *b) +{ + return false; +} + +static bool sched_asym_ipcc_pick(struct sched_group *a, + struct sched_group *b, + struct sg_lb_stats *a_stats, + struct sg_lb_stats *b_stats) +{ + return false; +} + +static long ipcc_score_delta(struct rq *rq, struct lb_env *env) +{ + return LONG_MIN; +} + +#endif /* CONFIG_IPC_CLASSES */ + /** * sched_use_asym_prio - Check whether asym_packing priority must be used * @sd: The scheduling domain of the load balancing @@ -9743,7 +9999,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) if (!sched_smt_active()) return true; - return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); + return sd->flags & SD_SHARE_CPUCAPACITY || sched_smt_siblings_idle(cpu); } /** @@ -9882,6 +10138,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, int i, nr_running, local_group; memset(sgs, 0, sizeof(*sgs)); + init_rq_ipcc_stats(sgs); local_group = group == sds->local; @@ -9931,6 +10188,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (sgs->group_misfit_task_load < load) sgs->group_misfit_task_load = load; } + + update_sg_lb_ipcc_stats(env->dst_cpu, sgs, rq); } sgs->group_capacity = group->sgc->capacity; @@ -9950,6 +10209,9 @@ static inline void 
update_sg_lb_stats(struct lb_env *env, sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + if (!local_group) + update_sg_lb_stats_scores(sgs, group, env); + /* Computing avg_load makes sense only when group is overloaded */ if (sgs->group_type == group_overloaded) sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / @@ -10021,6 +10283,16 @@ static bool update_sd_pick_busiest(struct lb_env *env, /* Prefer to move from lowest priority CPU's work */ if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) return false; + + /* + * Unlike other callers of sched_asym_prefer(), here both @sg + * and @sds::busiest have tasks running. When they have equal + * priority, their IPC class scores can be used to select a + * better busiest. + */ + if (sched_asym_ipcc_pick(sds->busiest, sg, &sds->busiest_stat, sgs)) + return false; + break; case group_misfit_task: @@ -10061,10 +10333,21 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load == busiest->avg_load) { /* * SMT sched groups need more help than non-SMT groups. - * If @sg happens to also be SMT, either choice is good. */ - if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) - return false; + if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) { + if (!(sg->flags & SD_SHARE_CPUCAPACITY)) + return false; + + /* + * Between two SMT groups, use IPCC scores to pick the + * one that would improve throughput the most (only + * asym_packing uses IPCC scores for now). + */ + if (sched_ipcc_enabled() && + env->sd->flags & SD_ASYM_PACKING && + sched_asym_ipcc_prefer(busiest, sgs)) + return false; + } } break; @@ -10981,6 +11264,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, { struct rq *busiest = NULL, *rq; unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + long busiest_ipcc_delta = LONG_MIN; unsigned int busiest_nr = 0; int i; @@ -11097,6 +11381,26 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (busiest_nr < nr_running) { busiest_nr = nr_running; busiest = rq; + + /* + * Remember the IPCC score of the busiest + * runqueue. We may need it to break a tie with + * other queues with equal nr_running. + */ + busiest_ipcc_delta = ipcc_score_delta(busiest, env); + /* + * For ties, select @rq if doing would give its last + * queued task a bigger IPC boost when migrated to + * dst_cpu. + */ + } else if (busiest_nr == nr_running) { + long delta = ipcc_score_delta(rq, env); + + if (busiest_ipcc_delta < delta) { + busiest_ipcc_delta = delta; + busiest_nr = nr_running; + busiest = rq; + } } break; @@ -11228,7 +11532,7 @@ static int should_we_balance(struct lb_env *env) * balancing cores, but remember the first idle SMT CPU for * later consideration. Find CPU on an idle core first. */ - if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { + if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !sched_smt_siblings_idle(cpu)) { if (idle_smt == -1) idle_smt = cpu; /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 001fe047bd5..b741fca335b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2622,6 +2622,72 @@ void arch_scale_freq_tick(void) } #endif +#ifdef CONFIG_IPC_CLASSES +DECLARE_STATIC_KEY_FALSE(sched_ipcc); + +static inline bool sched_ipcc_enabled(void) +{ + return static_branch_unlikely(&sched_ipcc); +} + +#ifndef arch_update_ipcc +/** + * arch_update_ipcc() - Update the IPC class of the current task + * @curr: The current task + * + * Request that the IPC classification of @curr is updated. 
+ * + * Returns: none + */ +static __always_inline +void arch_update_ipcc(struct task_struct *curr) +{ +} +#endif + +#ifndef arch_get_ipcc_score + +#define SCHED_IPCC_SCORE_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) +/** + * arch_get_ipcc_score() - Get the IPC score of a class of task + * @ipcc: The IPC class + * @cpu: A CPU number + * + * The IPC performance scores reflects (but it is not identical to) the number + * of instructions retired per cycle for a given IPC class. It is a linear and + * abstract metric. Higher scores reflect better performance. + * + * The IPC score can be normalized with respect to the class, i, with the + * highest IPC score on the CPU, c, with highest performance: + * + * IPC(i, c) + * ------------------------------------ * SCHED_IPCC_SCORE_SCALE + * max(IPC(i, c) : (i, c)) + * + * Scheduling schemes that want to use the IPC score along with other + * normalized metrics for scheduling (e.g., CPU capacity) may need to normalize + * it. + * + * Other scheduling schemes (e.g., asym_packing) do not need normalization. + * + * Returns the performance score of an IPC class, @ipcc, when running on @cpu. + * Error when either @ipcc or @cpu are invalid. + */ +static __always_inline +unsigned long arch_get_ipcc_score(unsigned short ipcc, int cpu) +{ + return SCHED_IPCC_SCORE_SCALE; +} +#endif +#else /* CONFIG_IPC_CLASSES */ + +#define arch_get_ipcc_score(ipcc, cpu) (-EINVAL) +#define arch_update_ipcc(curr) + +static inline bool sched_ipcc_enabled(void) { return false; } + +#endif /* CONFIG_IPC_CLASSES */ + #ifndef arch_scale_freq_capacity /** * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 10d1391e741..da49c3c5162 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -677,6 +677,15 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); DEFINE_STATIC_KEY_FALSE(sched_cluster_active); +#ifdef CONFIG_IPC_CLASSES +DEFINE_STATIC_KEY_FALSE(sched_ipcc); + +void sched_enable_ipc_classes(void) +{ + static_branch_enable_cpuslocked(&sched_ipcc); +} +#endif + static void update_top_cache_domain(int cpu) { struct sched_domain_shared *sds = NULL; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 352b161113c..f739cd5912d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2089,7 +2089,7 @@ void update_process_times(int user_tick) if (in_irq()) irq_work_tick(); #endif - scheduler_tick(); + scheduler_tick(user_tick); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); } -- 2.44.0 From 6ac91be34077c54e9f7459098aff5b9d183de7f8 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Mon, 12 Feb 2024 17:16:13 +0100 Subject: [PATCH 2/2] genetlink: Add per family bind/unbind callbacks Add genetlink family bind()/unbind() callbacks when adding/removing multicast group to/from netlink client socket via setsockopt() or bind() syscall. They can be used to track if consumers of netlink multicast messages emerge or disappear. Thus, a client implementing callbacks, can now send events only when there are active consumers, preventing unnecessary work when none exist. 
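As an illustration only (not part of this change), a family that merely wants to know whether any listener exists can keep a counter in the new callbacks; the thermal driver earlier in this series is the first real user. The family name, group name and counter below are made up for the sketch, and ops/policy are omitted:

#include <linux/atomic.h>
#include <net/genetlink.h>

static atomic_t example_listeners = ATOMIC_INIT(0);

static const struct genl_multicast_group example_mcgrps[] = {
	{ .name = "events", },
};

/* A socket joined one of this family's multicast groups. */
static int example_genl_bind(int mcgrp)
{
	atomic_inc(&example_listeners);
	return 0;
}

/* A socket left the multicast group again. */
static void example_genl_unbind(int mcgrp)
{
	atomic_dec(&example_listeners);
}

static struct genl_family example_genl_family __ro_after_init = {
	.name		= "example",
	.version	= 1,
	/* ops and policy omitted for brevity */
	.mcgrps		= example_mcgrps,
	.n_mcgrps	= ARRAY_SIZE(example_mcgrps),
	.bind		= example_genl_bind,
	.unbind		= example_genl_unbind,
};

The family can then skip building and multicasting event messages whenever the counter is zero.
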
Suggested-by: Jakub Kicinski Signed-off-by: Stanislaw Gruszka Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240212161615.161935-2-stanislaw.gruszka@linux.intel.com Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 4 ++++ net/netlink/genetlink.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index e6146912940..ecadba836ae 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -41,6 +41,8 @@ struct genl_info; * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks + * @bind: called when family multicast group is added to a netlink socket + * @unbind: called when family multicast group is removed from a netlink socket * @module: pointer to the owning module (set to THIS_MODULE) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups @@ -84,6 +86,8 @@ struct genl_family { void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); + int (*bind)(int mcgrp); + void (*unbind)(int mcgrp); const struct genl_ops * ops; const struct genl_small_ops *small_ops; const struct genl_split_ops *split_ops; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 8c7af02f845..50ec599a5cf 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group) !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; + if (family->bind) + family->bind(i); + break; } @@ -1843,12 +1846,39 @@ static int genl_bind(struct net *net, int group) return ret; } +static void genl_unbind(struct net *net, int group) +{ + const struct genl_family *family; + unsigned int id; + + down_read(&cb_lock); + + idr_for_each_entry(&genl_fam_idr, family, id) { + int i; + + if (family->n_mcgrps == 0) + continue; + + i = group - family->mcgrp_offset; + if (i < 0 || i >= family->n_mcgrps) + continue; + + if (family->unbind) + family->unbind(i); + + break; + } + + up_read(&cb_lock); +} + static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, .bind = genl_bind, + .unbind = genl_unbind, .release = genl_release, }; -- 2.44.0 From 68a15ef01803c252261ebb47d86dfc1f2c68ae1e Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 6 Oct 2023 15:58:56 -0700 Subject: [PATCH] sched/fair: Don't force smt balancing when CPU has spare capacity Currently group_smt_balance is picked whenever there are more than two tasks on a core with two SMT. However, the utilization of those tasks may be low and do not warrant a task migration to a CPU of lower priority. Adjust sched group clssification and sibling_imbalance() to reflect this consideration. Use sibling_imbalance() to compute imbalance in calculate_imbalance() for the group_smt_balance case. 
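For instance (illustrative numbers, not from the patch): two low-utilization tasks sharing the SMT siblings of a high-priority core still leave the group with spare capacity, so it is no longer classified as group_smt_balance and a lower-priority idle CPU will not pull one of them away; a core whose siblings are both saturated is still classified as group_smt_balance and balanced as before.
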
Signed-off-by: Tim Chen --- kernel/sched/fair.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef7490c4b8b4..7dd7c2d2367a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9460,14 +9460,15 @@ group_type group_classify(unsigned int imbalance_pct, if (sgs->group_asym_packing) return group_asym_packing; - if (sgs->group_smt_balance) - return group_smt_balance; - if (sgs->group_misfit_task_load) return group_misfit_task; - if (!group_has_capacity(imbalance_pct, sgs)) - return group_fully_busy; + if (!group_has_capacity(imbalance_pct, sgs)) { + if (sgs->group_smt_balance) + return group_smt_balance; + else + return group_fully_busy; + } return group_has_spare; } @@ -9573,6 +9574,11 @@ static inline long sibling_imbalance(struct lb_env *env, if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) return 0; + /* Do not pull tasks off preferred group with spare capacity */ + if (busiest->group_type == group_has_spare && + sched_asym_prefer(sds->busiest->asym_prefer_cpu, env->dst_cpu)) + return 0; + ncores_busiest = sds->busiest->cores; ncores_local = sds->local->cores; @@ -10411,13 +10417,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return; } - if (busiest->group_type == group_smt_balance) { - /* Reduce number of tasks sharing CPU capacity */ - env->migration_type = migrate_task; - env->imbalance = 1; - return; - } - if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages -- 2.32.0
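For completeness, a note on the user-space side of the first two patches: simply joining the thermal generic netlink "event" multicast group is what makes the kernel invoke the new per-family bind() callback, which in turn lets the HFI driver enable the hardware feedback table for the first client. Below is a minimal libnl-3 sketch; it assumes the usual family/group names "thermal" and "event" from the thermal netlink UAPI and is not part of the series:

#include <stdio.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

int main(void)
{
	struct nl_sock *sk;
	int grp, err = 1;

	sk = nl_socket_alloc();
	if (!sk)
		return 1;

	/* Connect this socket to the generic netlink bus. */
	if (genl_connect(sk))
		goto out;

	/* Resolve the id of the thermal "event" multicast group. */
	grp = genl_ctrl_resolve_grp(sk, "thermal", "event");
	if (grp < 0)
		goto out;

	/*
	 * Joining the group triggers the kernel-side bind() callback added
	 * in this series; with the HFI patch applied, this is the point at
	 * which the hardware feedback table gets enabled.
	 */
	if (nl_socket_add_membership(sk, grp))
		goto out;

	printf("subscribed to thermal events (group %d)\n", grp);

	/* ... consume events, e.g. with nl_recvmsgs_default(sk) ... */

	nl_socket_drop_membership(sk, grp);	/* kernel sees UNBIND */
	err = 0;
out:
	nl_socket_free(sk);
	return err;
}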