diff --git a/.github/release-nest-v3 b/.github/release-nest-v3 index e440e5c..bf0d87a 100644 --- a/.github/release-nest-v3 +++ b/.github/release-nest-v3 @@ -1 +1 @@ -3 \ No newline at end of file +4 \ No newline at end of file diff --git a/config b/config index f8d6d5f..ff76b30 100644 --- a/config +++ b/config @@ -1,8 +1,8 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 6.11.0-rc6 Kernel Configuration +# Linux/x86 6.11.0 Kernel Configuration # -CONFIG_CC_VERSION_TEXT="gcc (GCC) 14.2.1 20240805" +CONFIG_CC_VERSION_TEXT="gcc (GCC) 14.2.1 20240910" CONFIG_CC_IS_GCC=y CONFIG_GCC_VERSION=140201 CONFIG_CLANG_VERSION=0 @@ -223,7 +223,7 @@ CONFIG_CGROUPS=y CONFIG_PAGE_COUNTER=y # CONFIG_CGROUP_FAVOR_DYNMODS is not set CONFIG_MEMCG=y -CONFIG_MEMCG_V1=y +# CONFIG_MEMCG_V1 is not set CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y @@ -2744,7 +2744,7 @@ CONFIG_NVME_TCP=m CONFIG_NVME_TCP_TLS=y CONFIG_NVME_HOST_AUTH=y CONFIG_NVME_TARGET=m -# CONFIG_NVME_TARGET_DEBUGFS is not set +CONFIG_NVME_TARGET_DEBUGFS=y CONFIG_NVME_TARGET_PASSTHRU=y CONFIG_NVME_TARGET_LOOP=m CONFIG_NVME_TARGET_RDMA=m @@ -3470,7 +3470,7 @@ CONFIG_FM10K=m CONFIG_IGC=m CONFIG_IGC_LEDS=y CONFIG_IDPF=m -CONFIG_IDPF_SINGLEQ=y +# CONFIG_IDPF_SINGLEQ is not set CONFIG_JME=m CONFIG_NET_VENDOR_ADI=y CONFIG_ADIN1110=m @@ -4562,7 +4562,7 @@ CONFIG_INPUT_AD714X_SPI=m CONFIG_INPUT_ARIZONA_HAPTICS=m CONFIG_INPUT_ATC260X_ONKEY=m CONFIG_INPUT_BMA150=m -CONFIG_INPUT_CS40L50_VIBRA=m +# CONFIG_INPUT_CS40L50_VIBRA is not set CONFIG_INPUT_E3X0_BUTTON=m CONFIG_INPUT_PCSPKR=m CONFIG_INPUT_MAX77693_HAPTIC=m @@ -5192,7 +5192,7 @@ CONFIG_GPIO_SIM=m # # GPIO Debugging utilities # -CONFIG_GPIO_VIRTUSER=m +# CONFIG_GPIO_VIRTUSER is not set # end of GPIO Debugging utilities CONFIG_W1=m @@ -10730,7 +10730,7 @@ CONFIG_OVERLAY_FS_METACOPY=y # CONFIG_NETFS_SUPPORT=m CONFIG_NETFS_STATS=y -# CONFIG_NETFS_DEBUG is not set +CONFIG_NETFS_DEBUG=y CONFIG_FSCACHE=y CONFIG_FSCACHE_STATS=y CONFIG_CACHEFILES=m @@ -11654,7 +11654,7 @@ CONFIG_PAHOLE_HAS_SPLIT_BTF=y CONFIG_PAHOLE_HAS_LANG_EXCLUDE=y CONFIG_DEBUG_INFO_BTF_MODULES=y # CONFIG_MODULE_ALLOW_BTF_MISMATCH is not set -# CONFIG_GDB_SCRIPTS is not set +CONFIG_GDB_SCRIPTS=y CONFIG_FRAME_WARN=2048 CONFIG_STRIP_ASM_SYMS=y # CONFIG_READABLE_ASM is not set diff --git a/patches/0001-cachyos-base-all.patch b/patches/0001-cachyos-base-all.patch index ed38f92..7987396 100644 --- a/patches/0001-cachyos-base-all.patch +++ b/patches/0001-cachyos-base-all.patch @@ -1,18 +1,53 @@ -From 89404bebea127570b279beb4ed0a30ace5403370 Mon Sep 17 00:00:00 2001 +From 67efcf30522cda8a81d47d35a9a89c24f5cdd00a Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 1 Sep 2024 10:38:28 +0200 +Date: Sun, 15 Sep 2024 17:28:12 +0200 Subject: [PATCH 01/11] amd-pstate Signed-off-by: Peter Jung --- - arch/x86/include/asm/processor.h | 3 - - arch/x86/kernel/acpi/cppc.c | 159 ++++++++++++++++++++++++++++++- - arch/x86/kernel/cpu/amd.c | 16 ---- - drivers/cpufreq/acpi-cpufreq.c | 12 ++- - drivers/cpufreq/amd-pstate.c | 143 +++++++-------------------- - include/acpi/cppc_acpi.h | 10 ++ - 6 files changed, 208 insertions(+), 135 deletions(-) + Documentation/admin-guide/pm/amd-pstate.rst | 15 +- + arch/x86/include/asm/processor.h | 3 - + arch/x86/kernel/acpi/cppc.c | 172 ++++++++++++++++++-- + arch/x86/kernel/cpu/amd.c | 16 -- + drivers/acpi/cppc_acpi.c | 10 +- + drivers/cpufreq/acpi-cpufreq.c | 12 +- + drivers/cpufreq/amd-pstate.c | 133 ++++----------- + include/acpi/cppc_acpi.h | 41 +++-- + 8 files changed, 254 insertions(+), 148 
deletions(-) +diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst +index d0324d44f548..210a808b74ec 100644 +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -251,7 +251,9 @@ performance supported in `AMD CPPC Performance Capability `_). + In some ASICs, the highest CPPC performance is not the one in the ``_CPC`` + table, so we need to expose it to sysfs. If boost is not active, but + still supported, this maximum frequency will be larger than the one in +-``cpuinfo``. ++``cpuinfo``. On systems that support preferred core, the driver will have ++different values for some cores than others and this will reflect the values ++advertised by the platform at bootup. + This attribute is read-only. + + ``amd_pstate_lowest_nonlinear_freq`` +@@ -262,6 +264,17 @@ lowest non-linear performance in `AMD CPPC Performance Capability + `_.) + This attribute is read-only. + ++``amd_pstate_hw_prefcore`` ++ ++Whether the platform supports the preferred core feature and it has been ++enabled. This attribute is read-only. ++ ++``amd_pstate_prefcore_ranking`` ++ ++The performance ranking of the core. This number doesn't have any unit, but ++larger numbers are preferred at the time of reading. This can change at ++runtime based on platform conditions. This attribute is read-only. ++ + ``energy_performance_available_preferences`` + + A list of all the supported EPP preferences that could be used for diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a75a07f4931f..775acbdea1a9 100644 --- a/arch/x86/include/asm/processor.h @@ -35,7 +70,7 @@ index a75a07f4931f..775acbdea1a9 100644 static inline void amd_check_microcode(void) { } #endif diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c -index ff8f25faca3d..44b13a4e2874 100644 +index ff8f25faca3d..956984054bf3 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -9,6 +9,17 @@ @@ -56,7 +91,14 @@ index ff8f25faca3d..44b13a4e2874 100644 /* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ bool cpc_supported_by_cpu(void) -@@ -75,15 +86,17 @@ static void amd_set_max_freq_ratio(void) +@@ -69,31 +80,30 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) + static void amd_set_max_freq_ratio(void) + { + struct cppc_perf_caps perf_caps; +- u64 highest_perf, nominal_perf; ++ u64 numerator, nominal_perf; + u64 perf_ratio; + int rc; rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { @@ -66,9 +108,11 @@ index ff8f25faca3d..44b13a4e2874 100644 } - highest_perf = amd_get_highest_perf(); -+ rc = amd_get_boost_ratio_numerator(0, &highest_perf); -+ if (rc) -+ pr_warn("Could not retrieve highest performance\n"); ++ rc = amd_get_boost_ratio_numerator(0, &numerator); ++ if (rc) { ++ pr_warn("Could not retrieve highest performance (%d)\n", rc); ++ return; ++ } nominal_perf = perf_caps.nominal_perf; - if (!highest_perf || !nominal_perf) { @@ -78,16 +122,18 @@ index ff8f25faca3d..44b13a4e2874 100644 return; } -@@ -91,7 +104,7 @@ static void amd_set_max_freq_ratio(void) +- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); /* midpoint between max_boost and max_P */ - perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; - if (!perf_ratio) { +- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; +- if (!perf_ratio) { - pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); -+ pr_warn("Non-zero highest/nominal perf values led to a 0 ratio\n"); 
- return; - } +- return; +- } ++ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1; -@@ -116,3 +129,139 @@ void init_freq_invariance_cppc(void) + freq_invariance_set_perf_ratio(perf_ratio, false); + } +@@ -116,3 +126,143 @@ void init_freq_invariance_cppc(void) init_done = true; mutex_unlock(&freq_invariance_lock); } @@ -191,6 +237,10 @@ index ff8f25faca3d..44b13a4e2874 100644 + * a CPU. On systems that support preferred cores, this will be a hardcoded + * value. On other systems this will the highest performance register value. + * ++ * If booting the system with amd-pstate enabled but preferred cores disabled then ++ * the correct boost numerator will be returned to match hardware capabilities ++ * even if the preferred cores scheduling hints are not enabled. ++ * + * Return: 0 for success, negative error code otherwise. + */ +int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) @@ -254,6 +304,36 @@ index 1e0fe5f8ab84..015971adadfc 100644 static void zenbleed_check_cpu(void *unused) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); +diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c +index dd3d3082c8c7..3b5b695bb80b 100644 +--- a/drivers/acpi/cppc_acpi.c ++++ b/drivers/acpi/cppc_acpi.c +@@ -103,6 +103,11 @@ static DEFINE_PER_CPU(struct cpc_desc *, cpc_desc_ptr); + (cpc)->cpc_entry.reg.space_id == \ + ACPI_ADR_SPACE_PLATFORM_COMM) + ++/* Check if a CPC register is in FFH */ ++#define CPC_IN_FFH(cpc) ((cpc)->type == ACPI_TYPE_BUFFER && \ ++ (cpc)->cpc_entry.reg.space_id == \ ++ ACPI_ADR_SPACE_FIXED_HARDWARE) ++ + /* Check if a CPC register is in SystemMemory */ + #define CPC_IN_SYSTEM_MEMORY(cpc) ((cpc)->type == ACPI_TYPE_BUFFER && \ + (cpc)->cpc_entry.reg.space_id == \ +@@ -1486,9 +1491,12 @@ int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) + /* after writing CPC, transfer the ownership of PCC to platform */ + ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); + up_write(&pcc_ss_data->pcc_lock); ++ } else if (osc_cpc_flexible_adr_space_confirmed && ++ CPC_SUPPORTED(epp_set_reg) && CPC_IN_FFH(epp_set_reg)) { ++ ret = cpc_write(cpu, epp_set_reg, perf_ctrls->energy_perf); + } else { + ret = -ENOTSUPP; +- pr_debug("_CPC in PCC is not supported\n"); ++ pr_debug("_CPC in PCC and _CPC in FFH are not supported\n"); + } + + return ret; diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index a8ca625a98b8..0f04feb6cafa 100644 --- a/drivers/cpufreq/acpi-cpufreq.c @@ -279,7 +359,7 @@ index a8ca625a98b8..0f04feb6cafa 100644 nominal_perf = perf_caps.nominal_perf; diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 89bda7a2bb8d..93cac81e1cbe 100644 +index 259a917da75f..113f82130a30 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -52,8 +52,6 @@ @@ -337,7 +417,13 @@ index 89bda7a2bb8d..93cac81e1cbe 100644 WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); -@@ -426,12 +398,7 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) +@@ -420,19 +392,13 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) + static int cppc_init_perf(struct amd_cpudata *cpudata) + { + struct cppc_perf_caps cppc_perf; +- u32 highest_perf; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); if (ret) return ret; @@ -347,11 +433,13 @@ index 89bda7a2bb8d..93cac81e1cbe 100644 - 
highest_perf = cppc_perf.highest_perf; - - WRITE_ONCE(cpudata->highest_perf, highest_perf); +- WRITE_ONCE(cpudata->max_limit_perf, highest_perf); + WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); - WRITE_ONCE(cpudata->max_limit_perf, highest_perf); ++ WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf); WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); WRITE_ONCE(cpudata->lowest_nonlinear_perf, -@@ -554,12 +521,15 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, + cppc_perf.lowest_nonlinear_perf); +@@ -554,12 +520,15 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, } if (value == prev) @@ -368,7 +456,7 @@ index 89bda7a2bb8d..93cac81e1cbe 100644 } static int amd_pstate_verify(struct cpufreq_policy_data *policy) -@@ -803,66 +773,22 @@ static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) +@@ -803,66 +772,22 @@ static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) } static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); @@ -438,7 +526,7 @@ index 89bda7a2bb8d..93cac81e1cbe 100644 schedule_work(&sched_prefcore_work); } -@@ -875,17 +801,17 @@ static void amd_pstate_update_limits(unsigned int cpu) +@@ -875,17 +800,17 @@ static void amd_pstate_update_limits(unsigned int cpu) int ret; bool highest_perf_changed = false; @@ -462,7 +550,7 @@ index 89bda7a2bb8d..93cac81e1cbe 100644 WRITE_ONCE(cpudata->prefcore_ranking, cur_high); if (cur_high < CPPC_MAX_PERF) -@@ -949,8 +875,8 @@ static u32 amd_pstate_get_transition_latency(unsigned int cpu) +@@ -949,8 +874,8 @@ static u32 amd_pstate_get_transition_latency(unsigned int cpu) static int amd_pstate_init_freq(struct amd_cpudata *cpudata) { int ret; @@ -473,7 +561,7 @@ index 89bda7a2bb8d..93cac81e1cbe 100644 u32 nominal_perf, nominal_freq; u32 lowest_nonlinear_perf, lowest_nonlinear_freq; u32 boost_ratio, lowest_nonlinear_ratio; -@@ -972,8 +898,10 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) +@@ -972,8 +897,10 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) nominal_perf = READ_ONCE(cpudata->nominal_perf); @@ -486,7 +574,7 @@ index 89bda7a2bb8d..93cac81e1cbe 100644 max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); -@@ -1028,12 +956,12 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) +@@ -1028,12 +955,12 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) cpudata->cpu = policy->cpu; @@ -501,21 +589,7 @@ index 89bda7a2bb8d..93cac81e1cbe 100644 ret = amd_pstate_init_freq(cpudata); if (ret) goto free_cpudata1; -@@ -1187,12 +1115,7 @@ static ssize_t show_amd_pstate_prefcore_ranking(struct cpufreq_policy *policy, - static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, - char *buf) - { -- bool hw_prefcore; -- struct amd_cpudata *cpudata = policy->driver_data; -- -- hw_prefcore = READ_ONCE(cpudata->hw_prefcore); -- -- return sysfs_emit(buf, "%s\n", str_enabled_disabled(hw_prefcore)); -+ return sysfs_emit(buf, "%s\n", str_enabled_disabled(amd_pstate_prefcore)); - } - - static ssize_t show_energy_performance_available_preferences( -@@ -1483,12 +1406,12 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) +@@ -1483,12 +1410,12 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) cpudata->cpu = policy->cpu; cpudata->epp_policy = 0; @@ -530,20 +604,7 @@ index 89bda7a2bb8d..93cac81e1cbe 100644 ret = amd_pstate_init_freq(cpudata); if 
(ret) goto free_cpudata1; -@@ -1841,10 +1764,8 @@ static bool amd_cppc_supported(void) - * the code is added for debugging purposes. - */ - if (!cpu_feature_enabled(X86_FEATURE_CPPC)) { -- if (cpu_feature_enabled(X86_FEATURE_ZEN1) || cpu_feature_enabled(X86_FEATURE_ZEN2)) { -- if (c->x86_model > 0x60 && c->x86_model < 0xaf) -- warn = true; -- } else if (cpu_feature_enabled(X86_FEATURE_ZEN3) || cpu_feature_enabled(X86_FEATURE_ZEN4)) { -+ if (cpu_feature_enabled(X86_FEATURE_ZEN3) || -+ cpu_feature_enabled(X86_FEATURE_ZEN4)) { - if ((c->x86_model > 0x10 && c->x86_model < 0x1F) || - (c->x86_model > 0x40 && c->x86_model < 0xaf)) - warn = true; -@@ -1933,6 +1854,12 @@ static int __init amd_pstate_init(void) +@@ -1947,6 +1874,12 @@ static int __init amd_pstate_init(void) static_call_update(amd_pstate_update_perf, cppc_update_perf); } @@ -557,32 +618,106 @@ index 89bda7a2bb8d..93cac81e1cbe 100644 ret = amd_pstate_enable(true); if (ret) { diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h -index 930b6afba6f4..1d79320a2349 100644 +index 930b6afba6f4..482e0587a041 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h -@@ -136,6 +136,16 @@ struct cppc_cpudata { - cpumask_var_t shared_cpu_map; - }; - -+#ifdef CONFIG_CPU_SUP_AMD -+extern int amd_detect_prefcore(bool *detected); +@@ -159,34 +159,37 @@ extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); + extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); + extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps); + extern int cppc_set_auto_sel(int cpu, bool enable); +extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf); +extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator); -+#else /* !CONFIG_CPU_SUP_AMD */ -+static inline int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) { return -ENODEV; } -+static inline int amd_detect_prefcore(bool *detected) { return -ENODEV; } -+static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) { return -ENODEV; } -+#endif /* !CONFIG_CPU_SUP_AMD */ -+ - #ifdef CONFIG_ACPI_CPPC_LIB - extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); - extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf); ++extern int amd_detect_prefcore(bool *detected); + #else /* !CONFIG_ACPI_CPPC_LIB */ + static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_enable(int cpu, bool enable) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline bool cppc_perf_ctrs_in_pcc(void) + { +@@ -210,27 +213,39 @@ static inline bool cpc_ffh_supported(void) + } + static inline int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cpc_write_ffh(int cpunum, 
struct cpc_reg *reg, u64 val) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_auto_sel(int cpu, bool enable) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; ++} ++static inline int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) ++{ ++ return -ENODEV; ++} ++static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) ++{ ++ return -EOPNOTSUPP; ++} ++static inline int amd_detect_prefcore(bool *detected) ++{ ++ return -ENODEV; + } + #endif /* !CONFIG_ACPI_CPPC_LIB */ + -- -2.46.0 +2.46.1 -From c03b9d435583136f64f0b91d4ac79f27d0e176cd Mon Sep 17 00:00:00 2001 +From 2676833deb16654c45007f79fb6725a3409899ff Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 1 Sep 2024 10:38:38 +0200 +Date: Sun, 15 Sep 2024 17:28:27 +0200 Subject: [PATCH 02/11] bbr3 Signed-off-by: Peter Jung @@ -3964,11 +4099,11 @@ index 4d40615dc8fc..f27941201ef2 100644 event = icsk->icsk_pending; -- -2.46.0 +2.46.1 -From cfadd59d3bf4eb2ba75e1b778510b13bd2299f1f Mon Sep 17 00:00:00 2001 +From 8f73cbbad2683b2bebffdf85fb133c78e44603a4 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 1 Sep 2024 10:38:49 +0200 +Date: Sun, 15 Sep 2024 17:28:36 +0200 Subject: [PATCH 03/11] block Signed-off-by: Peter Jung @@ -4449,11 +4584,11 @@ index acdc28756d9d..8b214233a061 100644 if (dd_has_work_for_prio(&dd->per_prio[prio])) return true; -- -2.46.0 +2.46.1 -From 0dbaf3e34fbdd41800ee182694dc6b4a16bd5021 Mon Sep 17 00:00:00 2001 +From e1cb9da59b75cd677cd7af9923864344099b0973 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 1 Sep 2024 10:38:57 +0200 +Date: Sun, 15 Sep 2024 17:28:46 +0200 Subject: [PATCH 04/11] cachy Signed-off-by: Peter Jung @@ -4517,8 +4652,8 @@ Signed-off-by: Peter Jung mm/vmpressure.c | 4 + mm/vmscan.c | 142 + scripts/Makefile.package | 3 +- - scripts/package/PKGBUILD | 39 +- - 60 files changed, 6100 insertions(+), 110 deletions(-) + scripts/package/PKGBUILD | 52 +- + 60 files changed, 6113 insertions(+), 110 deletions(-) create mode 100644 drivers/i2c/busses/i2c-nct6775.c create mode 100644 drivers/media/v4l2-core/v4l2loopback.c create mode 100644 drivers/media/v4l2-core/v4l2loopback.h @@ -4653,10 +4788,10 @@ index f48eaa98d22d..fc777c14cff6 100644 unprivileged_userfaultfd ======================== diff --git a/Makefile b/Makefile -index d57cfc6896b8..e280a998f618 100644 +index 34bd1d5f9672..7b497ab43754 100644 --- a/Makefile +++ b/Makefile -@@ -802,6 +802,9 @@ KBUILD_CFLAGS += -fno-delete-null-pointer-checks +@@ -803,6 +803,9 @@ KBUILD_CFLAGS += -fno-delete-null-pointer-checks ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 KBUILD_RUSTFLAGS += -Copt-level=2 @@ -4666,7 +4801,7 @@ index d57cfc6896b8..e280a998f618 100644 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os KBUILD_RUSTFLAGS += -Copt-level=s -@@ -990,9 +993,9 @@ KBUILD_CFLAGS += -fno-strict-overflow +@@ -991,9 +994,9 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check @@ -5436,7 +5571,7 @@ index f9bc95f4488d..e85dd2bf39ed 100644 slab_kill: diff --git a/block/elevator.c 
b/block/elevator.c -index f13d552a32c8..c9422523e393 100644 +index c355b55d0107..41cf94c3671e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -567,9 +567,19 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) @@ -5676,10 +5811,10 @@ index df17e79c45c7..e454488c1a31 100644 + endmenu diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -index 983a977632ff..68357fb6b551 100644 +index 1e069fa5211e..f16a43106eb0 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -@@ -4384,7 +4384,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) +@@ -4408,7 +4408,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) return r; } @@ -5778,10 +5913,10 @@ index d5d6ab484e5a..dccba7bcdf97 100644 } diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -index 74e35f8ddefc..c2c2c915db99 100644 +index 2cf951184561..1a53bf05f8fc 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -@@ -2760,7 +2760,10 @@ int smu_get_power_limit(void *handle, +@@ -2762,7 +2762,10 @@ int smu_get_power_limit(void *handle, *limit = smu->max_power_limit; break; case SMU_PPT_LIMIT_MIN: @@ -5793,7 +5928,7 @@ index 74e35f8ddefc..c2c2c915db99 100644 break; default: return -EINVAL; -@@ -2784,7 +2787,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit) +@@ -2786,7 +2789,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit) if (smu->ppt_funcs->set_power_limit) return smu->ppt_funcs->set_power_limit(smu, limit_type, limit); @@ -10986,10 +11121,10 @@ index d4d2f4d1d7cb..e0e19d9c1323 100644 char name[CPUFREQ_NAME_LEN]; int (*init)(struct cpufreq_policy *policy); diff --git a/include/linux/mm.h b/include/linux/mm.h -index 6549d0979b28..dca9a4444101 100644 +index 147073601716..9fafa99d56d1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h -@@ -197,6 +197,14 @@ static inline void __mm_zero_struct_page(struct page *page) +@@ -201,6 +201,14 @@ static inline void __mm_zero_struct_page(struct page *page) extern int sysctl_max_map_count; @@ -11466,7 +11601,7 @@ index 4430ac68e4c4..3bd08b60a9b3 100644 EXPORT_SYMBOL_GPL(dirty_writeback_interval); diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index c565de8f48e9..ef44703d2070 100644 +index 91ace8ca97e2..f8b4dae35fc3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -271,7 +271,11 @@ const char * const migratetype_names[MIGRATE_TYPES] = { @@ -11519,7 +11654,7 @@ index bd5183dfd879..3a410f53a07c 100644 /* diff --git a/mm/vmscan.c b/mm/vmscan.c -index cfa839284b92..9b7bb2a5626c 100644 +index bd489c1af228..fb8fc07523b9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -147,6 +147,15 @@ struct scan_control { @@ -11572,7 +11707,7 @@ index cfa839284b92..9b7bb2a5626c 100644 /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing -@@ -2411,6 +2436,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, +@@ -2391,6 +2416,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, goto out; } @@ -11588,7 +11723,7 @@ index cfa839284b92..9b7bb2a5626c 100644 /* * If there is enough inactive page cache, we do not reclaim * anything from the anonymous working right now. 
-@@ -2555,6 +2589,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, +@@ -2535,6 +2569,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, BUG(); } @@ -11603,7 +11738,7 @@ index cfa839284b92..9b7bb2a5626c 100644 nr[lru] = scan; } } -@@ -3988,7 +4030,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc +@@ -3968,7 +4010,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ @@ -11615,7 +11750,7 @@ index cfa839284b92..9b7bb2a5626c 100644 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { -@@ -4026,6 +4072,96 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +@@ -4006,6 +4052,96 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) } } @@ -11712,7 +11847,7 @@ index cfa839284b92..9b7bb2a5626c 100644 /****************************************************************************** * rmap/PT walk feedback ******************************************************************************/ -@@ -4519,6 +4655,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw +@@ -4499,6 +4635,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw */ if (!swappiness) type = LRU_GEN_FILE; @@ -11721,7 +11856,7 @@ index cfa839284b92..9b7bb2a5626c 100644 else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) type = LRU_GEN_ANON; else if (swappiness == 1) -@@ -4798,6 +4936,8 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) +@@ -4778,6 +4916,8 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -11730,7 +11865,7 @@ index cfa839284b92..9b7bb2a5626c 100644 /* lru_gen_age_node() called mem_cgroup_calculate_protection() */ if (mem_cgroup_below_min(NULL, memcg)) return MEMCG_LRU_YOUNG; -@@ -5945,6 +6085,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) +@@ -5925,6 +6065,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) prepare_scan_control(pgdat, sc); @@ -11754,7 +11889,7 @@ index 4a80584ec771..11d53f240a2b 100644 KBUILD_MAKEFLAGS="$(MAKEFLAGS)" \ KBUILD_REVISION="$(shell $(srctree)/scripts/build-version)" \ diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD -index 663ce300dd06..839cd5e634d2 100644 +index 663ce300dd06..f83493838cf9 100644 --- a/scripts/package/PKGBUILD +++ b/scripts/package/PKGBUILD @@ -3,10 +3,13 @@ @@ -11767,7 +11902,7 @@ index 663ce300dd06..839cd5e634d2 100644 -fi +pkgname=("${pkgbase}") + -+_extrapackages=${PACMAN_EXTRAPACKAGES-headers api-headers} ++_extrapackages=${PACMAN_EXTRAPACKAGES-headers api-headers debug} +for pkg in $_extrapackages; do + pkgname+=("${pkgbase}-${pkg}") +done @@ -11830,7 +11965,7 @@ index 663ce300dd06..839cd5e634d2 100644 cp System.map "${builddir}/System.map" cp .config "${builddir}/.config" -@@ -94,8 +106,7 @@ _package-api-headers() { +@@ -94,12 +106,24 @@ _package-api-headers() { provides=(linux-api-headers) conflicts=(linux-api-headers) @@ -11840,12 +11975,29 @@ index 663ce300dd06..839cd5e634d2 100644 ${MAKE} headers_install INSTALL_HDR_PATH="${pkgdir}/usr" } + ++_package-debug(){ ++ pkgdesc="Non-stripped vmlinux file for the ${pkgdesc} kernel" ++ ++ local debugdir="${pkgdir}/usr/src/debug/${pkgbase}" ++ local 
builddir="${pkgdir}/usr/${MODLIB}/build" ++ ++ _prologue ++ ++ install -Dt "${debugdir}" -m644 vmlinux ++ mkdir -p "${builddir}" ++ ln -sr "${debugdir}/vmlinux" "${builddir}/vmlinux" ++} ++ + for _p in "${pkgname[@]}"; do + eval "package_$_p() { + $(declare -f "_package${_p#$pkgbase}") -- -2.46.0 +2.46.1 -From 8ece8492ab552375f939b01887f695b4c8bd7d39 Mon Sep 17 00:00:00 2001 +From c4201124b983cba28153bd6385dd44b26ffad1e7 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 1 Sep 2024 10:39:07 +0200 +Date: Sun, 15 Sep 2024 17:28:56 +0200 Subject: [PATCH 05/11] fixes Signed-off-by: Peter Jung @@ -12097,11 +12249,11 @@ index 3cffa6c79538..8b7a5a31e8c1 100644 {} }; -- -2.46.0 +2.46.1 -From ce9f099c5a2244989b1cb027580ead7a80d4fa55 Mon Sep 17 00:00:00 2001 +From b24e8834eb51bed12079009ec0ab23b16bc73198 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 1 Sep 2024 10:39:19 +0200 +Date: Sun, 15 Sep 2024 17:29:06 +0200 Subject: [PATCH 06/11] intel-pstate Signed-off-by: Peter Jung @@ -12581,11 +12733,11 @@ index 7d92f16a430a..86ad1fed71f1 100644 } -- -2.46.0 +2.46.1 -From f9f37e11d53bf4044bb8bf7b8a74e19090fc9bda Mon Sep 17 00:00:00 2001 +From 8eb5e816f13a599dd0385bccc1df837664cc7233 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 1 Sep 2024 10:39:28 +0200 +Date: Sun, 15 Sep 2024 17:29:23 +0200 Subject: [PATCH 07/11] ksm Signed-off-by: Peter Jung @@ -13023,11 +13175,11 @@ index 01071182763e..7394bad8178e 100644 +464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable +465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status -- -2.46.0 +2.46.1 -From 374d5a89939e019d2d3abd22f683486c82230791 Mon Sep 17 00:00:00 2001 +From c2703b85f5713426c2ab1e6f25f2364582f053fe Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 1 Sep 2024 10:39:38 +0200 +Date: Sun, 15 Sep 2024 17:29:35 +0200 Subject: [PATCH 08/11] ntsync Signed-off-by: Peter Jung @@ -13467,10 +13619,10 @@ index 000000000000..767844637a7d + ``objs`` and in ``alert``. If this is attempted, the function fails + with ``EINVAL``. 
diff --git a/MAINTAINERS b/MAINTAINERS -index fe83ba7194ea..d9681e662200 100644 +index cc40a9d9b8cd..2cd7168dc401 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -16306,6 +16306,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git +@@ -16319,6 +16319,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git F: Documentation/filesystems/ntfs3.rst F: fs/ntfs3/ @@ -16112,11 +16264,11 @@ index 000000000000..5fa2c9a0768c + +TEST_HARNESS_MAIN -- -2.46.0 +2.46.1 -From 99b646a36247a68818da737fd76441bdfe531213 Mon Sep 17 00:00:00 2001 +From bdd3f4dab12fd8eb06357b2b5593820eb3128651 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 1 Sep 2024 10:39:50 +0200 +Date: Sun, 15 Sep 2024 17:29:46 +0200 Subject: [PATCH 09/11] perf-per-core Signed-off-by: Peter Jung @@ -17020,11 +17172,11 @@ index 9a6069e7133c..23722aa21e2f 100644 /* Package relative core ID */ -- -2.46.0 +2.46.1 -From ca5746bedb2da2feb67f1cfed4e80ace86b3c240 Mon Sep 17 00:00:00 2001 +From f2a149176007718766b709be8299b005ddd6b158 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 1 Sep 2024 10:40:01 +0200 +Date: Sun, 15 Sep 2024 17:30:08 +0200 Subject: [PATCH 10/11] t2 Signed-off-by: Peter Jung @@ -17183,10 +17335,10 @@ index 4451ef501936..c726a846f752 100644 ---- diff --git a/MAINTAINERS b/MAINTAINERS -index d9681e662200..67ef02a08b8b 100644 +index 2cd7168dc401..16df466c205d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -6894,6 +6894,12 @@ S: Supported +@@ -6895,6 +6895,12 @@ S: Supported T: git https://gitlab.freedesktop.org/drm/misc/kernel.git F: drivers/gpu/drm/sun4i/sun8i* @@ -17326,10 +17478,10 @@ index 49a1ac4f5491..c8c10a6104c4 100644 fb->base.width, fb->base.height, sizes->fb_width, sizes->fb_height); diff --git a/drivers/gpu/drm/i915/display/intel_quirks.c b/drivers/gpu/drm/i915/display/intel_quirks.c -index 14d5fefc9c5b..727639b8f6a6 100644 +index dfd8b4960e6d..7232f9acd0a0 100644 --- a/drivers/gpu/drm/i915/display/intel_quirks.c +++ b/drivers/gpu/drm/i915/display/intel_quirks.c -@@ -59,6 +59,18 @@ static void quirk_increase_ddi_disabled_time(struct intel_display *display) +@@ -64,6 +64,18 @@ static void quirk_increase_ddi_disabled_time(struct intel_display *display) drm_info(display->drm, "Applying Increase DDI Disabled quirk\n"); } @@ -17348,7 +17500,7 @@ index 14d5fefc9c5b..727639b8f6a6 100644 static void quirk_no_pps_backlight_power_hook(struct intel_display *display) { intel_set_quirk(display, QUIRK_NO_PPS_BACKLIGHT_POWER_HOOK); -@@ -201,6 +213,9 @@ static struct intel_quirk intel_quirks[] = { +@@ -229,6 +241,9 @@ static struct intel_quirk intel_quirks[] = { { 0x3184, 0x1019, 0xa94d, quirk_increase_ddi_disabled_time }, /* HP Notebook - 14-r206nv */ { 0x0f31, 0x103c, 0x220f, quirk_invert_brightness }, @@ -17357,15 +17509,15 @@ index 14d5fefc9c5b..727639b8f6a6 100644 + { 0x3e9b, 0x106b, 0x0176, quirk_ddi_a_force_4_lanes }, }; - void intel_init_quirks(struct intel_display *display) + static struct intel_dpcd_quirk intel_dpcd_quirks[] = { diff --git a/drivers/gpu/drm/i915/display/intel_quirks.h b/drivers/gpu/drm/i915/display/intel_quirks.h -index 151c8f4ae576..46e7feba88f4 100644 +index cafdebda7535..a5296f82776e 100644 --- a/drivers/gpu/drm/i915/display/intel_quirks.h +++ b/drivers/gpu/drm/i915/display/intel_quirks.h -@@ -17,6 +17,7 @@ enum intel_quirk_id { - QUIRK_INVERT_BRIGHTNESS, +@@ -20,6 +20,7 @@ enum intel_quirk_id { QUIRK_LVDS_SSC_DISABLE, QUIRK_NO_PPS_BACKLIGHT_POWER_HOOK, + QUIRK_FW_SYNC_LEN, + QUIRK_DDI_A_FORCE_4_LANES, }; @@ -27435,11 +27587,11 @@ index 4427572b2477..b60c99d61882 
100755 last; } -- -2.46.0 +2.46.1 -From dec2bed6cdefc145bc1bc816f59c2bb833070742 Mon Sep 17 00:00:00 2001 +From d312d9b44e2d51dd64ceecf38fccbfbcc8944738 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 1 Sep 2024 10:40:12 +0200 +Date: Sun, 15 Sep 2024 17:30:16 +0200 Subject: [PATCH 11/11] zstd Signed-off-by: Peter Jung @@ -46087,4 +46239,4 @@ index f4ed952ed485..7d31518e9d5a 100644 EXPORT_SYMBOL(zstd_reset_dstream); -- -2.46.0 +2.46.1 diff --git a/patches/0002-sched-ext.patch b/patches/0002-sched-ext.patch index 16ea79f..0a4809b 100644 --- a/patches/0002-sched-ext.patch +++ b/patches/0002-sched-ext.patch @@ -1,9 +1,9 @@ -From e7c32074ed8a979d226208ce91b05f04dfc7667d Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 29 Jul 2024 14:07:56 +0200 +From b9aae9642313560e22d9424b9b32c15dc23e36e1 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Thu, 12 Sep 2024 07:55:48 +0200 Subject: [PATCH] sched-ext -Signed-off-by: Peter Jung +Signed-off-by: Piotr Gorski --- Documentation/scheduler/index.rst | 1 + Documentation/scheduler/sched-ext.rst | 316 + @@ -12,38 +12,42 @@ Signed-off-by: Peter Jung include/asm-generic/vmlinux.lds.h | 1 + include/linux/cgroup.h | 4 +- include/linux/sched.h | 5 + - include/linux/sched/ext.h | 206 + - include/linux/sched/task.h | 3 +- + include/linux/sched/ext.h | 216 + + include/linux/sched/task.h | 8 +- include/trace/events/sched_ext.h | 32 + include/uapi/linux/sched.h | 1 + + init/Kconfig | 10 + init/init_task.c | 12 + - kernel/Kconfig.preempt | 26 +- + kernel/Kconfig.preempt | 27 +- kernel/fork.c | 17 +- kernel/sched/build_policy.c | 11 + - kernel/sched/core.c | 192 +- + kernel/sched/core.c | 288 +- kernel/sched/cpufreq_schedutil.c | 50 +- kernel/sched/debug.c | 3 + - kernel/sched/ext.c | 6537 +++++++++++++++++ - kernel/sched/ext.h | 69 + + kernel/sched/ext.c | 7213 +++++++++++++++++ + kernel/sched/ext.h | 91 + kernel/sched/fair.c | 22 +- kernel/sched/idle.c | 2 + - kernel/sched/sched.h | 158 +- + kernel/sched/sched.h | 193 +- kernel/sched/syscalls.c | 26 + lib/dump_stack.c | 1 + tools/Makefile | 10 +- tools/sched_ext/.gitignore | 2 + tools/sched_ext/Makefile | 246 + - tools/sched_ext/README.md | 258 + + tools/sched_ext/README.md | 270 + .../sched_ext/include/bpf-compat/gnu/stubs.h | 11 + - tools/sched_ext/include/scx/common.bpf.h | 401 + + tools/sched_ext/include/scx/common.bpf.h | 412 + tools/sched_ext/include/scx/common.h | 75 + tools/sched_ext/include/scx/compat.bpf.h | 28 + tools/sched_ext/include/scx/compat.h | 186 + tools/sched_ext/include/scx/user_exit_info.h | 111 + tools/sched_ext/scx_central.bpf.c | 361 + tools/sched_ext/scx_central.c | 135 + - tools/sched_ext/scx_qmap.bpf.c | 706 ++ - tools/sched_ext/scx_qmap.c | 144 + + tools/sched_ext/scx_flatcg.bpf.c | 949 +++ + tools/sched_ext/scx_flatcg.c | 233 + + tools/sched_ext/scx_flatcg.h | 51 + + tools/sched_ext/scx_qmap.bpf.c | 813 ++ + tools/sched_ext/scx_qmap.c | 153 + tools/sched_ext/scx_show_state.py | 39 + tools/sched_ext/scx_simple.bpf.c | 156 + tools/sched_ext/scx_simple.c | 107 + @@ -70,7 +74,7 @@ Signed-off-by: Peter Jung .../selftests/sched_ext/hotplug_test.h | 15 + .../sched_ext/init_enable_count.bpf.c | 53 + .../selftests/sched_ext/init_enable_count.c | 166 + - .../testing/selftests/sched_ext/maximal.bpf.c | 132 + + .../testing/selftests/sched_ext/maximal.bpf.c | 164 + tools/testing/selftests/sched_ext/maximal.c | 51 + .../selftests/sched_ext/maybe_null.bpf.c | 36 + .../testing/selftests/sched_ext/maybe_null.c | 49 + @@ -78,7 +82,7 @@ Signed-off-by: Peter Jung 
.../sched_ext/maybe_null_fail_yld.bpf.c | 28 + .../testing/selftests/sched_ext/minimal.bpf.c | 21 + tools/testing/selftests/sched_ext/minimal.c | 58 + - .../selftests/sched_ext/prog_run.bpf.c | 32 + + .../selftests/sched_ext/prog_run.bpf.c | 33 + tools/testing/selftests/sched_ext/prog_run.c | 78 + .../testing/selftests/sched_ext/reload_loop.c | 75 + tools/testing/selftests/sched_ext/runner.c | 201 + @@ -98,7 +102,7 @@ Signed-off-by: Peter Jung .../selftests/sched_ext/test_example.c | 49 + tools/testing/selftests/sched_ext/util.c | 71 + tools/testing/selftests/sched_ext/util.h | 13 + - 94 files changed, 13835 insertions(+), 96 deletions(-) + 97 files changed, 16063 insertions(+), 126 deletions(-) create mode 100644 Documentation/scheduler/sched-ext.rst create mode 100644 include/linux/sched/ext.h create mode 100644 include/trace/events/sched_ext.h @@ -115,6 +119,9 @@ Signed-off-by: Peter Jung create mode 100644 tools/sched_ext/include/scx/user_exit_info.h create mode 100644 tools/sched_ext/scx_central.bpf.c create mode 100644 tools/sched_ext/scx_central.c + create mode 100644 tools/sched_ext/scx_flatcg.bpf.c + create mode 100644 tools/sched_ext/scx_flatcg.c + create mode 100644 tools/sched_ext/scx_flatcg.h create mode 100644 tools/sched_ext/scx_qmap.bpf.c create mode 100644 tools/sched_ext/scx_qmap.c create mode 100644 tools/sched_ext/scx_show_state.py @@ -507,10 +514,10 @@ index 000000000000..a707d2181a77 +possible, they are subject to change without warning between kernel +versions. diff --git a/MAINTAINERS b/MAINTAINERS -index 42decde38320..8d93e2d8ce99 100644 +index 3b5c5c42eb03..827105d4441e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -20296,6 +20296,19 @@ F: include/linux/wait.h +@@ -20352,6 +20352,19 @@ F: include/linux/wait.h F: include/uapi/linux/sched.h F: kernel/sched/ @@ -543,7 +550,7 @@ index 14f8f00fdcf9..930b04e3d148 100644 NULL, /* T */ NULL, /* U */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index ad6afc5c4918..06c532f201fb 100644 +index 1ae44793132a..19ec49a9179b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -133,6 +133,7 @@ @@ -601,10 +608,10 @@ index f8d150343d42..5b4f78fe379d 100644 #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h new file mode 100644 -index 000000000000..593d2f4909dd +index 000000000000..76166d3b14fc --- /dev/null +++ b/include/linux/sched/ext.h -@@ -0,0 +1,206 @@ +@@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst @@ -713,25 +720,31 @@ index 000000000000..593d2f4909dd + * mechanism. See scx_kf_allow(). 
+ */ +enum scx_kf_mask { -+ SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ -+ /* all non-sleepables may be nested inside SLEEPABLE */ -+ SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */ ++ SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ + /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ -+ SCX_KF_CPU_RELEASE = 1 << 1, /* ops.cpu_release() */ ++ SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ + /* ops.dequeue (in REST) may be nested inside DISPATCH */ -+ SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */ -+ SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */ -+ SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */ -+ SCX_KF_REST = 1 << 5, /* other rq-locked operations */ ++ SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ ++ SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ ++ SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ ++ SCX_KF_REST = 1 << 4, /* other rq-locked operations */ + + __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, + __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, +}; + ++enum scx_dsq_lnode_flags { ++ SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0, ++ ++ /* high 16 bits can be for iter cursor flags */ ++ __SCX_DSQ_LNODE_PRIV_SHIFT = 16, ++}; ++ +struct scx_dsq_list_node { + struct list_head node; -+ bool is_bpf_iter_cursor; ++ u32 flags; ++ u32 priv; /* can be used by iter cursor */ +}; + +/* @@ -788,15 +801,19 @@ index 000000000000..593d2f4909dd + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * -+ * If set from ops.init_task() and the task's policy is already -+ * %SCHED_EXT, which can happen while the BPF scheduler is being loaded -+ * or by inhering the parent's policy during fork, the task's policy is -+ * rejected and forcefully reverted to %SCHED_NORMAL. The number of -+ * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected. ++ * Can be set from ops.init_task() while the BPF scheduler is being ++ * loaded (!scx_init_task_args->fork). If set and the task's policy is ++ * already %SCHED_EXT, the task's policy is rejected and forcefully ++ * reverted to %SCHED_NORMAL. The number of such events are reported ++ * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag ++ * during fork is not allowed. + */ + bool disallow; /* reject switching into SCX */ + + /* cold fields */ ++#ifdef CONFIG_EXT_GROUP_SCHED ++ struct cgroup *cgrp_moving_from; ++#endif + /* must be the last field, see init_scx_entity() */ + struct list_head tasks_node; +}; @@ -812,7 +829,7 @@ index 000000000000..593d2f4909dd +#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* _LINUX_SCHED_EXT_H */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index d362aacf9f89..4df2f9055587 100644 +index d362aacf9f89..0f2aeb37bbb0 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); @@ -825,6 +842,18 @@ index d362aacf9f89..4df2f9055587 100644 extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); +@@ -119,6 +120,11 @@ static inline struct task_struct *get_task_struct(struct task_struct *t) + return t; + } + ++static inline struct task_struct *tryget_task_struct(struct task_struct *t) ++{ ++ return refcount_inc_not_zero(&t->usage) ? 
t : NULL; ++} ++ + extern void __put_task_struct(struct task_struct *t); + extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); + diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h new file mode 100644 index 000000000000..fe19da7315a9 @@ -875,6 +904,37 @@ index 3bac0a8ceab2..359a14cc76a4 100644 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 +diff --git a/init/Kconfig b/init/Kconfig +index 08a0d51afaae..e1a88d48d652 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1028,9 +1028,13 @@ menuconfig CGROUP_SCHED + tasks. + + if CGROUP_SCHED ++config GROUP_SCHED_WEIGHT ++ def_bool n ++ + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED ++ select GROUP_SCHED_WEIGHT + default CGROUP_SCHED + + config CFS_BANDWIDTH +@@ -1055,6 +1059,12 @@ config RT_GROUP_SCHED + realtime bandwidth for them. + See Documentation/scheduler/sched-rt-group.rst for more information. + ++config EXT_GROUP_SCHED ++ bool ++ depends on SCHED_CLASS_EXT && CGROUP_SCHED ++ select GROUP_SCHED_WEIGHT ++ default y ++ + endif #CGROUP_SCHED + + config SCHED_MM_CID diff --git a/init/init_task.c b/init/init_task.c index eeb110c65fe2..e222722e790b 100644 --- a/init/init_task.c @@ -906,10 +966,10 @@ index eeb110c65fe2..e222722e790b 100644 .ptraced = LIST_HEAD_INIT(init_task.ptraced), .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a821..f3d140c3acc1 100644 +index c2f1fd95a821..fe782cd77388 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt -@@ -133,4 +133,28 @@ config SCHED_CORE +@@ -133,4 +133,29 @@ config SCHED_CORE which is the likely usage by Linux distributions, there should be no measurable impact on performance. @@ -917,6 +977,7 @@ index c2f1fd95a821..f3d140c3acc1 100644 +config SCHED_CLASS_EXT + bool "Extensible Scheduling Class" + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF ++ select STACKTRACE if STACKTRACE_SUPPORT + help + This option enables a new scheduler class sched_ext (SCX), which + allows scheduling policies to be implemented as BPF programs to @@ -940,7 +1001,7 @@ index c2f1fd95a821..f3d140c3acc1 100644 + Documentation/scheduler/sched-ext.rst + https://github.com/sched-ext/scx diff --git a/kernel/fork.c b/kernel/fork.c -index cc760491f201..41771bde2ce7 100644 +index 238695afc630..69a0a7210060 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ @@ -951,7 +1012,7 @@ index cc760491f201..41771bde2ce7 100644 #include #include #include -@@ -969,6 +970,7 @@ void __put_task_struct(struct task_struct *tsk) +@@ -973,6 +974,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); @@ -959,7 +1020,7 @@ index cc760491f201..41771bde2ce7 100644 io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); -@@ -2347,7 +2349,7 @@ __latent_entropy struct task_struct *copy_process( +@@ -2355,7 +2357,7 @@ __latent_entropy struct task_struct *copy_process( retval = perf_event_init_task(p, clone_flags); if (retval) @@ -968,7 +1029,7 @@ index cc760491f201..41771bde2ce7 100644 retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; -@@ -2480,7 +2482,9 @@ __latent_entropy struct task_struct *copy_process( +@@ -2488,7 +2490,9 @@ __latent_entropy struct task_struct *copy_process( * cgroup specific, it unconditionally needs to place the task on a * runqueue. 
*/ @@ -979,7 +1040,7 @@ index cc760491f201..41771bde2ce7 100644 /* * From this point on we must avoid any synchronous user-space -@@ -2526,13 +2530,13 @@ __latent_entropy struct task_struct *copy_process( +@@ -2534,13 +2538,13 @@ __latent_entropy struct task_struct *copy_process( /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; @@ -995,7 +1056,7 @@ index cc760491f201..41771bde2ce7 100644 } /* No more failure paths after this point. */ -@@ -2606,10 +2610,11 @@ __latent_entropy struct task_struct *copy_process( +@@ -2614,10 +2618,11 @@ __latent_entropy struct task_struct *copy_process( return p; @@ -1008,7 +1069,7 @@ index cc760491f201..41771bde2ce7 100644 cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { -@@ -2648,6 +2653,8 @@ __latent_entropy struct task_struct *copy_process( +@@ -2656,6 +2661,8 @@ __latent_entropy struct task_struct *copy_process( audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); @@ -1057,7 +1118,7 @@ index 39c315182b35..fae1f5c921eb 100644 + #include "syscalls.c" diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index a9f655025607..22f86d5e9231 100644 +index f3951e4a55e5..c792a6feb7a9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p) @@ -1096,10 +1157,10 @@ index a9f655025607..22f86d5e9231 100644 + * involuntary preemption. For SCX, ask. */ - if (rq->nr_running > 1) -+ if (!scx_switched_all() && rq->nr_running > 1) ++ if (scx_enabled() && !scx_can_stop_tick(rq)) + return false; + -+ if (scx_enabled() && !scx_can_stop_tick(rq)) ++ if (rq->cfs.nr_running > 1) return false; /* @@ -1132,6 +1193,24 @@ index a9f655025607..22f86d5e9231 100644 /* * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, * use the balance_callback list if you want balancing. +@@ -2289,7 +2311,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) + static inline bool is_cpu_allowed(struct task_struct *p, int cpu) + { + /* When not in the task's cpumask, no point in looking further. */ +- if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ if (!task_allowed_on_cpu(p, cpu)) + return false; + + /* migrate_disabled() must be allowed to finish. */ +@@ -2298,7 +2320,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) + + /* Non kernel threads are not allowed during either online or offline. */ + if (!(p->flags & PF_KTHREAD)) +- return cpu_active(cpu) && task_cpu_possible(cpu, p); ++ return cpu_active(cpu); + + /* KTHREAD_IS_PER_CPU is always allowed. */ + if (kthread_is_per_cpu(p)) @@ -3775,6 +3797,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) @@ -1230,10 +1309,11 @@ index a9f655025607..22f86d5e9231 100644 #endif } -@@ -5773,7 +5827,19 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, +@@ -5772,8 +5826,19 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) + static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - #ifdef CONFIG_SMP +-#ifdef CONFIG_SMP + const struct sched_class *start_class = prev->sched_class; const struct sched_class *class; + @@ -1243,23 +1323,28 @@ index a9f655025607..22f86d5e9231 100644 + * when waking up from SCHED_IDLE. If @start_class is below SCX, start + * from SCX instead. 
+ */ -+ if (sched_class_above(&ext_sched_class, start_class)) ++ if (scx_enabled() && sched_class_above(&ext_sched_class, start_class)) + start_class = &ext_sched_class; +#endif + /* * We must do the balancing pass before put_prev_task(), such * that when we release the rq->lock the task is in the same -@@ -5782,7 +5848,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, +@@ -5782,11 +5847,10 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * We can terminate the balance pass as soon as we know there is * a runnable task of @class priority or higher. */ - for_class_range(class, prev->sched_class, &idle_sched_class) { +- if (class->balance(rq, prev, rf)) + for_active_class_range(class, start_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) ++ if (class->balance && class->balance(rq, prev, rf)) break; } -@@ -5800,6 +5866,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +-#endif + + put_prev_task(rq, prev); + } +@@ -5800,6 +5864,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; struct task_struct *p; @@ -1269,7 +1354,7 @@ index a9f655025607..22f86d5e9231 100644 /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a -@@ -5840,10 +5909,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +@@ -5840,10 +5907,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (prev->dl_server) prev->dl_server = NULL; @@ -1287,7 +1372,7 @@ index a9f655025607..22f86d5e9231 100644 } BUG(); /* The idle class should always have a runnable task. */ -@@ -5873,7 +5947,7 @@ static inline struct task_struct *pick_task(struct rq *rq) +@@ -5873,7 +5945,7 @@ static inline struct task_struct *pick_task(struct rq *rq) const struct sched_class *class; struct task_struct *p; @@ -1296,7 +1381,7 @@ index a9f655025607..22f86d5e9231 100644 p = class->pick_task(rq); if (p) return p; -@@ -6870,6 +6944,10 @@ void __setscheduler_prio(struct task_struct *p, int prio) +@@ -6870,6 +6942,10 @@ void __setscheduler_prio(struct task_struct *p, int prio) p->sched_class = &dl_sched_class; else if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -1307,7 +1392,7 @@ index a9f655025607..22f86d5e9231 100644 else p->sched_class = &fair_sched_class; -@@ -7015,6 +7093,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) +@@ -7015,6 +7091,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) } __setscheduler_prio(p, prio); @@ -1315,7 +1400,7 @@ index a9f655025607..22f86d5e9231 100644 if (queued) enqueue_task(rq, p, queue_flag); -@@ -7429,6 +7508,7 @@ void sched_show_task(struct task_struct *p) +@@ -7429,6 +7506,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); @@ -1323,7 +1408,7 @@ index a9f655025607..22f86d5e9231 100644 show_stack(p, NULL, KERN_INFO); put_task_stack(p); } -@@ -7921,6 +8001,8 @@ int sched_cpu_activate(unsigned int cpu) +@@ -7957,6 +8035,8 @@ int sched_cpu_activate(unsigned int cpu) cpuset_cpu_active(); } @@ -1332,16 +1417,16 @@ index a9f655025607..22f86d5e9231 100644 /* * Put the rq online, if not already. 
This happens: * -@@ -7981,6 +8063,8 @@ int sched_cpu_deactivate(unsigned int cpu) - } - rq_unlock_irqrestore(rq, &rf); +@@ -8006,6 +8086,8 @@ int sched_cpu_deactivate(unsigned int cpu) + + sched_set_rq_offline(rq, cpu); + scx_rq_deactivate(rq); + - #ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. -@@ -8164,11 +8248,15 @@ void __init sched_init(void) + */ +@@ -8190,11 +8272,15 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ @@ -1361,7 +1446,17 @@ index a9f655025607..22f86d5e9231 100644 #endif wait_bit_init(); -@@ -8337,6 +8425,7 @@ void __init sched_init(void) +@@ -8218,6 +8304,9 @@ void __init sched_init(void) + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); + #endif /* CONFIG_FAIR_GROUP_SCHED */ ++#ifdef CONFIG_EXT_GROUP_SCHED ++ root_task_group.scx_weight = CGROUP_WEIGHT_DFL; ++#endif /* CONFIG_EXT_GROUP_SCHED */ + #ifdef CONFIG_RT_GROUP_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +@@ -8363,6 +8452,7 @@ void __init sched_init(void) balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); @@ -1369,15 +1464,181 @@ index a9f655025607..22f86d5e9231 100644 psi_init(); -@@ -9534,29 +9623,27 @@ static int cpu_local_stat_show(struct seq_file *sf, +@@ -8648,6 +8738,7 @@ struct task_group *sched_create_group(struct task_group *parent) + if (!alloc_rt_sched_group(tg, parent)) + goto err; + ++ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); + alloc_uclamp_sched_group(tg, parent); + + return tg; +@@ -8775,6 +8866,7 @@ void sched_move_task(struct task_struct *tsk) + put_prev_task(rq, tsk); + + sched_change_group(tsk, group); ++ scx_move_task(tsk); + + if (queued) + enqueue_task(rq, tsk, queue_flags); +@@ -8789,11 +8881,6 @@ void sched_move_task(struct task_struct *tsk) + } } - #ifdef CONFIG_FAIR_GROUP_SCHED +-static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +-{ +- return css ? 
container_of(css, struct task_group, css) : NULL; +-} +- + static struct cgroup_subsys_state * + cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) + { +@@ -8817,6 +8904,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) + { + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css->parent); ++ int ret; + ++ ret = scx_tg_online(tg); ++ if (ret) ++ return ret; + + if (parent) + sched_online_group(tg, parent); +@@ -8831,6 +8923,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) + return 0; + } + ++static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ scx_tg_offline(tg); ++} ++ + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) + { + struct task_group *tg = css_tg(css); +@@ -8848,9 +8947,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) + sched_unregister_group(tg); + } + +-#ifdef CONFIG_RT_GROUP_SCHED + static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) + { ++#ifdef CONFIG_RT_GROUP_SCHED + struct task_struct *task; + struct cgroup_subsys_state *css; + +@@ -8858,9 +8957,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) + if (!sched_rt_can_attach(css_tg(css), task)) + return -EINVAL; + } +- return 0; +-} + #endif ++ return scx_cgroup_can_attach(tset); ++} + + static void cpu_cgroup_attach(struct cgroup_taskset *tset) + { +@@ -8869,6 +8968,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) + + cgroup_taskset_for_each(task, css, tset) + sched_move_task(task); ++ ++ scx_cgroup_finish_attach(); ++} ++ ++static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) ++{ ++ scx_cgroup_cancel_attach(tset); + } + + #ifdef CONFIG_UCLAMP_TASK_GROUP +@@ -9045,22 +9151,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) + } + #endif /* CONFIG_UCLAMP_TASK_GROUP */ + ++#ifdef CONFIG_GROUP_SCHED_WEIGHT +static unsigned long tg_weight(struct task_group *tg) +{ + #ifdef CONFIG_FAIR_GROUP_SCHED + return scale_load_down(tg->shares); ++#else ++ return sched_weight_from_cgroup(tg->scx_weight); ++#endif +} ++ + static int cpu_shares_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 shareval) + { ++ int ret; ++ + if (shareval > scale_load_down(ULONG_MAX)) + shareval = MAX_SHARES; +- return sched_group_set_shares(css_tg(css), scale_load(shareval)); ++ ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); ++ if (!ret) ++ scx_group_set_weight(css_tg(css), ++ sched_weight_to_cgroup(shareval)); ++ return ret; + } + + static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) + { +- struct task_group *tg = css_tg(css); +- +- return (u64) scale_load_down(tg->shares); ++ return tg_weight(css_tg(css)); + } ++#endif /* CONFIG_GROUP_SCHED_WEIGHT */ + + #ifdef CONFIG_CFS_BANDWIDTH + static DEFINE_MUTEX(cfs_constraints_mutex); +@@ -9406,7 +9526,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) + return 0; + } + #endif /* CONFIG_CFS_BANDWIDTH */ +-#endif /* CONFIG_FAIR_GROUP_SCHED */ + + #ifdef CONFIG_RT_GROUP_SCHED + static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, +@@ -9434,7 +9553,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + } + #endif /* CONFIG_RT_GROUP_SCHED */ + +-#ifdef CONFIG_FAIR_GROUP_SCHED ++#ifdef CONFIG_GROUP_SCHED_WEIGHT + static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) + { +@@ -9444,12 +9563,17 @@ static s64 
cpu_idle_read_s64(struct cgroup_subsys_state *css, + static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 idle) + { +- return sched_group_set_idle(css_tg(css), idle); ++ int ret; ++ ++ ret = sched_group_set_idle(css_tg(css), idle); ++ if (!ret) ++ scx_group_set_idle(css_tg(css), idle); ++ return ret; + } + #endif + + static struct cftype cpu_legacy_files[] = { +-#ifdef CONFIG_FAIR_GROUP_SCHED ++#ifdef CONFIG_GROUP_SCHED_WEIGHT + { + .name = "shares", + .read_u64 = cpu_shares_read_u64, +@@ -9559,38 +9683,35 @@ static int cpu_local_stat_show(struct seq_file *sf, + return 0; + } + +-#ifdef CONFIG_FAIR_GROUP_SCHED ++#ifdef CONFIG_GROUP_SCHED_WEIGHT + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) @@ -1402,6 +1663,7 @@ index a9f655025607..22f86d5e9231 100644 - */ - if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + unsigned long weight; ++ int ret; + + if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; @@ -1409,9 +1671,13 @@ index a9f655025607..22f86d5e9231 100644 - weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); +- return sched_group_set_shares(css_tg(css), scale_load(weight)); ++ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); ++ if (!ret) ++ scx_group_set_weight(css_tg(css), cgrp_weight); ++ return ret; } -@@ -9564,7 +9651,7 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css, + static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -1420,7 +1686,58 @@ index a9f655025607..22f86d5e9231 100644 int last_delta = INT_MAX; int prio, delta; -@@ -10305,3 +10392,38 @@ void sched_mm_cid_fork(struct task_struct *t) +@@ -9609,7 +9730,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 nice) + { + unsigned long weight; +- int idx; ++ int idx, ret; + + if (nice < MIN_NICE || nice > MAX_NICE) + return -ERANGE; +@@ -9618,9 +9739,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, + idx = array_index_nospec(idx, 40); + weight = sched_prio_to_weight[idx]; + +- return sched_group_set_shares(css_tg(css), scale_load(weight)); ++ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); ++ if (!ret) ++ scx_group_set_weight(css_tg(css), ++ sched_weight_to_cgroup(weight)); ++ return ret; + } +-#endif ++#endif /* CONFIG_GROUP_SCHED_WEIGHT */ + + static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, + long period, long quota) +@@ -9680,7 +9805,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, + #endif + + static struct cftype cpu_files[] = { +-#ifdef CONFIG_FAIR_GROUP_SCHED ++#ifdef CONFIG_GROUP_SCHED_WEIGHT + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, +@@ -9734,14 +9859,14 @@ static struct cftype cpu_files[] = { + struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, ++ .css_offline = cpu_cgroup_css_offline, + .css_released = cpu_cgroup_css_released, + .css_free = cpu_cgroup_css_free, + .css_extra_stat_show = cpu_extra_stat_show, + .css_local_stat_show = cpu_local_stat_show, +-#ifdef CONFIG_RT_GROUP_SCHED + .can_attach = cpu_cgroup_can_attach, +-#endif + .attach = cpu_cgroup_attach, ++ .cancel_attach = cpu_cgroup_cancel_attach, + .legacy_cftypes = cpu_legacy_files, + .dfl_cftypes = cpu_files, + .early_init = true, +@@ 
-10331,3 +10456,38 @@ void sched_mm_cid_fork(struct task_struct *t) t->mm_cid_active = 1; } #endif @@ -1564,10 +1881,10 @@ index c1eb9a1afd13..c057ef46c5f8 100644 diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c new file mode 100644 -index 000000000000..da9cac6b6cc2 +index 000000000000..4509fd38d92b --- /dev/null +++ b/kernel/sched/ext.c -@@ -0,0 +1,6537 @@ +@@ -0,0 +1,7213 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst @@ -1686,10 +2003,16 @@ index 000000000000..da9cac6b6cc2 + */ + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + ++ /* ++ * CPU cgroup support flags ++ */ ++ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ ++ + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | -+ SCX_OPS_SWITCH_PARTIAL, ++ SCX_OPS_SWITCH_PARTIAL | ++ SCX_OPS_HAS_CGROUP_WEIGHT, +}; + +/* argument container for ops.init_task() */ @@ -1699,6 +2022,10 @@ index 000000000000..da9cac6b6cc2 + * to the scheduler transition path. + */ + bool fork; ++#ifdef CONFIG_EXT_GROUP_SCHED ++ /* the cgroup the task is joining */ ++ struct cgroup *cgroup; ++#endif +}; + +/* argument container for ops.exit_task() */ @@ -1707,6 +2034,12 @@ index 000000000000..da9cac6b6cc2 + bool cancelled; +}; + ++/* argument container for ops->cgroup_init() */ ++struct scx_cgroup_init_args { ++ /* the weight of the cgroup [1..10000] */ ++ u32 weight; ++}; ++ +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, @@ -2071,6 +2404,79 @@ index 000000000000..da9cac6b6cc2 + */ + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); + ++#ifdef CONFIG_EXT_GROUP_SCHED ++ /** ++ * cgroup_init - Initialize a cgroup ++ * @cgrp: cgroup being initialized ++ * @args: init arguments, see the struct definition ++ * ++ * Either the BPF scheduler is being loaded or @cgrp created, initialize ++ * @cgrp for sched_ext. This operation may block. ++ * ++ * Return 0 for success, -errno for failure. An error return while ++ * loading will abort loading of the BPF scheduler. During cgroup ++ * creation, it will abort the specific cgroup creation. ++ */ ++ s32 (*cgroup_init)(struct cgroup *cgrp, ++ struct scx_cgroup_init_args *args); ++ ++ /** ++ * cgroup_exit - Exit a cgroup ++ * @cgrp: cgroup being exited ++ * ++ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit ++ * @cgrp for sched_ext. This operation my block. ++ */ ++ void (*cgroup_exit)(struct cgroup *cgrp); ++ ++ /** ++ * cgroup_prep_move - Prepare a task to be moved to a different cgroup ++ * @p: task being moved ++ * @from: cgroup @p is being moved from ++ * @to: cgroup @p is being moved to ++ * ++ * Prepare @p for move from cgroup @from to @to. This operation may ++ * block and can be used for allocations. ++ * ++ * Return 0 for success, -errno for failure. An error return aborts the ++ * migration. ++ */ ++ s32 (*cgroup_prep_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * cgroup_move - Commit cgroup move ++ * @p: task being moved ++ * @from: cgroup @p is being moved from ++ * @to: cgroup @p is being moved to ++ * ++ * Commit the move. @p is dequeued during this operation. 
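++ *
++	 * Purely illustrative sketch (not part of this interface): a BPF
++	 * scheduler which keeps one DSQ per cgroup could implement the move
++	 * as re-targeting the task's DSQ, with any allocation done in
++	 * ops.cgroup_prep_move() and undone in ops.cgroup_cancel_move():
++	 *
++	 *	void BPF_STRUCT_OPS(sketch_cgroup_move, struct task_struct *p,
++	 *			    struct cgroup *from, struct cgroup *to)
++	 *	{
++	 *		struct task_ctx *tctx = lookup_task_ctx(p);
++	 *
++	 *		if (tctx)
++	 *			tctx->dsq_id = to->kn->id;
++	 *	}
++	 *
++	 * BPF_STRUCT_OPS, task_ctx and lookup_task_ctx() above are assumed
++	 * scheduler-side helpers, not definitions from this file.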
++ */ ++ void (*cgroup_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * cgroup_cancel_move - Cancel cgroup move ++ * @p: task whose cgroup move is being canceled ++ * @from: cgroup @p was being moved from ++ * @to: cgroup @p was being moved to ++ * ++ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). ++ * Undo the preparation. ++ */ ++ void (*cgroup_cancel_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * cgroup_set_weight - A cgroup's weight is being changed ++ * @cgrp: cgroup whose weight is being updated ++ * @weight: new weight [1..10000] ++ * ++ * Update @tg's weight to @weight. ++ */ ++ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); ++#endif /* CONFIG_CGROUPS */ ++ + /* + * All online ops must come before ops.cpu_online(). + */ @@ -2256,6 +2662,11 @@ index 000000000000..da9cac6b6cc2 + SCX_KICK_WAIT = 1LLU << 2, +}; + ++enum scx_tg_flags { ++ SCX_TG_ONLINE = 1U << 0, ++ SCX_TG_INITED = 1U << 1, ++}; ++ +enum scx_ops_enable_state { + SCX_OPS_PREPPING, + SCX_OPS_ENABLING, @@ -2344,7 +2755,7 @@ index 000000000000..da9cac6b6cc2 +static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + -+struct static_key_false scx_has_op[SCX_OPI_END] = ++static struct static_key_false scx_has_op[SCX_OPI_END] = + { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; + +static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); @@ -2447,7 +2858,7 @@ index 000000000000..da9cac6b6cc2 + struct scx_bstr_buf buf; +}; + -+struct scx_dump_data scx_dump_data = { ++static struct scx_dump_data scx_dump_data = { + .cpu = -1, +}; + @@ -2599,16 +3010,12 @@ index 000000000000..da9cac6b6cc2 + return false; + } + -+ if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) { -+ scx_ops_error("sleepable kfunc called from non-sleepable context"); -+ return false; -+ } -+ + /* + * Enforce nesting boundaries. e.g. A kfunc which can be called from + * DISPATCH must not be called if we're running DEQUEUE which is nested -+ * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE -+ * boundary thanks to the above in_interrupt() check. ++ * inside ops.dispatch(). We don't need to check boundaries for any ++ * blocking kfuncs as the verifier ensures they're only called from ++ * sleepable progs. 
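++	 *
++	 * For example, a kfunc limited to the dispatch path guards itself
++	 * with (the pattern used by the kfuncs later in this file):
++	 *
++	 *	if (!scx_kf_allowed(SCX_KF_DISPATCH))
++	 *		return;
++	 *
++	 * and is rejected here when e.g. ops.dequeue() runs nested inside
++	 * ops.dispatch().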
+ */ + if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && + (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { @@ -2641,6 +3048,11 @@ index 000000000000..da9cac6b6cc2 + return true; +} + ++static bool scx_kf_allowed_if_unlocked(void) ++{ ++ return !current->scx.kf_mask; ++} ++ +/** + * nldsq_next_task - Iterate to the next task in a non-local DSQ + * @dsq: user dsq being interated @@ -2674,7 +3086,7 @@ index 000000000000..da9cac6b6cc2 + + dsq_lnode = container_of(list_node, struct scx_dsq_list_node, + node); -+ } while (dsq_lnode->is_bpf_iter_cursor); ++ } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); + + return container_of(dsq_lnode, struct task_struct, scx.dsq_list); +} @@ -2692,16 +3104,22 @@ index 000000000000..da9cac6b6cc2 + */ +enum scx_dsq_iter_flags { + /* iterate in the reverse dispatch order */ -+ SCX_DSQ_ITER_REV = 1U << 0, ++ SCX_DSQ_ITER_REV = 1U << 16, + -+ __SCX_DSQ_ITER_ALL_FLAGS = SCX_DSQ_ITER_REV, ++ __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, ++ __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, ++ ++ __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, ++ __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | ++ __SCX_DSQ_ITER_HAS_SLICE | ++ __SCX_DSQ_ITER_HAS_VTIME, +}; + +struct bpf_iter_scx_dsq_kern { + struct scx_dsq_list_node cursor; + struct scx_dispatch_q *dsq; -+ u32 dsq_seq; -+ u32 flags; ++ u64 slice; ++ u64 vtime; +} __attribute__((aligned(8))); + +struct bpf_iter_scx_dsq { @@ -2739,6 +3157,9 @@ index 000000000000..da9cac6b6cc2 +{ + lockdep_assert_held(&scx_tasks_lock); + ++ BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ++ ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); ++ + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; + list_add(&iter->cursor.tasks_node, &scx_tasks); + iter->locked = NULL; @@ -2817,17 +3238,37 @@ index 000000000000..da9cac6b6cc2 + * whether they would like to filter out dead tasks. See scx_task_iter_init() + * for details. + */ -+static struct task_struct * -+scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead) ++static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) +{ + struct task_struct *p; -+retry: ++ + scx_task_iter_rq_unlock(iter); + + while ((p = scx_task_iter_next(iter))) { + /* -+ * is_idle_task() tests %PF_IDLE which may not be set for CPUs -+ * which haven't yet been onlined. Test sched_class directly. ++ * scx_task_iter is used to prepare and move tasks into SCX ++ * while loading the BPF scheduler and vice-versa while ++ * unloading. The init_tasks ("swappers") should be excluded ++ * from the iteration because: ++ * ++ * - It's unsafe to use __setschduler_prio() on an init_task to ++ * determine the sched_class to use as it won't preserve its ++ * idle_sched_class. ++ * ++ * - ops.init/exit_task() can easily be confused if called with ++ * init_tasks as they, e.g., share PID 0. ++ * ++ * As init_tasks are never scheduled through SCX, they can be ++ * skipped safely. Note that is_idle_task() which tests %PF_IDLE ++ * doesn't work here: ++ * ++ * - %PF_IDLE may not be set for an init_task whose CPU hasn't ++ * yet been onlined. ++ * ++ * - %PF_IDLE can be set on tasks that are not init_tasks. See ++ * play_idle_precise() used by CONFIG_IDLE_INJECT. ++ * ++ * Test for idle_sched_class as only init_tasks are on it. 
+ */ + if (p->sched_class != &idle_sched_class) + break; @@ -2838,16 +3279,6 @@ index 000000000000..da9cac6b6cc2 + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked = p; + -+ /* -+ * If we see %TASK_DEAD, @p already disabled preemption, is about to do -+ * the final __schedule(), won't ever need to be scheduled again and can -+ * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter -+ * the final __schedle() while we're locking its rq and thus will stay -+ * alive until the rq is unlocked. -+ */ -+ if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD) -+ goto retry; -+ + return p; +} + @@ -2870,9 +3301,9 @@ index 000000000000..da9cac6b6cc2 + return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); +} + -+static bool scx_ops_bypassing(void) ++static bool scx_rq_bypassing(struct rq *rq) +{ -+ return unlikely(atomic_read(&scx_ops_bypass_depth)); ++ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +} + +/** @@ -3006,13 +3437,18 @@ index 000000000000..da9cac6b6cc2 + */ +static void touch_core_sched(struct rq *rq, struct task_struct *p) +{ ++ lockdep_assert_rq_held(rq); ++ +#ifdef CONFIG_SCHED_CORE + /* + * It's okay to update the timestamp spuriously. Use + * sched_core_disabled() which is cheaper than enabled(). ++ * ++ * As this is used to determine ordering between tasks of sibling CPUs, ++ * it may be better to use per-core dispatch sequence instead. + */ + if (!sched_core_disabled()) -+ p->scx.core_sched_at = rq_clock_task(rq); ++ p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq)); +#endif +} + @@ -3029,7 +3465,6 @@ index 000000000000..da9cac6b6cc2 +static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); -+ assert_clock_updated(rq); + +#ifdef CONFIG_SCHED_CORE + if (SCX_HAS_OP(core_sched_before)) @@ -3040,20 +3475,14 @@ index 000000000000..da9cac6b6cc2 +static void update_curr_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; -+ u64 now = rq_clock_task(rq); -+ u64 delta_exec; ++ s64 delta_exec; + -+ if (time_before_eq64(now, curr->se.exec_start)) ++ delta_exec = update_curr_common(rq); ++ if (unlikely(delta_exec <= 0)) + return; + -+ delta_exec = now - curr->se.exec_start; -+ curr->se.exec_start = now; -+ curr->se.sum_exec_runtime += delta_exec; -+ account_group_exec_runtime(curr, delta_exec); -+ cgroup_account_cputime(curr, delta_exec); -+ + if (curr->scx.slice != SCX_SLICE_INF) { -+ curr->scx.slice -= min(curr->scx.slice, delta_exec); ++ curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec); + if (!curr->scx.slice) + touch_core_sched(rq, curr); + } @@ -3194,6 +3623,8 @@ index 000000000000..da9cac6b6cc2 +static void task_unlink_from_dsq(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ ++ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); ++ + if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { + rb_erase(&p->scx.dsq_priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_priq); @@ -3201,6 +3632,7 @@ index 000000000000..da9cac6b6cc2 + } + + list_del_init(&p->scx.dsq_list.node); ++ dsq_mod_nr(dsq, -1); +} + +static void dispatch_dequeue(struct rq *rq, struct task_struct *p) @@ -3237,9 +3669,7 @@ index 000000000000..da9cac6b6cc2 + */ + if (p->scx.holding_cpu < 0) { + /* @p must still be on @dsq, dequeue */ -+ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); + task_unlink_from_dsq(p, dsq); -+ dsq_mod_nr(dsq, -1); + } else { + /* + * We're racing against dispatch_to_local_dsq() which already @@ -3279,6 +3709,15 @@ index 000000000000..da9cac6b6cc2 + if (dsq_id == SCX_DSQ_LOCAL) + return 
&rq->scx.local_dsq; + ++ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { ++ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; ++ ++ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) ++ return &scx_dsq_global; ++ ++ return &cpu_rq(cpu)->scx.local_dsq; ++ } ++ + dsq = find_non_local_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", @@ -3322,8 +3761,8 @@ index 000000000000..da9cac6b6cc2 +static void direct_dispatch(struct task_struct *p, u64 enq_flags) +{ + struct rq *rq = task_rq(p); -+ struct scx_dispatch_q *dsq; -+ u64 dsq_id = p->scx.ddsp_dsq_id; ++ struct scx_dispatch_q *dsq = ++ find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); + + touch_core_sched_dispatch(rq, p); + @@ -3335,15 +3774,9 @@ index 000000000000..da9cac6b6cc2 + * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer + * the enqueue so that it's executed when @rq can be unlocked. + */ -+ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; ++ if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { + unsigned long opss; + -+ if (cpu == cpu_of(rq)) { -+ dsq_id = SCX_DSQ_LOCAL; -+ goto dispatch; -+ } -+ + opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; + + switch (opss & SCX_OPSS_STATE_MASK) { @@ -3370,14 +3803,19 @@ index 000000000000..da9cac6b6cc2 + return; + } + -+dispatch: -+ dsq = find_dsq_for_dispatch(rq, dsq_id, p); + dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); +} + +static bool scx_rq_online(struct rq *rq) +{ -+ return likely(rq->scx.flags & SCX_RQ_ONLINE); ++ /* ++ * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates ++ * the online state as seen from the BPF scheduler. cpu_active() test ++ * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will ++ * stay set until the current scheduling operation is complete even if ++ * we aren't locking @rq. 
++ */ ++ return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); +} + +static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, @@ -3400,7 +3838,7 @@ index 000000000000..da9cac6b6cc2 + if (!scx_rq_online(rq)) + goto local; + -+ if (scx_ops_bypassing()) { ++ if (scx_rq_bypassing(rq)) { + if (enq_flags & SCX_ENQ_LAST) + goto local; + else @@ -3527,7 +3965,7 @@ index 000000000000..da9cac6b6cc2 + rq->scx.nr_running++; + add_nr_running(rq, 1); + -+ if (SCX_HAS_OP(runnable)) ++ if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + + if (enq_flags & SCX_ENQ_WAKEUP) @@ -3611,7 +4049,7 @@ index 000000000000..da9cac6b6cc2 + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); + } + -+ if (SCX_HAS_OP(quiescent)) ++ if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); + + if (deq_flags & SCX_DEQ_SLEEP) @@ -3646,193 +4084,172 @@ index 000000000000..da9cac6b6cc2 + return false; +} + ++static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, ++ struct scx_dispatch_q *src_dsq, ++ struct rq *dst_rq) ++{ ++ struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; ++ ++ /* @dsq is locked and @p is on @dst_rq */ ++ lockdep_assert_held(&src_dsq->lock); ++ lockdep_assert_rq_held(dst_rq); ++ ++ WARN_ON_ONCE(p->scx.holding_cpu >= 0); ++ ++ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) ++ list_add(&p->scx.dsq_list.node, &dst_dsq->list); ++ else ++ list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); ++ ++ dsq_mod_nr(dst_dsq, 1); ++ p->scx.dsq = dst_dsq; ++} ++ +#ifdef CONFIG_SMP +/** -+ * move_task_to_local_dsq - Move a task from a different rq to a local DSQ -+ * @rq: rq to move the task into, currently locked ++ * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ + * @p: task to move + * @enq_flags: %SCX_ENQ_* ++ * @src_rq: rq to move the task from, locked on entry, released on return ++ * @dst_rq: rq to move the task into, locked on return + * -+ * Move @p which is currently on a different rq to @rq's local DSQ. The caller -+ * must: -+ * -+ * 1. Start with exclusive access to @p either through its DSQ lock or -+ * %SCX_OPSS_DISPATCHING flag. -+ * -+ * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). -+ * -+ * 3. Remember task_rq(@p). Release the exclusive access so that we don't -+ * deadlock with dequeue. -+ * -+ * 4. Lock @rq and the task_rq from #3. -+ * -+ * 5. Call this function. -+ * -+ * Returns %true if @p was successfully moved. %false after racing dequeue and -+ * losing. ++ * Move @p which is currently on @src_rq to @dst_rq's local DSQ. + */ -+static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, -+ u64 enq_flags) ++static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, ++ struct rq *src_rq, struct rq *dst_rq) +{ -+ struct rq *task_rq; ++ lockdep_assert_rq_held(src_rq); + -+ lockdep_assert_rq_held(rq); ++ /* the following marks @p MIGRATING which excludes dequeue */ ++ deactivate_task(src_rq, p, 0); ++ set_task_cpu(p, cpu_of(dst_rq)); ++ p->scx.sticky_cpu = cpu_of(dst_rq); + -+ /* -+ * If dequeue got to @p while we were trying to lock both rq's, it'd -+ * have cleared @p->scx.holding_cpu to -1. While other cpus may have -+ * updated it to different values afterwards, as this operation can't be -+ * preempted or recurse, @p->scx.holding_cpu can never become -+ * raw_smp_processor_id() again before we're done. 
Thus, we can tell -+ * whether we lost to dequeue by testing whether @p->scx.holding_cpu is -+ * still raw_smp_processor_id(). -+ * -+ * See dispatch_dequeue() for the counterpart. -+ */ -+ if (unlikely(p->scx.holding_cpu != raw_smp_processor_id())) -+ return false; -+ -+ /* @p->rq couldn't have changed if we're still the holding cpu */ -+ task_rq = task_rq(p); -+ lockdep_assert_rq_held(task_rq); -+ -+ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)); -+ deactivate_task(task_rq, p, 0); -+ set_task_cpu(p, cpu_of(rq)); -+ p->scx.sticky_cpu = cpu_of(rq); ++ raw_spin_rq_unlock(src_rq); ++ raw_spin_rq_lock(dst_rq); + + /* + * We want to pass scx-specific enq_flags but activate_task() will + * truncate the upper 32 bit. As we own @rq, we can pass them through + * @rq->scx.extra_enq_flags instead. + */ -+ WARN_ON_ONCE(rq->scx.extra_enq_flags); -+ rq->scx.extra_enq_flags = enq_flags; -+ activate_task(rq, p, 0); -+ rq->scx.extra_enq_flags = 0; -+ -+ return true; ++ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); ++ WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); ++ dst_rq->scx.extra_enq_flags = enq_flags; ++ activate_task(dst_rq, p, 0); ++ dst_rq->scx.extra_enq_flags = 0; +} + -+/** -+ * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked -+ * @rq: current rq which is locked -+ * @src_rq: rq to move task from -+ * @dst_rq: rq to move task to -+ * -+ * We're holding @rq lock and trying to dispatch a task from @src_rq to -+ * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether -+ * @rq stays locked isn't important as long as the state is restored after -+ * dispatch_to_local_dsq_unlock(). -+ */ -+static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq *src_rq, -+ struct rq *dst_rq) -+{ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(rq); -+ raw_spin_rq_lock(dst_rq); -+ } else if (rq == src_rq) { -+ double_lock_balance(rq, dst_rq); -+ } else if (rq == dst_rq) { -+ double_lock_balance(rq, src_rq); -+ } else { -+ raw_spin_rq_unlock(rq); -+ double_rq_lock(src_rq, dst_rq); -+ } -+} -+ -+/** -+ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() -+ * @rq: current rq which is locked -+ * @src_rq: rq to move task from -+ * @dst_rq: rq to move task to -+ * -+ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. -+ */ -+static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq *src_rq, -+ struct rq *dst_rq) -+{ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(dst_rq); -+ raw_spin_rq_lock(rq); -+ } else if (rq == src_rq) { -+ double_unlock_balance(rq, dst_rq); -+ } else if (rq == dst_rq) { -+ double_unlock_balance(rq, src_rq); -+ } else { -+ double_rq_unlock(src_rq, dst_rq); -+ raw_spin_rq_lock(rq); -+ } -+} -+#endif /* CONFIG_SMP */ -+ -+static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, -+ struct task_struct *p) -+{ -+ lockdep_assert_held(&dsq->lock); /* released on return */ -+ -+ /* @dsq is locked and @p is on this rq */ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ task_unlink_from_dsq(p, dsq); -+ list_add_tail(&p->scx.dsq_list.node, &rq->scx.local_dsq.list); -+ dsq_mod_nr(dsq, -1); -+ dsq_mod_nr(&rq->scx.local_dsq, 1); -+ p->scx.dsq = &rq->scx.local_dsq; -+ raw_spin_unlock(&dsq->lock); -+} -+ -+#ifdef CONFIG_SMP +/* -+ * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p -+ * can be pulled to @rq. ++ * Similar to kernel/sched/core.c::is_cpu_allowed(). 
However, there are two ++ * differences: ++ * ++ * - is_cpu_allowed() asks "Can this task run on this CPU?" while ++ * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to ++ * this CPU?". ++ * ++ * While migration is disabled, is_cpu_allowed() has to say "yes" as the task ++ * must be allowed to finish on the CPU that it's currently on regardless of ++ * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the ++ * BPF scheduler shouldn't attempt to migrate a task which has migration ++ * disabled. ++ * ++ * - The BPF scheduler is bypassed while the rq is offline and we can always say ++ * no to the BPF scheduler initiated migrations while offline. + */ -+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) ++static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, ++ bool trigger_error) +{ + int cpu = cpu_of(rq); + -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ /* ++ * We don't require the BPF scheduler to avoid dispatching to offline ++ * CPUs mostly for convenience but also because CPUs can go offline ++ * between scx_bpf_dispatch() calls and here. Trigger error iff the ++ * picked CPU is outside the allowed mask. ++ */ ++ if (!task_allowed_on_cpu(p, cpu)) { ++ if (trigger_error) ++ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", ++ cpu_of(rq), p->comm, p->pid); + return false; ++ } ++ + if (unlikely(is_migration_disabled(p))) + return false; -+ if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p))) -+ return false; ++ + if (!scx_rq_online(rq)) + return false; ++ + return true; +} + -+static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq, -+ struct task_struct *p, struct rq *task_rq) ++/** ++ * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq ++ * @p: target task ++ * @dsq: locked DSQ @p is currently on ++ * @src_rq: rq @p is currently on, stable with @dsq locked ++ * ++ * Called with @dsq locked but no rq's locked. We want to move @p to a different ++ * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is ++ * required when transferring into a local DSQ. Even when transferring into a ++ * non-local DSQ, it's better to use the same mechanism to protect against ++ * dequeues and maintain the invariant that @p->scx.dsq can only change while ++ * @src_rq is locked, which e.g. scx_dump_task() depends on. ++ * ++ * We want to grab @src_rq but that can deadlock if we try while locking @dsq, ++ * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As ++ * this may race with dequeue, which can't drop the rq lock or fail, do a little ++ * dancing from our side. ++ * ++ * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets ++ * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu ++ * would be cleared to -1. While other cpus may have updated it to different ++ * values afterwards, as this operation can't be preempted or recurse, the ++ * holding_cpu can never become this CPU again before we're done. Thus, we can ++ * tell whether we lost to dequeue by testing whether the holding_cpu still ++ * points to this CPU. See dispatch_dequeue() for the counterpart. ++ * ++ * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is ++ * still valid. %false if lost to dequeue. 
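++ *
++ * The expected calling pattern (see consume_remote_task() below for the
++ * caller, error/success handling elided) is:
++ *
++ *	raw_spin_rq_unlock(this_rq);
++ *	if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
++ *		move_remote_task_to_local_dsq(p, 0, src_rq, this_rq);
++ *	} else {
++ *		raw_spin_rq_unlock(src_rq);
++ *		raw_spin_rq_lock(this_rq);
++ *	}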
++ */ ++static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, ++ struct scx_dispatch_q *dsq, ++ struct rq *src_rq) +{ -+ bool moved = false; ++ s32 cpu = raw_smp_processor_id(); + -+ lockdep_assert_held(&dsq->lock); /* released on return */ ++ lockdep_assert_held(&dsq->lock); + -+ /* -+ * @dsq is locked and @p is on a remote rq. @p is currently protected by -+ * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab -+ * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the -+ * rq lock or fail, do a little dancing from our side. See -+ * move_task_to_local_dsq(). -+ */ + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + task_unlink_from_dsq(p, dsq); -+ dsq_mod_nr(dsq, -1); -+ p->scx.holding_cpu = raw_smp_processor_id(); ++ p->scx.holding_cpu = cpu; ++ + raw_spin_unlock(&dsq->lock); ++ raw_spin_rq_lock(src_rq); + -+ double_lock_balance(rq, task_rq); ++ /* task_rq couldn't have changed if we're still the holding cpu */ ++ return likely(p->scx.holding_cpu == cpu) && ++ !WARN_ON_ONCE(src_rq != task_rq(p)); ++} + -+ moved = move_task_to_local_dsq(rq, p, 0); ++static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, ++ struct scx_dispatch_q *dsq, struct rq *src_rq) ++{ ++ raw_spin_rq_unlock(this_rq); + -+ double_unlock_balance(rq, task_rq); -+ -+ return moved; ++ if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { ++ move_remote_task_to_local_dsq(p, 0, src_rq, this_rq); ++ return true; ++ } else { ++ raw_spin_rq_unlock(src_rq); ++ raw_spin_rq_lock(this_rq); ++ return false; ++ } +} +#else /* CONFIG_SMP */ -+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; } -+static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq, -+ struct task_struct *p, struct rq *task_rq) { return false; } ++static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } ++static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } +#endif /* CONFIG_SMP */ + +static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) @@ -3853,12 +4270,14 @@ index 000000000000..da9cac6b6cc2 + struct rq *task_rq = task_rq(p); + + if (rq == task_rq) { -+ consume_local_task(rq, dsq, p); ++ task_unlink_from_dsq(p, dsq); ++ move_local_task_to_local_dsq(p, 0, dsq, rq); ++ raw_spin_unlock(&dsq->lock); + return true; + } + -+ if (task_can_run_on_remote_rq(p, rq)) { -+ if (likely(consume_remote_task(rq, dsq, p, task_rq))) ++ if (task_can_run_on_remote_rq(p, rq, false)) { ++ if (likely(consume_remote_task(rq, p, dsq, task_rq))) + return true; + goto retry; + } @@ -3868,122 +4287,94 @@ index 000000000000..da9cac6b6cc2 + return false; +} + -+enum dispatch_to_local_dsq_ret { -+ DTL_DISPATCHED, /* successfully dispatched */ -+ DTL_LOST, /* lost race to dequeue */ -+ DTL_NOT_LOCAL, /* destination is not a local DSQ */ -+ DTL_INVALID, /* invalid local dsq_id */ -+}; -+ +/** + * dispatch_to_local_dsq - Dispatch a task to a local dsq + * @rq: current rq which is locked -+ * @dsq_id: destination dsq ID ++ * @dst_dsq: destination DSQ + * @p: task to dispatch + * @enq_flags: %SCX_ENQ_* + * -+ * We're holding @rq lock and want to dispatch @p to the local DSQ identified by -+ * @dsq_id. This function performs all the synchronization dancing needed -+ * because local DSQs are protected with rq locks. ++ * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local ++ * DSQ. 
This function performs all the synchronization dancing needed because ++ * local DSQs are protected with rq locks. + * + * The caller must have exclusive ownership of @p (e.g. through + * %SCX_OPSS_DISPATCHING). + */ -+static enum dispatch_to_local_dsq_ret -+dispatch_to_local_dsq(struct rq *rq, u64 dsq_id, struct task_struct *p, -+ u64 enq_flags) ++static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, ++ struct task_struct *p, u64 enq_flags) +{ + struct rq *src_rq = task_rq(p); -+ struct rq *dst_rq; ++ struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); + + /* + * We're synchronized against dequeue through DISPATCHING. As @p can't + * be dequeued, its task_rq and cpus_allowed are stable too. ++ * ++ * If dispatching to @rq that @p is already on, no lock dancing needed. + */ -+ if (dsq_id == SCX_DSQ_LOCAL) { -+ dst_rq = rq; -+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) -+ return DTL_INVALID; -+ dst_rq = cpu_rq(cpu); -+ } else { -+ return DTL_NOT_LOCAL; -+ } -+ -+ /* if dispatching to @rq that @p is already on, no lock dancing needed */ + if (rq == src_rq && rq == dst_rq) { -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags | SCX_ENQ_CLEAR_OPSS); -+ return DTL_DISPATCHED; ++ dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); ++ return; + } + +#ifdef CONFIG_SMP -+ if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { -+ struct rq *locked_dst_rq = dst_rq; -+ bool dsp; ++ if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { ++ dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS); ++ return; ++ } + ++ /* ++ * @p is on a possibly remote @src_rq which we need to lock to move the ++ * task. If dequeue is in progress, it'd be locking @src_rq and waiting ++ * on DISPATCHING, so we can't grab @src_rq lock while holding ++ * DISPATCHING. ++ * ++ * As DISPATCHING guarantees that @p is wholly ours, we can pretend that ++ * we're moving from a DSQ and use the same mechanism - mark the task ++ * under transfer with holding_cpu, release DISPATCHING and then follow ++ * the same protocol. See unlink_dsq_and_lock_src_rq(). ++ */ ++ p->scx.holding_cpu = raw_smp_processor_id(); ++ ++ /* store_release ensures that dequeue sees the above */ ++ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); ++ ++ /* switch to @src_rq lock */ ++ if (rq != src_rq) { ++ raw_spin_rq_unlock(rq); ++ raw_spin_rq_lock(src_rq); ++ } ++ ++ /* task_rq couldn't have changed if we're still the holding cpu */ ++ if (likely(p->scx.holding_cpu == raw_smp_processor_id()) && ++ !WARN_ON_ONCE(src_rq != task_rq(p))) { + /* -+ * @p is on a possibly remote @src_rq which we need to lock to -+ * move the task. If dequeue is in progress, it'd be locking -+ * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq -+ * lock while holding DISPATCHING. -+ * -+ * As DISPATCHING guarantees that @p is wholly ours, we can -+ * pretend that we're moving from a DSQ and use the same -+ * mechanism - mark the task under transfer with holding_cpu, -+ * release DISPATCHING and then follow the same protocol. ++ * If @p is staying on the same rq, there's no need to go ++ * through the full deactivate/activate cycle. Optimize by ++ * abbreviating move_remote_task_to_local_dsq(). 
+ */ -+ p->scx.holding_cpu = raw_smp_processor_id(); -+ -+ /* store_release ensures that dequeue sees the above */ -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ -+ dispatch_to_local_dsq_lock(rq, src_rq, locked_dst_rq); -+ -+ /* -+ * We don't require the BPF scheduler to avoid dispatching to -+ * offline CPUs mostly for convenience but also because CPUs can -+ * go offline between scx_bpf_dispatch() calls and here. If @p -+ * is destined to an offline CPU, queue it on its current CPU -+ * instead, which should always be safe. As this is an allowed -+ * behavior, don't trigger an ops error. -+ */ -+ if (!scx_rq_online(dst_rq)) -+ dst_rq = src_rq; -+ + if (src_rq == dst_rq) { -+ /* -+ * As @p is staying on the same rq, there's no need to -+ * go through the full deactivate/activate cycle. -+ * Optimize by abbreviating the operations in -+ * move_task_to_local_dsq(). -+ */ -+ dsp = p->scx.holding_cpu == raw_smp_processor_id(); -+ if (likely(dsp)) { -+ p->scx.holding_cpu = -1; -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags); -+ } ++ p->scx.holding_cpu = -1; ++ dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags); + } else { -+ dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); ++ move_remote_task_to_local_dsq(p, enq_flags, ++ src_rq, dst_rq); + } + + /* if the destination CPU is idle, wake it up */ -+ if (dsp && sched_class_above(p->sched_class, -+ dst_rq->curr->sched_class)) ++ if (sched_class_above(p->sched_class, dst_rq->curr->sched_class)) + resched_curr(dst_rq); -+ -+ dispatch_to_local_dsq_unlock(rq, src_rq, locked_dst_rq); -+ -+ return dsp ? DTL_DISPATCHED : DTL_LOST; + } -+#endif /* CONFIG_SMP */ + -+ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", -+ cpu_of(dst_rq), p->comm, p->pid); -+ return DTL_INVALID; ++ /* switch back to @rq lock */ ++ if (rq != dst_rq) { ++ raw_spin_rq_unlock(dst_rq); ++ raw_spin_rq_lock(rq); ++ } ++#else /* CONFIG_SMP */ ++ BUG(); /* control can not reach here on UP */ ++#endif /* CONFIG_SMP */ +} + +/** @@ -4058,20 +4449,12 @@ index 000000000000..da9cac6b6cc2 + + BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); + -+ switch (dispatch_to_local_dsq(rq, dsq_id, p, enq_flags)) { -+ case DTL_DISPATCHED: -+ break; -+ case DTL_LOST: -+ break; -+ case DTL_INVALID: -+ dsq_id = SCX_DSQ_GLOBAL; -+ fallthrough; -+ case DTL_NOT_LOCAL: -+ dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), -+ dsq_id, p); ++ dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p); ++ ++ if (dsq->id == SCX_DSQ_LOCAL) ++ dispatch_to_local_dsq(rq, dsq, p, enq_flags); ++ else + dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); -+ break; -+ } +} + +static void flush_dispatch_buf(struct rq *rq) @@ -4133,7 +4516,7 @@ index 000000000000..da9cac6b6cc2 + * same conditions later and pick @rq->curr accordingly. 
+ */ + if ((prev->scx.flags & SCX_TASK_QUEUED) && -+ prev->scx.slice && !scx_ops_bypassing()) { ++ prev->scx.slice && !scx_rq_bypassing(rq)) { + if (local) + prev->scx.flags |= SCX_TASK_BAL_KEEP; + goto has_tasks; @@ -4147,7 +4530,7 @@ index 000000000000..da9cac6b6cc2 + if (consume_dispatch_q(rq, &scx_dsq_global)) + goto has_tasks; + -+ if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) ++ if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) + goto out; + + dspc->rq = rq; @@ -4196,7 +4579,6 @@ index 000000000000..da9cac6b6cc2 + return has_tasks; +} + -+#ifdef CONFIG_SMP +static int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ @@ -4230,7 +4612,31 @@ index 000000000000..da9cac6b6cc2 + + return ret; +} -+#endif ++ ++static void process_ddsp_deferred_locals(struct rq *rq) ++{ ++ struct task_struct *p; ++ ++ lockdep_assert_rq_held(rq); ++ ++ /* ++ * Now that @rq can be unlocked, execute the deferred enqueueing of ++ * tasks directly dispatched to the local DSQs of other CPUs. See ++ * direct_dispatch(). Keep popping from the head instead of using ++ * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq ++ * temporarily. ++ */ ++ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, ++ struct task_struct, scx.dsq_list.node))) { ++ struct scx_dispatch_q *dsq; ++ ++ list_del_init(&p->scx.dsq_list.node); ++ ++ dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); ++ if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) ++ dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags); ++ } ++} + +static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) +{ @@ -4274,221 +4680,6 @@ index 000000000000..da9cac6b6cc2 + } +} + -+static void process_ddsp_deferred_locals(struct rq *rq) -+{ -+ struct task_struct *p, *tmp; -+ -+ lockdep_assert_rq_held(rq); -+ -+ /* -+ * Now that @rq can be unlocked, execute the deferred enqueueing of -+ * tasks directly dispatched to the local DSQs of other CPUs. See -+ * direct_dispatch(). -+ */ -+ list_for_each_entry_safe(p, tmp, &rq->scx.ddsp_deferred_locals, -+ scx.dsq_list.node) { -+ s32 ret; -+ -+ list_del_init(&p->scx.dsq_list.node); -+ -+ ret = dispatch_to_local_dsq(rq, p->scx.ddsp_dsq_id, p, -+ p->scx.ddsp_enq_flags); -+ WARN_ON_ONCE(ret == DTL_NOT_LOCAL); -+ } -+} -+ -+static void put_prev_task_scx(struct rq *rq, struct task_struct *p) -+{ -+#ifndef CONFIG_SMP -+ /* -+ * UP workaround. -+ * -+ * Because SCX may transfer tasks across CPUs during dispatch, dispatch -+ * is performed from its balance operation which isn't called in UP. -+ * Let's work around by calling it from the operations which come right -+ * after. -+ * -+ * 1. If the prev task is on SCX, pick_next_task() calls -+ * .put_prev_task() right after. As .put_prev_task() is also called -+ * from other places, we need to distinguish the calls which can be -+ * done by looking at the previous task's state - if still queued or -+ * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). -+ * This case is handled here. -+ * -+ * 2. If the prev task is not on SCX, the first following call into SCX -+ * will be .pick_next_task(), which is covered by calling -+ * balance_scx() from pick_next_task_scx(). -+ * -+ * Note that we can't merge the first case into the second as -+ * balance_scx() must be called before the previous SCX task goes -+ * through put_prev_task_scx(). -+ * -+ * @rq is pinned and can't be unlocked. As UP doesn't transfer tasks -+ * around, balance_one() doesn't need to. 
-+ */ -+ if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) -+ balance_one(rq, p, true); -+#endif -+ -+ update_curr_scx(rq); -+ -+ /* see dequeue_task_scx() on why we skip when !QUEUED */ -+ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); -+ -+ /* -+ * If we're being called from put_prev_task_balance(), balance_scx() may -+ * have decided that @p should keep running. -+ */ -+ if (p->scx.flags & SCX_TASK_BAL_KEEP) { -+ p->scx.flags &= ~SCX_TASK_BAL_KEEP; -+ set_task_runnable(rq, p); -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ set_task_runnable(rq, p); -+ -+ /* -+ * If @p has slice left and balance_scx() didn't tag it for -+ * keeping, @p is getting preempted by a higher priority -+ * scheduler class or core-sched forcing a different task. Leave -+ * it at the head of the local DSQ. -+ */ -+ if (p->scx.slice && !scx_ops_bypassing()) { -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ /* -+ * If we're in the pick_next_task path, balance_scx() should -+ * have already populated the local DSQ if there are any other -+ * available tasks. If empty, tell ops.enqueue() that @p is the -+ * only one available for this cpu. ops.enqueue() should put it -+ * on the local DSQ so that the subsequent pick_next_task_scx() -+ * can find the task unless it wants to trigger a separate -+ * follow-up scheduling event. -+ */ -+ if (list_empty(&rq->scx.local_dsq.list)) -+ do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); -+ else -+ do_enqueue_task(rq, p, 0, -1); -+ } -+} -+ -+static struct task_struct *first_local_task(struct rq *rq) -+{ -+ return list_first_entry_or_null(&rq->scx.local_dsq.list, -+ struct task_struct, scx.dsq_list.node); -+} -+ -+static struct task_struct *pick_next_task_scx(struct rq *rq) -+{ -+ struct task_struct *p; -+ -+#ifndef CONFIG_SMP -+ /* UP workaround - see the comment at the head of put_prev_task_scx() */ -+ if (unlikely(rq->curr->sched_class != &ext_sched_class)) -+ balance_one(rq, rq->curr, true); -+#endif -+ -+ p = first_local_task(rq); -+ if (!p) -+ return NULL; -+ -+ set_next_task_scx(rq, p, true); -+ -+ if (unlikely(!p->scx.slice)) { -+ if (!scx_ops_bypassing() && !scx_warned_zero_slice) { -+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", -+ p->comm, p->pid); -+ scx_warned_zero_slice = true; -+ } -+ p->scx.slice = SCX_SLICE_DFL; -+ } -+ -+ return p; -+} -+ -+#ifdef CONFIG_SCHED_CORE -+/** -+ * scx_prio_less - Task ordering for core-sched -+ * @a: task A -+ * @b: task B -+ * -+ * Core-sched is implemented as an additional scheduling layer on top of the -+ * usual sched_class'es and needs to find out the expected task ordering. For -+ * SCX, core-sched calls this function to interrogate the task ordering. -+ * -+ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used -+ * to implement the default task ordering. The older the timestamp, the higher -+ * prority the task - the global FIFO ordering matching the default scheduling -+ * behavior. -+ * -+ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to -+ * implement FIFO ordering within each local DSQ. See pick_task_scx(). -+ */ -+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, -+ bool in_fi) -+{ -+ /* -+ * The const qualifiers are dropped from task_struct pointers when -+ * calling ops.core_sched_before(). 
Accesses are controlled by the -+ * verifier. -+ */ -+ if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing()) -+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, -+ (struct task_struct *)a, -+ (struct task_struct *)b); -+ else -+ return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); -+} -+ -+/** -+ * pick_task_scx - Pick a candidate task for core-sched -+ * @rq: rq to pick the candidate task from -+ * -+ * Core-sched calls this function on each SMT sibling to determine the next -+ * tasks to run on the SMT siblings. balance_one() has been called on all -+ * siblings and put_prev_task_scx() has been called only for the current CPU. -+ * -+ * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look -+ * at the first task in the local dsq. @rq->curr has to be considered explicitly -+ * to mimic %SCX_TASK_BAL_KEEP. -+ */ -+static struct task_struct *pick_task_scx(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ struct task_struct *first = first_local_task(rq); -+ -+ if (curr->scx.flags & SCX_TASK_QUEUED) { -+ /* is curr the only runnable task? */ -+ if (!first) -+ return curr; -+ -+ /* -+ * Does curr trump first? We can always go by core_sched_at for -+ * this comparison as it represents global FIFO ordering when -+ * the default core-sched ordering is used and local-DSQ FIFO -+ * ordering otherwise. -+ * -+ * We can have a task with an earlier timestamp on the DSQ. For -+ * example, when a current task is preempted by a sibling -+ * picking a different cookie, the task would be requeued at the -+ * head of the local DSQ with an earlier timestamp than the -+ * core-sched picked next task. Besides, the BPF scheduler may -+ * dispatch any tasks to the local DSQ anytime. -+ */ -+ if (curr->scx.slice && time_before64(curr->scx.core_sched_at, -+ first->scx.core_sched_at)) -+ return curr; -+ } -+ -+ return first; /* this may be %NULL */ -+} -+#endif /* CONFIG_SCHED_CORE */ -+ +static enum scx_cpu_preempt_reason +preempt_reason_from_class(const struct sched_class *class) +{ @@ -4552,6 +4743,161 @@ index 000000000000..da9cac6b6cc2 + } +} + ++static void put_prev_task_scx(struct rq *rq, struct task_struct *p) ++{ ++ update_curr_scx(rq); ++ ++ /* see dequeue_task_scx() on why we skip when !QUEUED */ ++ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) ++ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); ++ ++ /* ++ * If we're being called from put_prev_task_balance(), balance_scx() may ++ * have decided that @p should keep running. ++ */ ++ if (p->scx.flags & SCX_TASK_BAL_KEEP) { ++ p->scx.flags &= ~SCX_TASK_BAL_KEEP; ++ set_task_runnable(rq, p); ++ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); ++ return; ++ } ++ ++ if (p->scx.flags & SCX_TASK_QUEUED) { ++ set_task_runnable(rq, p); ++ ++ /* ++ * If @p has slice left and balance_scx() didn't tag it for ++ * keeping, @p is getting preempted by a higher priority ++ * scheduler class or core-sched forcing a different task. Leave ++ * it at the head of the local DSQ. ++ */ ++ if (p->scx.slice && !scx_rq_bypassing(rq)) { ++ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); ++ return; ++ } ++ ++ /* ++ * If we're in the pick_next_task path, balance_scx() should ++ * have already populated the local DSQ if there are any other ++ * available tasks. If empty, tell ops.enqueue() that @p is the ++ * only one available for this cpu. 
ops.enqueue() should put it ++ * on the local DSQ so that the subsequent pick_next_task_scx() ++ * can find the task unless it wants to trigger a separate ++ * follow-up scheduling event. ++ */ ++ if (list_empty(&rq->scx.local_dsq.list)) ++ do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); ++ else ++ do_enqueue_task(rq, p, 0, -1); ++ } ++} ++ ++static struct task_struct *first_local_task(struct rq *rq) ++{ ++ return list_first_entry_or_null(&rq->scx.local_dsq.list, ++ struct task_struct, scx.dsq_list.node); ++} ++ ++static struct task_struct *pick_next_task_scx(struct rq *rq) ++{ ++ struct task_struct *p; ++ ++ p = first_local_task(rq); ++ if (!p) ++ return NULL; ++ ++ set_next_task_scx(rq, p, true); ++ ++ if (unlikely(!p->scx.slice)) { ++ if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { ++ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", ++ p->comm, p->pid); ++ scx_warned_zero_slice = true; ++ } ++ p->scx.slice = SCX_SLICE_DFL; ++ } ++ ++ return p; ++} ++ ++#ifdef CONFIG_SCHED_CORE ++/** ++ * scx_prio_less - Task ordering for core-sched ++ * @a: task A ++ * @b: task B ++ * ++ * Core-sched is implemented as an additional scheduling layer on top of the ++ * usual sched_class'es and needs to find out the expected task ordering. For ++ * SCX, core-sched calls this function to interrogate the task ordering. ++ * ++ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used ++ * to implement the default task ordering. The older the timestamp, the higher ++ * prority the task - the global FIFO ordering matching the default scheduling ++ * behavior. ++ * ++ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to ++ * implement FIFO ordering within each local DSQ. See pick_task_scx(). ++ */ ++bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, ++ bool in_fi) ++{ ++ /* ++ * The const qualifiers are dropped from task_struct pointers when ++ * calling ops.core_sched_before(). Accesses are controlled by the ++ * verifier. ++ */ ++ if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a))) ++ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, ++ (struct task_struct *)a, ++ (struct task_struct *)b); ++ else ++ return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); ++} ++ ++/** ++ * pick_task_scx - Pick a candidate task for core-sched ++ * @rq: rq to pick the candidate task from ++ * ++ * Core-sched calls this function on each SMT sibling to determine the next ++ * tasks to run on the SMT siblings. balance_one() has been called on all ++ * siblings and put_prev_task_scx() has been called only for the current CPU. ++ * ++ * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look ++ * at the first task in the local dsq. @rq->curr has to be considered explicitly ++ * to mimic %SCX_TASK_BAL_KEEP. ++ */ ++static struct task_struct *pick_task_scx(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ struct task_struct *first = first_local_task(rq); ++ ++ if (curr->scx.flags & SCX_TASK_QUEUED) { ++ /* is curr the only runnable task? */ ++ if (!first) ++ return curr; ++ ++ /* ++ * Does curr trump first? We can always go by core_sched_at for ++ * this comparison as it represents global FIFO ordering when ++ * the default core-sched ordering is used and local-DSQ FIFO ++ * ordering otherwise. ++ * ++ * We can have a task with an earlier timestamp on the DSQ. 
For ++ * example, when a current task is preempted by a sibling ++ * picking a different cookie, the task would be requeued at the ++ * head of the local DSQ with an earlier timestamp than the ++ * core-sched picked next task. Besides, the BPF scheduler may ++ * dispatch any tasks to the local DSQ anytime. ++ */ ++ if (curr->scx.slice && time_before64(curr->scx.core_sched_at, ++ first->scx.core_sched_at)) ++ return curr; ++ } ++ ++ return first; /* this may be %NULL */ ++} ++#endif /* CONFIG_SCHED_CORE */ ++ +#ifdef CONFIG_SMP + +static bool test_and_clear_cpu_idle(int cpu) @@ -4794,9 +5140,9 @@ index 000000000000..da9cac6b6cc2 + atomic_long_inc(&scx_hotplug_seq); + + if (online && SCX_HAS_OP(cpu_online)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu); ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); + else if (!online && SCX_HAS_OP(cpu_offline)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu); ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu); + else + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, @@ -4902,7 +5248,7 @@ index 000000000000..da9cac6b6cc2 + * While disabling, always resched and refresh core-sched timestamp as + * we can't trust the slice management or ops.core_sched_before(). + */ -+ if (scx_ops_bypassing()) { ++ if (scx_rq_bypassing(rq)) { + curr->scx.slice = 0; + touch_core_sched(rq, curr); + } else if (SCX_HAS_OP(tick)) { @@ -4913,6 +5259,28 @@ index 000000000000..da9cac6b6cc2 + resched_curr(rq); +} + ++#ifdef CONFIG_EXT_GROUP_SCHED ++static struct cgroup *tg_cgrp(struct task_group *tg) ++{ ++ /* ++ * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, ++ * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the ++ * root cgroup. ++ */ ++ if (tg && tg->css.cgroup) ++ return tg->css.cgroup; ++ else ++ return &cgrp_dfl_root.cgrp; ++} ++ ++#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), ++ ++#else /* CONFIG_EXT_GROUP_SCHED */ ++ ++#define SCX_INIT_TASK_ARGS_CGROUP(tg) ++ ++#endif /* CONFIG_EXT_GROUP_SCHED */ ++ +static enum scx_task_state scx_get_task_state(const struct task_struct *p) +{ + return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; @@ -4957,10 +5325,11 @@ index 000000000000..da9cac6b6cc2 + + if (SCX_HAS_OP(init_task)) { + struct scx_init_task_args args = { ++ SCX_INIT_TASK_ARGS_CGROUP(tg) + .fork = fork, + }; + -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args); ++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args); + if (unlikely(ret)) { + ret = ops_sanitize_err("init_task", ret); + return ret; @@ -4970,24 +5339,29 @@ index 000000000000..da9cac6b6cc2 + scx_set_task_state(p, SCX_TASK_INIT); + + if (p->scx.disallow) { -+ struct rq *rq; -+ struct rq_flags rf; ++ if (!fork) { ++ struct rq *rq; ++ struct rq_flags rf; + -+ rq = task_rq_lock(p, &rf); ++ rq = task_rq_lock(p, &rf); + -+ /* -+ * We're either in fork or load path and @p->policy will be -+ * applied right after. Reverting @p->policy here and rejecting -+ * %SCHED_EXT transitions from scx_check_setscheduler() -+ * guarantees that if ops.init_task() sets @p->disallow, @p can -+ * never be in SCX. -+ */ -+ if (p->policy == SCHED_EXT) { -+ p->policy = SCHED_NORMAL; -+ atomic_long_inc(&scx_nr_rejected); ++ /* ++ * We're in the load path and @p->policy will be applied ++ * right after. Reverting @p->policy here and rejecting ++ * %SCHED_EXT transitions from scx_check_setscheduler() ++ * guarantees that if ops.init_task() sets @p->disallow, ++ * @p can never be in SCX. 
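++			 *
++			 * On the scheduler side this corresponds to e.g. the
++			 * following illustrative sketch, where BPF_STRUCT_OPS
++			 * and disallow_tgid are assumed scheduler-side names,
++			 * not definitions from this file:
++			 *
++			 *	s32 BPF_STRUCT_OPS(sketch_init_task,
++			 *			   struct task_struct *p,
++			 *			   struct scx_init_task_args *args)
++			 *	{
++			 *		if (!args->fork && p->tgid == disallow_tgid)
++			 *			p->scx.disallow = true;
++			 *		return 0;
++			 *	}
++			 *
++			 * The fork-path case is handled by the branch below.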
++ */ ++ if (p->policy == SCHED_EXT) { ++ p->policy = SCHED_NORMAL; ++ atomic_long_inc(&scx_nr_rejected); ++ } ++ ++ task_rq_unlock(rq, p, &rf); ++ } else if (p->policy == SCHED_EXT) { ++ scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork", ++ p->comm, p->pid); + } -+ -+ task_rq_unlock(rq, p, &rf); + } + + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; @@ -5016,7 +5390,7 @@ index 000000000000..da9cac6b6cc2 + scx_set_task_state(p, SCX_TASK_ENABLED); + + if (SCX_HAS_OP(set_weight)) -+ SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); ++ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); +} + +static void scx_ops_disable_task(struct task_struct *p) @@ -5213,7 +5587,7 @@ index 000000000000..da9cac6b6cc2 +{ + struct task_struct *p = rq->curr; + -+ if (scx_ops_bypassing()) ++ if (scx_rq_bypassing(rq)) + return false; + + if (p->sched_class != &ext_sched_class) @@ -5228,6 +5602,219 @@ index 000000000000..da9cac6b6cc2 +} +#endif + ++#ifdef CONFIG_EXT_GROUP_SCHED ++ ++DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); ++static bool cgroup_warned_missing_weight; ++static bool cgroup_warned_missing_idle; ++ ++static void scx_cgroup_warn_missing_weight(struct task_group *tg) ++{ ++ if (scx_ops_enable_state() == SCX_OPS_DISABLED || ++ cgroup_warned_missing_weight) ++ return; ++ ++ if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) ++ return; ++ ++ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n", ++ scx_ops.name); ++ cgroup_warned_missing_weight = true; ++} ++ ++static void scx_cgroup_warn_missing_idle(struct task_group *tg) ++{ ++ if (scx_ops_enable_state() == SCX_OPS_DISABLED || ++ cgroup_warned_missing_idle) ++ return; ++ ++ if (!tg->idle) ++ return; ++ ++ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n", ++ scx_ops.name); ++ cgroup_warned_missing_idle = true; ++} ++ ++int scx_tg_online(struct task_group *tg) ++{ ++ int ret = 0; ++ ++ WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); ++ ++ percpu_down_read(&scx_cgroup_rwsem); ++ ++ scx_cgroup_warn_missing_weight(tg); ++ ++ if (SCX_HAS_OP(cgroup_init)) { ++ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; ++ ++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, ++ tg->css.cgroup, &args); ++ if (!ret) ++ tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; ++ else ++ ret = ops_sanitize_err("cgroup_init", ret); ++ } else { ++ tg->scx_flags |= SCX_TG_ONLINE; ++ } ++ ++ percpu_up_read(&scx_cgroup_rwsem); ++ return ret; ++} ++ ++void scx_tg_offline(struct task_group *tg) ++{ ++ WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); ++ ++ percpu_down_read(&scx_cgroup_rwsem); ++ ++ if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); ++ tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); ++ ++ percpu_up_read(&scx_cgroup_rwsem); ++} ++ ++int scx_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ struct cgroup_subsys_state *css; ++ struct task_struct *p; ++ int ret; ++ ++ /* released in scx_finish/cancel_attach() */ ++ percpu_down_read(&scx_cgroup_rwsem); ++ ++ if (!scx_enabled()) ++ return 0; ++ ++ cgroup_taskset_for_each(p, css, tset) { ++ struct cgroup *from = tg_cgrp(task_group(p)); ++ struct cgroup *to = tg_cgrp(css_tg(css)); ++ ++ WARN_ON_ONCE(p->scx.cgrp_moving_from); ++ ++ /* ++ * sched_move_task() omits identity migrations. Let's match the ++ * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() ++ * always match one-to-one. 
++ */ ++ if (from == to) ++ continue; ++ ++ if (SCX_HAS_OP(cgroup_prep_move)) { ++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, ++ p, from, css->cgroup); ++ if (ret) ++ goto err; ++ } ++ ++ p->scx.cgrp_moving_from = from; ++ } ++ ++ return 0; ++ ++err: ++ cgroup_taskset_for_each(p, css, tset) { ++ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, ++ p->scx.cgrp_moving_from, css->cgroup); ++ p->scx.cgrp_moving_from = NULL; ++ } ++ ++ percpu_up_read(&scx_cgroup_rwsem); ++ return ops_sanitize_err("cgroup_prep_move", ret); ++} ++ ++void scx_move_task(struct task_struct *p) ++{ ++ if (!scx_enabled()) ++ return; ++ ++ /* ++ * We're called from sched_move_task() which handles both cgroup and ++ * autogroup moves. Ignore the latter. ++ * ++ * Also ignore exiting tasks, because in the exit path tasks transition ++ * from the autogroup to the root group, so task_group_is_autogroup() ++ * alone isn't able to catch exiting autogroup tasks. This is safe for ++ * cgroup_move(), because cgroup migrations never happen for PF_EXITING ++ * tasks. ++ */ ++ if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) ++ return; ++ ++ /* ++ * @p must have ops.cgroup_prep_move() called on it and thus ++ * cgrp_moving_from set. ++ */ ++ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) ++ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, ++ p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); ++ p->scx.cgrp_moving_from = NULL; ++} ++ ++void scx_cgroup_finish_attach(void) ++{ ++ percpu_up_read(&scx_cgroup_rwsem); ++} ++ ++void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) ++{ ++ struct cgroup_subsys_state *css; ++ struct task_struct *p; ++ ++ if (!scx_enabled()) ++ goto out_unlock; ++ ++ cgroup_taskset_for_each(p, css, tset) { ++ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, ++ p->scx.cgrp_moving_from, css->cgroup); ++ p->scx.cgrp_moving_from = NULL; ++ } ++out_unlock: ++ percpu_up_read(&scx_cgroup_rwsem); ++} ++ ++void scx_group_set_weight(struct task_group *tg, unsigned long weight) ++{ ++ percpu_down_read(&scx_cgroup_rwsem); ++ ++ if (tg->scx_weight != weight) { ++ if (SCX_HAS_OP(cgroup_set_weight)) ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, ++ tg_cgrp(tg), weight); ++ tg->scx_weight = weight; ++ } ++ ++ percpu_up_read(&scx_cgroup_rwsem); ++} ++ ++void scx_group_set_idle(struct task_group *tg, bool idle) ++{ ++ percpu_down_read(&scx_cgroup_rwsem); ++ scx_cgroup_warn_missing_idle(tg); ++ percpu_up_read(&scx_cgroup_rwsem); ++} ++ ++static void scx_cgroup_lock(void) ++{ ++ percpu_down_write(&scx_cgroup_rwsem); ++} ++ ++static void scx_cgroup_unlock(void) ++{ ++ percpu_up_write(&scx_cgroup_rwsem); ++} ++ ++#else /* CONFIG_EXT_GROUP_SCHED */ ++ ++static inline void scx_cgroup_lock(void) {} ++static inline void scx_cgroup_unlock(void) {} ++ ++#endif /* CONFIG_EXT_GROUP_SCHED */ ++ +/* + * Omitted operations: + * @@ -5248,6 +5835,7 @@ index 000000000000..da9cac6b6cc2 + + .wakeup_preempt = wakeup_preempt_scx, + ++ .balance = balance_scx, + .pick_next_task = pick_next_task_scx, + + .put_prev_task = put_prev_task_scx, @@ -5256,7 +5844,6 @@ index 000000000000..da9cac6b6cc2 + .switch_class = switch_class_scx, + +#ifdef CONFIG_SMP -+ .balance = balance_scx, + .select_task_rq = select_task_rq_scx, + .task_woken = task_woken_scx, + .set_cpus_allowed = set_cpus_allowed_scx, @@ -5365,6 +5952,96 @@ index 000000000000..da9cac6b6cc2 + 
rcu_read_unlock(); +} + ++#ifdef CONFIG_EXT_GROUP_SCHED ++static void scx_cgroup_exit(void) ++{ ++ struct cgroup_subsys_state *css; ++ ++ percpu_rwsem_assert_held(&scx_cgroup_rwsem); ++ ++ /* ++ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk ++ * cgroups and exit all the inited ones, all online cgroups are exited. ++ */ ++ rcu_read_lock(); ++ css_for_each_descendant_post(css, &root_task_group.css) { ++ struct task_group *tg = css_tg(css); ++ ++ if (!(tg->scx_flags & SCX_TG_INITED)) ++ continue; ++ tg->scx_flags &= ~SCX_TG_INITED; ++ ++ if (!scx_ops.cgroup_exit) ++ continue; ++ ++ if (WARN_ON_ONCE(!css_tryget(css))) ++ continue; ++ rcu_read_unlock(); ++ ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); ++ ++ rcu_read_lock(); ++ css_put(css); ++ } ++ rcu_read_unlock(); ++} ++ ++static int scx_cgroup_init(void) ++{ ++ struct cgroup_subsys_state *css; ++ int ret; ++ ++ percpu_rwsem_assert_held(&scx_cgroup_rwsem); ++ ++ cgroup_warned_missing_weight = false; ++ cgroup_warned_missing_idle = false; ++ ++ /* ++ * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk ++ * cgroups and init, all online cgroups are initialized. ++ */ ++ rcu_read_lock(); ++ css_for_each_descendant_pre(css, &root_task_group.css) { ++ struct task_group *tg = css_tg(css); ++ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; ++ ++ scx_cgroup_warn_missing_weight(tg); ++ scx_cgroup_warn_missing_idle(tg); ++ ++ if ((tg->scx_flags & ++ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) ++ continue; ++ ++ if (!scx_ops.cgroup_init) { ++ tg->scx_flags |= SCX_TG_INITED; ++ continue; ++ } ++ ++ if (WARN_ON_ONCE(!css_tryget(css))) ++ continue; ++ rcu_read_unlock(); ++ ++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, ++ css->cgroup, &args); ++ if (ret) { ++ css_put(css); ++ return ret; ++ } ++ tg->scx_flags |= SCX_TG_INITED; ++ ++ rcu_read_lock(); ++ css_put(css); ++ } ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ ++#else ++static void scx_cgroup_exit(void) {} ++static int scx_cgroup_init(void) { return 0; } ++#endif ++ + +/******************************************************************************** + * Sysfs interface and ops enable/disable. @@ -5508,16 +6185,8 @@ index 000000000000..da9cac6b6cc2 + } + + /* -+ * We need to guarantee that no tasks are on the BPF scheduler while -+ * bypassing. Either we see enabled or the enable path sees the -+ * increased bypass_depth before moving tasks to SCX. -+ */ -+ if (!scx_enabled()) -+ return; -+ -+ /* + * No task property is changing. We just need to make sure all currently -+ * queued tasks are re-queued according to the new scx_ops_bypassing() ++ * queued tasks are re-queued according to the new scx_rq_bypassing() + * state. As an optimization, walk each rq's runnable_list instead of + * the scx_tasks list. + * @@ -5531,6 +6200,24 @@ index 000000000000..da9cac6b6cc2 + + rq_lock_irqsave(rq, &rf); + ++ if (bypass) { ++ WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); ++ rq->scx.flags |= SCX_RQ_BYPASSING; ++ } else { ++ WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING)); ++ rq->scx.flags &= ~SCX_RQ_BYPASSING; ++ } ++ ++ /* ++ * We need to guarantee that no tasks are on the BPF scheduler ++ * while bypassing. Either we see enabled or the enable path ++ * sees scx_rq_bypassing() before moving tasks to SCX. 
++ */ ++ if (!scx_enabled()) { ++ rq_unlock_irqrestore(rq, &rf); ++ continue; ++ } ++ + /* + * The use of list_for_each_entry_safe_reverse() is required + * because each task is going to be removed from and added back @@ -5586,11 +6273,11 @@ index 000000000000..da9cac6b6cc2 +{ + switch (kind) { + case SCX_EXIT_UNREG: -+ return "Scheduler unregistered from user space"; ++ return "unregistered from user space"; + case SCX_EXIT_UNREG_BPF: -+ return "Scheduler unregistered from BPF"; ++ return "unregistered from BPF"; + case SCX_EXIT_UNREG_KERN: -+ return "Scheduler unregistered from the main kernel"; ++ return "unregistered from the main kernel"; + case SCX_EXIT_SYSRQ: + return "disabled by sysrq-S"; + case SCX_EXIT_ERROR: @@ -5656,39 +6343,32 @@ index 000000000000..da9cac6b6cc2 + WRITE_ONCE(scx_switching_all, false); + + /* -+ * Avoid racing against fork. See scx_ops_enable() for explanation on -+ * the locking order. ++ * Avoid racing against fork and cgroup changes. See scx_ops_enable() ++ * for explanation on the locking order. + */ + percpu_down_write(&scx_fork_rwsem); + cpus_read_lock(); ++ scx_cgroup_lock(); + + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_init(&sti); + /* -+ * Invoke scx_ops_exit_task() on all non-idle tasks, including -+ * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount, -+ * we may not have invoked sched_ext_free() on them by the time a -+ * scheduler is disabled. We must therefore exit the task here, or we'd -+ * fail to invoke ops.exit_task(), as the scheduler will have been -+ * unloaded by the time the task is subsequently exited on the -+ * sched_ext_free() path. ++ * The BPF scheduler is going away. All tasks including %TASK_DEAD ones ++ * must be switched out and exited synchronously. + */ -+ while ((p = scx_task_iter_next_locked(&sti, true))) { ++ while ((p = scx_task_iter_next_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + struct sched_enq_and_set_ctx ctx; + -+ if (READ_ONCE(p->__state) != TASK_DEAD) { -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, -+ &ctx); ++ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + -+ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); -+ __setscheduler_prio(p, p->prio); -+ check_class_changing(task_rq(p), p, old_class); ++ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); ++ __setscheduler_prio(p, p->prio); ++ check_class_changing(task_rq(p), p, old_class); + -+ sched_enq_and_set_task(&ctx); ++ sched_enq_and_set_task(&ctx); + -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ } ++ check_class_changed(task_rq(p), p, old_class, p->prio); + scx_ops_exit_task(p); + } + scx_task_iter_exit(&sti); @@ -5704,18 +6384,23 @@ index 000000000000..da9cac6b6cc2 + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + synchronize_rcu(); + ++ scx_cgroup_exit(); ++ ++ scx_cgroup_unlock(); + cpus_read_unlock(); + percpu_up_write(&scx_fork_rwsem); + + if (ei->kind >= SCX_EXIT_ERROR) { -+ printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); ++ pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", ++ scx_ops.name, ei->reason); + -+ if (ei->msg[0] == '\0') -+ printk(KERN_ERR "sched_ext: %s\n", ei->reason); -+ else -+ printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); ++ if (ei->msg[0] != '\0') ++ pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg); + + stack_trace_print(ei->bt, ei->bt_len, 2); ++ } else { ++ pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", ++ scx_ops.name, ei->reason); + } + 
+ if (scx_ops.exit) @@ -5904,7 +6589,7 @@ index 000000000000..da9cac6b6cc2 + static unsigned long bt[SCX_EXIT_BT_LEN]; + char dsq_id_buf[19] = "(n/a)"; + unsigned long ops_state = atomic_long_read(&p->scx.ops_state); -+ unsigned int bt_len; ++ unsigned int bt_len = 0; + + if (p->scx.dsq) + scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", @@ -5929,7 +6614,9 @@ index 000000000000..da9cac6b6cc2 + ops_dump_exit(); + } + ++#ifdef CONFIG_STACKTRACE + bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); ++#endif + if (bt_len) { + dump_newline(s); + dump_stack_trace(s, " ", bt, bt_len); @@ -6218,7 +6905,7 @@ index 000000000000..da9cac6b6cc2 + cpus_read_lock(); + + if (scx_ops.init) { -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init); ++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); + if (ret) { + ret = ops_sanitize_err("init", ret); + goto err_disable_unlock_cpus; @@ -6256,11 +6943,17 @@ index 000000000000..da9cac6b6cc2 + scx_watchdog_timeout / 2); + + /* -+ * Lock out forks before opening the floodgate so that they don't wander -+ * into the operations prematurely. ++ * Lock out forks, cgroup on/offlining and moves before opening the ++ * floodgate so that they don't wander into the operations prematurely. + * -+ * We don't need to keep the CPUs stable but grab cpus_read_lock() to -+ * ease future locking changes for cgroup suport. ++ * We don't need to keep the CPUs stable but static_branch_*() requires ++ * cpus_read_lock() and scx_cgroup_rwsem must nest inside ++ * cpu_hotplug_lock because of the following dependency chain: ++ * ++ * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem ++ * ++ * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use ++ * static_branch_*_cpuslocked(). + * + * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the + * following dependency chain: @@ -6269,6 +6962,7 @@ index 000000000000..da9cac6b6cc2 + */ + percpu_down_write(&scx_fork_rwsem); + cpus_read_lock(); ++ scx_cgroup_lock(); + + check_hotplug_seq(ops); + @@ -6291,6 +6985,14 @@ index 000000000000..da9cac6b6cc2 + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + } + ++ /* ++ * All cgroups should be initialized before letting in tasks. cgroup ++ * on/offlining and task migrations are already locked out. ++ */ ++ ret = scx_cgroup_init(); ++ if (ret) ++ goto err_disable_unlock_all; ++ + static_branch_enable_cpuslocked(&__scx_ops_enabled); + + /* @@ -6303,8 +7005,15 @@ index 000000000000..da9cac6b6cc2 + spin_lock_irq(&scx_tasks_lock); + + scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_locked(&sti, false))) { -+ get_task_struct(p); ++ while ((p = scx_task_iter_next_locked(&sti))) { ++ /* ++ * @p may already be dead, have lost all its usages counts and ++ * be waiting for RCU grace period before being freed. @p can't ++ * be initialized for SCX in such cases and should be ignored. 
++ */ ++ if (!tryget_task_struct(p)) ++ continue; ++ + scx_task_iter_rq_unlock(&sti); + spin_unlock_irq(&scx_tasks_lock); + @@ -6357,7 +7066,7 @@ index 000000000000..da9cac6b6cc2 + WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); + + scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_locked(&sti, false))) { ++ while ((p = scx_task_iter_next_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + struct sched_enq_and_set_ctx ctx; + @@ -6375,6 +7084,7 @@ index 000000000000..da9cac6b6cc2 + + spin_unlock_irq(&scx_tasks_lock); + preempt_enable(); ++ scx_cgroup_unlock(); + cpus_read_unlock(); + percpu_up_write(&scx_fork_rwsem); + @@ -6388,6 +7098,8 @@ index 000000000000..da9cac6b6cc2 + if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) + static_branch_enable(&__scx_switched_all); + ++ pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", ++ scx_ops.name, scx_switched_all() ? "" : " (partial)"); + kobject_uevent(scx_root_kobj, KOBJ_ADD); + mutex_unlock(&scx_ops_enable_mutex); + @@ -6407,6 +7119,7 @@ index 000000000000..da9cac6b6cc2 + return ret; + +err_disable_unlock_all: ++ scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); +err_disable_unlock_cpus: + cpus_read_unlock(); @@ -6601,6 +7314,11 @@ index 000000000000..da9cac6b6cc2 + + switch (moff) { + case offsetof(struct sched_ext_ops, init_task): ++#ifdef CONFIG_EXT_GROUP_SCHED ++ case offsetof(struct sched_ext_ops, cgroup_init): ++ case offsetof(struct sched_ext_ops, cgroup_exit): ++ case offsetof(struct sched_ext_ops, cgroup_prep_move): ++#endif + case offsetof(struct sched_ext_ops, cpu_online): + case offsetof(struct sched_ext_ops, cpu_offline): + case offsetof(struct sched_ext_ops, init): @@ -6659,6 +7377,7 @@ index 000000000000..da9cac6b6cc2 +static void enqueue_stub(struct task_struct *p, u64 enq_flags) {} +static void dequeue_stub(struct task_struct *p, u64 enq_flags) {} +static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {} ++static void tick_stub(struct task_struct *p) {} +static void runnable_stub(struct task_struct *p, u64 enq_flags) {} +static void running_stub(struct task_struct *p) {} +static void stopping_stub(struct task_struct *p, bool runnable) {} @@ -6674,16 +7393,28 @@ index 000000000000..da9cac6b6cc2 +static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} +static void enable_stub(struct task_struct *p) {} +static void disable_stub(struct task_struct *p) {} ++#ifdef CONFIG_EXT_GROUP_SCHED ++static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } ++static void cgroup_exit_stub(struct cgroup *cgrp) {} ++static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } ++static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} ++static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} ++static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {} ++#endif +static void cpu_online_stub(s32 cpu) {} +static void cpu_offline_stub(s32 cpu) {} +static s32 init_stub(void) { return -EINVAL; } +static void exit_stub(struct scx_exit_info *info) {} ++static void dump_stub(struct scx_dump_ctx *ctx) {} ++static void dump_cpu_stub(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} ++static void dump_task_stub(struct scx_dump_ctx *ctx, struct task_struct *p) {} + +static struct sched_ext_ops __bpf_ops_sched_ext_ops = { + .select_cpu = select_cpu_stub, + .enqueue = 
enqueue_stub, + .dequeue = dequeue_stub, + .dispatch = dispatch_stub, ++ .tick = tick_stub, + .runnable = runnable_stub, + .running = running_stub, + .stopping = stopping_stub, @@ -6699,10 +7430,21 @@ index 000000000000..da9cac6b6cc2 + .exit_task = exit_task_stub, + .enable = enable_stub, + .disable = disable_stub, ++#ifdef CONFIG_EXT_GROUP_SCHED ++ .cgroup_init = cgroup_init_stub, ++ .cgroup_exit = cgroup_exit_stub, ++ .cgroup_prep_move = cgroup_prep_move_stub, ++ .cgroup_move = cgroup_move_stub, ++ .cgroup_cancel_move = cgroup_cancel_move_stub, ++ .cgroup_set_weight = cgroup_set_weight_stub, ++#endif + .cpu_online = cpu_online_stub, + .cpu_offline = cpu_offline_stub, + .init = init_stub, + .exit = exit_stub, ++ .dump = dump_stub, ++ .dump_cpu = dump_cpu_stub, ++ .dump_task = dump_task_stub, +}; + +static struct bpf_struct_ops bpf_sched_ext_ops = { @@ -6945,7 +7687,8 @@ index 000000000000..da9cac6b6cc2 + * definitions so that BPF scheduler implementations can use them + * through the generated vmlinux.h. + */ -+ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT); ++ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | ++ SCX_TG_ONLINE); + + BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); + init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); @@ -6990,38 +7733,6 @@ index 000000000000..da9cac6b6cc2 +__bpf_kfunc_start_defs(); + +/** -+ * scx_bpf_create_dsq - Create a custom DSQ -+ * @dsq_id: DSQ to create -+ * @node: NUMA node to allocate from -+ * -+ * Create a custom DSQ identified by @dsq_id. Can be called from ops.init() and -+ * ops.init_task(). -+ */ -+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) -+{ -+ if (!scx_kf_allowed(SCX_KF_SLEEPABLE)) -+ return -EINVAL; -+ -+ if (unlikely(node >= (int)nr_node_ids || -+ (node < 0 && node != NUMA_NO_NODE))) -+ return -EINVAL; -+ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_sleepable) -+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) -+BTF_KFUNCS_END(scx_kfunc_ids_sleepable) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_sleepable, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously @@ -7111,7 +7822,7 @@ index 000000000000..da9cac6b6cc2 + * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs ++ * @slice: duration @p can run for in nsecs, 0 to keep the current value + * @enq_flags: SCX_ENQ_* + * + * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. 
It is safe @@ -7161,7 +7872,7 @@ index 000000000000..da9cac6b6cc2 + * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs ++ * @slice: duration @p can run for in nsecs, 0 to keep the current value + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * @@ -7202,6 +7913,118 @@ index 000000000000..da9cac6b6cc2 + .set = &scx_kfunc_ids_enqueue_dispatch, +}; + ++static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, ++ struct task_struct *p, u64 dsq_id, ++ u64 enq_flags) ++{ ++ struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; ++ struct rq *this_rq, *src_rq, *dst_rq, *locked_rq; ++ bool dispatched = false; ++ bool in_balance; ++ unsigned long flags; ++ ++ if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) ++ return false; ++ ++ /* ++ * Can be called from either ops.dispatch() locking this_rq() or any ++ * context where no rq lock is held. If latter, lock @p's task_rq which ++ * we'll likely need anyway. ++ */ ++ src_rq = task_rq(p); ++ ++ local_irq_save(flags); ++ this_rq = this_rq(); ++ in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; ++ ++ if (in_balance) { ++ if (this_rq != src_rq) { ++ raw_spin_rq_unlock(this_rq); ++ raw_spin_rq_lock(src_rq); ++ } ++ } else { ++ raw_spin_rq_lock(src_rq); ++ } ++ ++ locked_rq = src_rq; ++ raw_spin_lock(&src_dsq->lock); ++ ++ /* ++ * Did someone else get to it? @p could have already left $src_dsq, got ++ * re-enqueud, or be in the process of being consumed by someone else. ++ */ ++ if (unlikely(p->scx.dsq != src_dsq || ++ u32_before(kit->cursor.priv, p->scx.dsq_seq) || ++ p->scx.holding_cpu >= 0) || ++ WARN_ON_ONCE(src_rq != task_rq(p))) { ++ raw_spin_unlock(&src_dsq->lock); ++ goto out; ++ } ++ ++ /* @p is still on $src_dsq and stable, determine the destination */ ++ dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); ++ ++ if (dst_dsq->id == SCX_DSQ_LOCAL) { ++ dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); ++ if (!task_can_run_on_remote_rq(p, dst_rq, true)) { ++ dst_dsq = &scx_dsq_global; ++ dst_rq = src_rq; ++ } ++ } else { ++ /* no need to migrate if destination is a non-local DSQ */ ++ dst_rq = src_rq; ++ } ++ ++ /* ++ * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different ++ * CPU, @p will be migrated. ++ */ ++ if (dst_dsq->id == SCX_DSQ_LOCAL) { ++ /* @p is going from a non-local DSQ to a local DSQ */ ++ if (src_rq == dst_rq) { ++ task_unlink_from_dsq(p, src_dsq); ++ move_local_task_to_local_dsq(p, enq_flags, ++ src_dsq, dst_rq); ++ raw_spin_unlock(&src_dsq->lock); ++ } else { ++ raw_spin_unlock(&src_dsq->lock); ++ move_remote_task_to_local_dsq(p, enq_flags, ++ src_rq, dst_rq); ++ locked_rq = dst_rq; ++ } ++ } else { ++ /* ++ * @p is going from a non-local DSQ to a non-local DSQ. As ++ * $src_dsq is already locked, do an abbreviated dequeue. 
++ */ ++ task_unlink_from_dsq(p, src_dsq); ++ p->scx.dsq = NULL; ++ raw_spin_unlock(&src_dsq->lock); ++ ++ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) ++ p->scx.dsq_vtime = kit->vtime; ++ dispatch_enqueue(dst_dsq, p, enq_flags); ++ } ++ ++ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) ++ p->scx.slice = kit->slice; ++ ++ dispatched = true; ++out: ++ if (in_balance) { ++ if (this_rq != locked_rq) { ++ raw_spin_rq_unlock(locked_rq); ++ raw_spin_rq_lock(this_rq); ++ } ++ } else { ++ raw_spin_rq_unlock_irqrestore(locked_rq, flags); ++ } ++ ++ kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | ++ __SCX_DSQ_ITER_HAS_VTIME); ++ return dispatched; ++} ++ +__bpf_kfunc_start_defs(); + +/** @@ -7281,12 +8104,112 @@ index 000000000000..da9cac6b6cc2 + } +} + ++/** ++ * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ ++ * @it__iter: DSQ iterator in progress ++ * @slice: duration the dispatched task can run for in nsecs ++ * ++ * Override the slice of the next task that will be dispatched from @it__iter ++ * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called, ++ * the previous slice duration is kept. ++ */ ++__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( ++ struct bpf_iter_scx_dsq *it__iter, u64 slice) ++{ ++ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; ++ ++ kit->slice = slice; ++ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; ++} ++ ++/** ++ * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ ++ * @it__iter: DSQ iterator in progress ++ * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ ++ * ++ * Override the vtime of the next task that will be dispatched from @it__iter ++ * using scx_bpf_dispatch_from_dsq_vtime(). If this function is not called, the ++ * previous slice vtime is kept. If scx_bpf_dispatch_from_dsq() is used to ++ * dispatch the next task, the override is ignored and cleared. ++ */ ++__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( ++ struct bpf_iter_scx_dsq *it__iter, u64 vtime) ++{ ++ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; ++ ++ kit->vtime = vtime; ++ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; ++} ++ ++/** ++ * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ ++ * @it__iter: DSQ iterator in progress ++ * @p: task to transfer ++ * @dsq_id: DSQ to move @p to ++ * @enq_flags: SCX_ENQ_* ++ * ++ * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ ++ * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can ++ * be the destination. ++ * ++ * For the transfer to be successful, @p must still be on the DSQ and have been ++ * queued before the DSQ iteration started. This function doesn't care whether ++ * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have ++ * been queued before the iteration started. ++ * ++ * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to ++ * update. ++ * ++ * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq ++ * lock (e.g. BPF timers or SYSCALL programs). ++ * ++ * Returns %true if @p has been consumed, %false if @p had already been consumed ++ * or dequeued. 
++ */ ++__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, ++ struct task_struct *p, u64 dsq_id, ++ u64 enq_flags) ++{ ++ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, ++ p, dsq_id, enq_flags); ++} ++ ++/** ++ * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ ++ * @it__iter: DSQ iterator in progress ++ * @p: task to transfer ++ * @dsq_id: DSQ to move @p to ++ * @enq_flags: SCX_ENQ_* ++ * ++ * Transfer @p which is on the DSQ currently iterated by @it__iter to the ++ * priority queue of the DSQ specified by @dsq_id. The destination must be a ++ * user DSQ as only user DSQs support priority queue. ++ * ++ * @p's slice and vtime are kept by default. Use ++ * scx_bpf_dispatch_from_dsq_set_slice() and ++ * scx_bpf_dispatch_from_dsq_set_vtime() to update. ++ * ++ * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See ++ * scx_bpf_dispatch_vtime() for more information on @vtime. ++ */ ++__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, ++ struct task_struct *p, u64 dsq_id, ++ u64 enq_flags) ++{ ++ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, ++ p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); ++} ++ +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) +BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) +BTF_ID_FLAGS(func, scx_bpf_consume) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { @@ -7364,6 +8287,37 @@ index 000000000000..da9cac6b6cc2 +__bpf_kfunc_start_defs(); + +/** ++ * scx_bpf_create_dsq - Create a custom DSQ ++ * @dsq_id: DSQ to create ++ * @node: NUMA node to allocate from ++ * ++ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable ++ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. ++ */ ++__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) ++{ ++ if (unlikely(node >= (int)nr_node_ids || ++ (node < 0 && node != NUMA_NO_NODE))) ++ return -EINVAL; ++ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); ++} ++ ++__bpf_kfunc_end_defs(); ++ ++BTF_KFUNCS_START(scx_kfunc_ids_unlocked) ++BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) ++BTF_KFUNCS_END(scx_kfunc_ids_unlocked) ++ ++static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { ++ .owner = THIS_MODULE, ++ .set = &scx_kfunc_ids_unlocked, ++}; ++ ++__bpf_kfunc_start_defs(); ++ ++/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags @@ -7381,17 +8335,17 @@ index 000000000000..da9cac6b6cc2 + if (!ops_cpu_valid(cpu, NULL)) + return; + ++ local_irq_save(irq_flags); ++ ++ this_rq = this_rq(); ++ + /* + * While bypassing for PM ops, IRQ handling may not be online which can + * lead to irq_work_queue() malfunction such as infinite busy wait for + * IRQ status update. Suppress kicking. 
+ */ -+ if (scx_ops_bypassing()) -+ return; -+ -+ local_irq_save(irq_flags); -+ -+ this_rq = this_rq(); ++ if (scx_rq_bypassing(this_rq)) ++ goto out; + + /* + * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting @@ -7497,7 +8451,7 @@ index 000000000000..da9cac6b6cc2 + BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != + __alignof__(struct bpf_iter_scx_dsq)); + -+ if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS) ++ if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) + return -EINVAL; + + kit->dsq = find_non_local_dsq(dsq_id); @@ -7505,9 +8459,8 @@ index 000000000000..da9cac6b6cc2 + return -ENOENT; + + INIT_LIST_HEAD(&kit->cursor.node); -+ kit->cursor.is_bpf_iter_cursor = true; -+ kit->dsq_seq = READ_ONCE(kit->dsq->seq); -+ kit->flags = flags; ++ kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags; ++ kit->cursor.priv = READ_ONCE(kit->dsq->seq); + + return 0; +} @@ -7521,7 +8474,7 @@ index 000000000000..da9cac6b6cc2 +__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; -+ bool rev = kit->flags & SCX_DSQ_ITER_REV; ++ bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; + struct task_struct *p; + unsigned long flags; + @@ -7542,7 +8495,7 @@ index 000000000000..da9cac6b6cc2 + */ + do { + p = nldsq_next_task(kit->dsq, p, rev); -+ } while (p && unlikely(u32_before(kit->dsq_seq, p->scx.dsq_seq))); ++ } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq))); + + if (p) { + if (rev) @@ -8008,6 +8961,41 @@ index 000000000000..da9cac6b6cc2 + return cpu_rq(cpu); +} + ++/** ++ * scx_bpf_task_cgroup - Return the sched cgroup of a task ++ * @p: task of interest ++ * ++ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with ++ * from the scheduler's POV. SCX operations should use this function to ++ * determine @p's current cgroup as, unlike following @p->cgroups, ++ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all ++ * rq-locked operations. Can be called on the parameter tasks of rq-locked ++ * operations. The restriction guarantees that @p's rq is locked by the caller. ++ */ ++#ifdef CONFIG_CGROUP_SCHED ++__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) ++{ ++ struct task_group *tg = p->sched_task_group; ++ struct cgroup *cgrp = &cgrp_dfl_root.cgrp; ++ ++ if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) ++ goto out; ++ ++ /* ++ * A task_group may either be a cgroup or an autogroup. In the latter ++ * case, @tg->css.cgroup is %NULL. A task_group can't become the other ++ * kind once created. ++ */ ++ if (tg && tg->css.cgroup) ++ cgrp = tg->css.cgroup; ++ else ++ cgrp = &cgrp_dfl_root.cgrp; ++out: ++ cgroup_get(cgrp); ++ return cgrp; ++} ++#endif ++ +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -8036,6 +9024,9 @@ index 000000000000..da9cac6b6cc2 +BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_cpu_rq) ++#ifdef CONFIG_CGROUP_SCHED ++BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) ++#endif +BTF_KFUNCS_END(scx_kfunc_ids_any) + +static const struct btf_kfunc_id_set scx_kfunc_set_any = { @@ -8059,8 +9050,6 @@ index 000000000000..da9cac6b6cc2 + * check using scx_kf_allowed(). 
+ */ + if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_sleepable)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_select_cpu)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_enqueue_dispatch)) || @@ -8069,6 +9058,10 @@ index 000000000000..da9cac6b6cc2 + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_cpu_release)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, ++ &scx_kfunc_set_unlocked)) || ++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, ++ &scx_kfunc_set_unlocked)) || ++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &scx_kfunc_set_any)) || @@ -8107,10 +9100,10 @@ index 000000000000..da9cac6b6cc2 +__initcall(scx_init); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h new file mode 100644 -index 000000000000..32d3a51f591a +index 000000000000..246019519231 --- /dev/null +++ b/kernel/sched/ext.h -@@ -0,0 +1,69 @@ +@@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst @@ -8180,11 +9173,33 @@ index 000000000000..32d3a51f591a +#else +static inline void scx_update_idle(struct rq *rq, bool idle) {} +#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++#ifdef CONFIG_EXT_GROUP_SCHED ++int scx_tg_online(struct task_group *tg); ++void scx_tg_offline(struct task_group *tg); ++int scx_cgroup_can_attach(struct cgroup_taskset *tset); ++void scx_move_task(struct task_struct *p); ++void scx_cgroup_finish_attach(void); ++void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); ++void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); ++void scx_group_set_idle(struct task_group *tg, bool idle); ++#else /* CONFIG_EXT_GROUP_SCHED */ ++static inline int scx_tg_online(struct task_group *tg) { return 0; } ++static inline void scx_tg_offline(struct task_group *tg) {} ++static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } ++static inline void scx_move_task(struct task_struct *p) {} ++static inline void scx_cgroup_finish_attach(void) {} ++static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} ++static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} ++static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} ++#endif /* CONFIG_EXT_GROUP_SCHED */ ++#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 9057584ec06d..5904405ffc59 100644 +index 5dc714fd8a22..2928026d76a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c -@@ -3835,7 +3835,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +@@ -3848,7 +3848,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } } @@ -8194,7 +9209,7 @@ index 9057584ec06d..5904405ffc59 100644 { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); -@@ -8390,7 +8391,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int +@@ -8403,7 +8404,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ @@ -8203,7 +9218,7 @@ index 9057584ec06d..5904405ffc59 100644 return; find_matching_se(&se, &pse); -@@ -9347,28 +9348,18 @@ static inline void 
update_blocked_load_status(struct rq *rq, bool has_blocked) { +@@ -9360,28 +9361,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) { static bool __update_blocked_others(struct rq *rq, bool *done) { @@ -8235,7 +9250,7 @@ index 9057584ec06d..5904405ffc59 100644 } #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -13214,6 +13205,7 @@ DEFINE_SCHED_CLASS(fair) = { +@@ -13227,6 +13218,7 @@ DEFINE_SCHED_CLASS(fair) = { .task_tick = task_tick_fair, .task_fork = task_fork_fair, @@ -8262,7 +9277,7 @@ index 6e78d071beb5..c7a218123b7a 100644 } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 4c36cc680361..42b4d1428c2c 100644 +index 432b43aa091c..207a04f02b4c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -192,9 +192,18 @@ static inline int idle_policy(int policy) @@ -8310,7 +9325,49 @@ index 4c36cc680361..42b4d1428c2c 100644 /* * !! For sched_setattr_nocheck() (kernel) only !! * -@@ -588,6 +615,11 @@ do { \ +@@ -424,6 +451,11 @@ struct task_group { + struct rt_bandwidth rt_bandwidth; + #endif + ++#ifdef CONFIG_EXT_GROUP_SCHED ++ u32 scx_flags; /* SCX_TG_* */ ++ u32 scx_weight; ++#endif ++ + struct rcu_head rcu; + struct list_head list; + +@@ -448,7 +480,7 @@ struct task_group { + + }; + +-#ifdef CONFIG_FAIR_GROUP_SCHED ++#ifdef CONFIG_GROUP_SCHED_WEIGHT + #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + + /* +@@ -479,6 +511,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) + return walk_tg_tree_from(&root_task_group, down, up, data); + } + ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? container_of(css, struct task_group, css) : NULL; ++} ++ + extern int tg_nop(struct task_group *tg, void *data); + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -535,6 +572,8 @@ extern void set_task_rq_fair(struct sched_entity *se, + static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } + #endif /* CONFIG_SMP */ ++#else /* !CONFIG_FAIR_GROUP_SCHED */ ++static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } + #endif /* CONFIG_FAIR_GROUP_SCHED */ + + #else /* CONFIG_CGROUP_SCHED */ +@@ -588,6 +627,11 @@ do { \ # define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) # define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) @@ -8322,7 +9379,7 @@ index 4c36cc680361..42b4d1428c2c 100644 /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; -@@ -696,6 +728,42 @@ struct cfs_rq { +@@ -696,6 +740,43 @@ struct cfs_rq { #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -8336,6 +9393,7 @@ index 4c36cc680361..42b4d1428c2c 100644 + */ + SCX_RQ_ONLINE = 1 << 0, + SCX_RQ_CAN_STOP_TICK = 1 << 1, ++ SCX_RQ_BYPASSING = 1 << 3, + + SCX_RQ_IN_WAKEUP = 1 << 16, + SCX_RQ_IN_BALANCE = 1 << 17, @@ -8365,7 +9423,7 @@ index 4c36cc680361..42b4d1428c2c 100644 static inline int rt_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; -@@ -996,11 +1064,6 @@ struct uclamp_rq { +@@ -996,11 +1077,6 @@ struct uclamp_rq { DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ @@ -8377,7 +9435,7 @@ index 4c36cc680361..42b4d1428c2c 100644 /* * This is the main, per-CPU runqueue data structure. 
* -@@ -1043,6 +1106,9 @@ struct rq { +@@ -1043,6 +1119,9 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; @@ -8387,16 +9445,24 @@ index 4c36cc680361..42b4d1428c2c 100644 #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ -@@ -2296,6 +2362,8 @@ struct sched_class { +@@ -2291,13 +2370,15 @@ struct sched_class { + + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + ++ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + struct task_struct *(*pick_next_task)(struct rq *rq); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + void (*switch_class)(struct rq *rq, struct task_struct *next); + #ifdef CONFIG_SMP - int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); +- int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); -@@ -2323,8 +2391,11 @@ struct sched_class { + + struct task_struct * (*pick_task)(struct rq *rq); +@@ -2323,8 +2404,11 @@ struct sched_class { * cannot assume the switched_from/switched_to pair is serialized by * rq->lock. They are however serialized by p->pi_lock. */ @@ -8408,7 +9474,7 @@ index 4c36cc680361..42b4d1428c2c 100644 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio); -@@ -2373,19 +2444,54 @@ const struct sched_class name##_sched_class \ +@@ -2373,19 +2457,54 @@ const struct sched_class name##_sched_class \ extern struct sched_class __sched_class_highest[]; extern struct sched_class __sched_class_lowest[]; @@ -8469,7 +9535,39 @@ index 4c36cc680361..42b4d1428c2c 100644 static inline bool sched_stop_runnable(struct rq *rq) { -@@ -2510,8 +2616,6 @@ extern void init_sched_dl_class(void); +@@ -2424,6 +2543,19 @@ extern void sched_balance_trigger(struct rq *rq); + extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx); + extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); + ++static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) ++{ ++ /* When not in the task's cpumask, no point in looking further. */ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ /* Can @cpu run a user thread? 
*/ ++ if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p)) ++ return false; ++ ++ return true; ++} ++ + static inline cpumask_t *alloc_user_cpus_ptr(int node) + { + /* +@@ -2457,6 +2589,11 @@ extern int push_cpu_stop(void *arg); + + #else /* !CONFIG_SMP: */ + ++static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) ++{ ++ return true; ++} ++ + static inline int __set_cpus_allowed_ptr(struct task_struct *p, + struct affinity_context *ctx) + { +@@ -2510,8 +2647,6 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); @@ -8478,7 +9576,7 @@ index 4c36cc680361..42b4d1428c2c 100644 extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); -@@ -3056,6 +3160,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } +@@ -3056,6 +3191,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } #ifdef CONFIG_SMP @@ -8487,7 +9585,7 @@ index 4c36cc680361..42b4d1428c2c 100644 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); -@@ -3099,6 +3205,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq) +@@ -3099,6 +3236,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } @@ -8496,7 +9594,7 @@ index 4c36cc680361..42b4d1428c2c 100644 #endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK -@@ -3609,6 +3717,8 @@ extern void set_load_weight(struct task_struct *p, bool update_load); +@@ -3609,6 +3748,8 @@ extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags); @@ -8505,7 +9603,7 @@ index 4c36cc680361..42b4d1428c2c 100644 extern void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio); -@@ -3629,4 +3739,24 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea +@@ -3629,4 +3770,24 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea #endif @@ -8658,7 +9756,7 @@ index 000000000000..d6264fe1c8cd +build/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile new file mode 100644 -index 000000000000..bf7e108f5ae1 +index 000000000000..ca3815e572d8 --- /dev/null +++ b/tools/sched_ext/Makefile @@ -0,0 +1,246 @@ @@ -8840,7 +9938,7 @@ index 000000000000..bf7e108f5ae1 + +SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) + -+c-sched-targets = scx_simple scx_qmap scx_central ++c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg + +$(addprefix $(BINDIR)/,$(c-sched-targets)): \ + $(BINDIR)/%: \ @@ -8910,10 +10008,10 @@ index 000000000000..bf7e108f5ae1 +.SECONDARY: diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md new file mode 100644 -index 000000000000..8efe70cc4363 +index 000000000000..16a42e4060f6 --- /dev/null +++ b/tools/sched_ext/README.md -@@ -0,0 +1,258 @@ +@@ -0,0 +1,270 @@ +SCHED_EXT EXAMPLE SCHEDULERS +============================ + @@ -9108,6 +10206,18 @@ index 000000000000..8efe70cc4363 +infinite slices and no timer ticks allows the VM to avoid unnecessary expensive +vmexits. + ++## scx_flatcg ++ ++A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical ++weight-based cgroup CPU control by flattening the cgroup hierarchy into a single ++layer, by compounding the active weight share at each level. 
The effect of this ++is a much more performant CPU controller, which does not need to descend down ++cgroup trees in order to properly compute a cgroup's share. ++ ++Similar to scx_simple, in limited scenarios, this scheduler can perform ++reasonably well on single socket-socket systems with a unified L3 cache and show ++significantly lowered hierarchical scheduling overhead. ++ + +# Troubleshooting + @@ -9191,10 +10301,10 @@ index 000000000000..ad7d139ce907 + */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h new file mode 100644 -index 000000000000..20280df62857 +index 000000000000..f538c75db183 --- /dev/null +++ b/tools/sched_ext/include/scx/common.bpf.h -@@ -0,0 +1,401 @@ +@@ -0,0 +1,412 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. @@ -9232,6 +10342,10 @@ index 000000000000..20280df62857 +u32 scx_bpf_dispatch_nr_slots(void) __ksym; +void scx_bpf_dispatch_cancel(void) __ksym; +bool scx_bpf_consume(u64 dsq_id) __ksym; ++void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; ++void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; ++bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; ++bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +u32 scx_bpf_reenqueue_local(void) __ksym; +void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; @@ -9258,6 +10372,13 @@ index 000000000000..20280df62857 +bool scx_bpf_task_running(const struct task_struct *p) __ksym; +s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; ++struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; ++ ++/* ++ * Use the following as @it__iter when calling ++ * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops. ++ */ ++#define BPF_FOR_EACH_ITER (&___it) + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} @@ -10022,7 +11143,7 @@ index 000000000000..891693ee604e +#endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c new file mode 100644 -index 000000000000..1d8fd570eaa7 +index 000000000000..8dd8eb73b6b8 --- /dev/null +++ b/tools/sched_ext/scx_central.bpf.c @@ -0,0 +1,361 @@ @@ -10226,7 +11347,7 @@ index 000000000000..1d8fd570eaa7 + + /* central's gimme is never set */ + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (gimme && !*gimme) ++ if (!gimme || !*gimme) + continue; + + if (dispatch_to_cpu(cpu)) @@ -10528,12 +11649,1263 @@ index 000000000000..21deea320bd7 + goto restart; + return 0; +} +diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c +new file mode 100644 +index 000000000000..3ab2b60781a0 +--- /dev/null ++++ b/tools/sched_ext/scx_flatcg.bpf.c +@@ -0,0 +1,949 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A demo sched_ext flattened cgroup hierarchy scheduler. It implements ++ * hierarchical weight-based cgroup CPU control by flattening the cgroup ++ * hierarchy into a single layer by compounding the active weight share at each ++ * level. 
Consider the following hierarchy with weights in parentheses: ++ * ++ * R + A (100) + B (100) ++ * | \ C (100) ++ * \ D (200) ++ * ++ * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. ++ * Let's say all three have runnable tasks. The total share that each of these ++ * three cgroups is entitled to can be calculated by compounding its share at ++ * each level. ++ * ++ * For example, B is competing against C and in that competition its share is ++ * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's ++ * share in that competition is 100/(200+100) == 1/3. B's eventual share in the ++ * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's ++ * eventual shaer is the same at 1/6. D is only competing at the top level and ++ * its share is 200/(100+200) == 2/3. ++ * ++ * So, instead of hierarchically scheduling level-by-level, we can consider it ++ * as B, C and D competing each other with respective share of 1/6, 1/6 and 2/3 ++ * and keep updating the eventual shares as the cgroups' runnable states change. ++ * ++ * This flattening of hierarchy can bring a substantial performance gain when ++ * the cgroup hierarchy is nested multiple levels. in a simple benchmark using ++ * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it ++ * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two ++ * apache instances competing with 2:1 weight ratio nested four level deep. ++ * ++ * However, the gain comes at the cost of not being able to properly handle ++ * thundering herd of cgroups. For example, if many cgroups which are nested ++ * behind a low priority parent cgroup wake up around the same time, they may be ++ * able to consume more CPU cycles than they are entitled to. In many use cases, ++ * this isn't a real concern especially given the performance gain. Also, there ++ * are ways to mitigate the problem further by e.g. introducing an extra ++ * scheduling layer on cgroup delegation boundaries. ++ * ++ * The scheduler first picks the cgroup to run and then schedule the tasks ++ * within by using nested weighted vtime scheduling by default. The ++ * cgroup-internal scheduling can be switched to FIFO with the -f option. ++ */ ++#include ++#include "scx_flatcg.h" ++ ++/* ++ * Maximum amount of retries to find a valid cgroup. 
++ */ ++#define CGROUP_MAX_RETRIES 1024 ++ ++char _license[] SEC("license") = "GPL"; ++ ++const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ ++const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; ++const volatile bool fifo_sched; ++ ++u64 cvtime_now; ++UEI_DEFINE(uei); ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); ++ __type(key, u32); ++ __type(value, u64); ++ __uint(max_entries, FCG_NR_STATS); ++} stats SEC(".maps"); ++ ++static void stat_inc(enum fcg_stat_idx idx) ++{ ++ u32 idx_v = idx; ++ ++ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); ++ if (cnt_p) ++ (*cnt_p)++; ++} ++ ++struct fcg_cpu_ctx { ++ u64 cur_cgid; ++ u64 cur_at; ++}; ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); ++ __type(key, u32); ++ __type(value, struct fcg_cpu_ctx); ++ __uint(max_entries, 1); ++} cpu_ctx SEC(".maps"); ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); ++ __uint(map_flags, BPF_F_NO_PREALLOC); ++ __type(key, int); ++ __type(value, struct fcg_cgrp_ctx); ++} cgrp_ctx SEC(".maps"); ++ ++struct cgv_node { ++ struct bpf_rb_node rb_node; ++ __u64 cvtime; ++ __u64 cgid; ++}; ++ ++private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; ++private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); ++ ++struct cgv_node_stash { ++ struct cgv_node __kptr *node; ++}; ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_HASH); ++ __uint(max_entries, 16384); ++ __type(key, __u64); ++ __type(value, struct cgv_node_stash); ++} cgv_node_stash SEC(".maps"); ++ ++struct fcg_task_ctx { ++ u64 bypassed_at; ++}; ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); ++ __uint(map_flags, BPF_F_NO_PREALLOC); ++ __type(key, int); ++ __type(value, struct fcg_task_ctx); ++} task_ctx SEC(".maps"); ++ ++/* gets inc'd on weight tree changes to expire the cached hweights */ ++u64 hweight_gen = 1; ++ ++static u64 div_round_up(u64 dividend, u64 divisor) ++{ ++ return (dividend + divisor - 1) / divisor; ++} ++ ++static bool vtime_before(u64 a, u64 b) ++{ ++ return (s64)(a - b) < 0; ++} ++ ++static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) ++{ ++ struct cgv_node *cgc_a, *cgc_b; ++ ++ cgc_a = container_of(a, struct cgv_node, rb_node); ++ cgc_b = container_of(b, struct cgv_node, rb_node); ++ ++ return cgc_a->cvtime < cgc_b->cvtime; ++} ++ ++static struct fcg_cpu_ctx *find_cpu_ctx(void) ++{ ++ struct fcg_cpu_ctx *cpuc; ++ u32 idx = 0; ++ ++ cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); ++ if (!cpuc) { ++ scx_bpf_error("cpu_ctx lookup failed"); ++ return NULL; ++ } ++ return cpuc; ++} ++ ++static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) ++{ ++ struct fcg_cgrp_ctx *cgc; ++ ++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); ++ if (!cgc) { ++ scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id); ++ return NULL; ++ } ++ return cgc; ++} ++ ++static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) ++{ ++ struct fcg_cgrp_ctx *cgc; ++ ++ cgrp = bpf_cgroup_ancestor(cgrp, level); ++ if (!cgrp) { ++ scx_bpf_error("ancestor cgroup lookup failed"); ++ return NULL; ++ } ++ ++ cgc = find_cgrp_ctx(cgrp); ++ if (!cgc) ++ scx_bpf_error("ancestor cgrp_ctx lookup failed"); ++ bpf_cgroup_release(cgrp); ++ return cgc; ++} ++ ++static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) ++{ ++ int level; ++ ++ if (!cgc->nr_active) { ++ stat_inc(FCG_STAT_HWT_SKIP); ++ return; ++ } ++ ++ if (cgc->hweight_gen == hweight_gen) { ++ stat_inc(FCG_STAT_HWT_CACHE); ++ return; ++ } ++ ++ 
stat_inc(FCG_STAT_HWT_UPDATES); ++ bpf_for(level, 0, cgrp->level + 1) { ++ struct fcg_cgrp_ctx *cgc; ++ bool is_active; ++ ++ cgc = find_ancestor_cgrp_ctx(cgrp, level); ++ if (!cgc) ++ break; ++ ++ if (!level) { ++ cgc->hweight = FCG_HWEIGHT_ONE; ++ cgc->hweight_gen = hweight_gen; ++ } else { ++ struct fcg_cgrp_ctx *pcgc; ++ ++ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); ++ if (!pcgc) ++ break; ++ ++ /* ++ * We can be oppotunistic here and not grab the ++ * cgv_tree_lock and deal with the occasional races. ++ * However, hweight updates are already cached and ++ * relatively low-frequency. Let's just do the ++ * straightforward thing. ++ */ ++ bpf_spin_lock(&cgv_tree_lock); ++ is_active = cgc->nr_active; ++ if (is_active) { ++ cgc->hweight_gen = pcgc->hweight_gen; ++ cgc->hweight = ++ div_round_up(pcgc->hweight * cgc->weight, ++ pcgc->child_weight_sum); ++ } ++ bpf_spin_unlock(&cgv_tree_lock); ++ ++ if (!is_active) { ++ stat_inc(FCG_STAT_HWT_RACE); ++ break; ++ } ++ } ++ } ++} ++ ++static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) ++{ ++ u64 delta, cvtime, max_budget; ++ ++ /* ++ * A node which is on the rbtree can't be pointed to from elsewhere yet ++ * and thus can't be updated and repositioned. Instead, we collect the ++ * vtime deltas separately and apply it asynchronously here. ++ */ ++ delta = cgc->cvtime_delta; ++ __sync_fetch_and_sub(&cgc->cvtime_delta, delta); ++ cvtime = cgv_node->cvtime + delta; ++ ++ /* ++ * Allow a cgroup to carry the maximum budget proportional to its ++ * hweight such that a full-hweight cgroup can immediately take up half ++ * of the CPUs at the most while staying at the front of the rbtree. ++ */ ++ max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / ++ (2 * FCG_HWEIGHT_ONE); ++ if (vtime_before(cvtime, cvtime_now - max_budget)) ++ cvtime = cvtime_now - max_budget; ++ ++ cgv_node->cvtime = cvtime; ++} ++ ++static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) ++{ ++ struct cgv_node_stash *stash; ++ struct cgv_node *cgv_node; ++ u64 cgid = cgrp->kn->id; ++ ++ /* paired with cmpxchg in try_pick_next_cgroup() */ ++ if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { ++ stat_inc(FCG_STAT_ENQ_SKIP); ++ return; ++ } ++ ++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); ++ if (!stash) { ++ scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); ++ return; ++ } ++ ++ /* NULL if the node is already on the rbtree */ ++ cgv_node = bpf_kptr_xchg(&stash->node, NULL); ++ if (!cgv_node) { ++ stat_inc(FCG_STAT_ENQ_RACE); ++ return; ++ } ++ ++ bpf_spin_lock(&cgv_tree_lock); ++ cgrp_cap_budget(cgv_node, cgc); ++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); ++ bpf_spin_unlock(&cgv_tree_lock); ++} ++ ++static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc) ++{ ++ /* ++ * Tell fcg_stopping() that this bypassed the regular scheduling path ++ * and should be force charged to the cgroup. 0 is used to indicate that ++ * the task isn't bypassing, so if the current runtime is 0, go back by ++ * one nanosecond. 
++ */ ++ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; ++} ++ ++s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) ++{ ++ struct fcg_task_ctx *taskc; ++ bool is_idle = false; ++ s32 cpu; ++ ++ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); ++ ++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); ++ if (!taskc) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return cpu; ++ } ++ ++ /* ++ * If select_cpu_dfl() is recommending local enqueue, the target CPU is ++ * idle. Follow it and charge the cgroup later in fcg_stopping() after ++ * the fact. ++ */ ++ if (is_idle) { ++ set_bypassed_at(p, taskc); ++ stat_inc(FCG_STAT_LOCAL); ++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); ++ } ++ ++ return cpu; ++} ++ ++void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) ++{ ++ struct fcg_task_ctx *taskc; ++ struct cgroup *cgrp; ++ struct fcg_cgrp_ctx *cgc; ++ ++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); ++ if (!taskc) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return; ++ } ++ ++ /* ++ * Use the direct dispatching and force charging to deal with tasks with ++ * custom affinities so that we don't have to worry about per-cgroup ++ * dq's containing tasks that can't be executed from some CPUs. ++ */ ++ if (p->nr_cpus_allowed != nr_cpus) { ++ set_bypassed_at(p, taskc); ++ ++ /* ++ * The global dq is deprioritized as we don't want to let tasks ++ * to boost themselves by constraining its cpumask. The ++ * deprioritization is rather severe, so let's not apply that to ++ * per-cpu kernel threads. This is ham-fisted. We probably wanna ++ * implement per-cgroup fallback dq's instead so that we have ++ * more control over when tasks with custom cpumask get issued. ++ */ ++ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { ++ stat_inc(FCG_STAT_LOCAL); ++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); ++ } else { ++ stat_inc(FCG_STAT_GLOBAL); ++ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ } ++ return; ++ } ++ ++ cgrp = scx_bpf_task_cgroup(p); ++ cgc = find_cgrp_ctx(cgrp); ++ if (!cgc) ++ goto out_release; ++ ++ if (fifo_sched) { ++ scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); ++ } else { ++ u64 tvtime = p->scx.dsq_vtime; ++ ++ /* ++ * Limit the amount of budget that an idling task can accumulate ++ * to one slice. ++ */ ++ if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) ++ tvtime = cgc->tvtime_now - SCX_SLICE_DFL; ++ ++ scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, ++ tvtime, enq_flags); ++ } ++ ++ cgrp_enqueued(cgrp, cgc); ++out_release: ++ bpf_cgroup_release(cgrp); ++} ++ ++/* ++ * Walk the cgroup tree to update the active weight sums as tasks wake up and ++ * sleep. The weight sums are used as the base when calculating the proportion a ++ * given cgroup or task is entitled to at each level. ++ */ ++static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) ++{ ++ struct fcg_cgrp_ctx *cgc; ++ bool updated = false; ++ int idx; ++ ++ cgc = find_cgrp_ctx(cgrp); ++ if (!cgc) ++ return; ++ ++ /* ++ * In most cases, a hot cgroup would have multiple threads going to ++ * sleep and waking up while the whole cgroup stays active. In leaf ++ * cgroups, ->nr_runnable which is updated with __sync operations gates ++ * ->nr_active updates, so that we don't have to grab the cgv_tree_lock ++ * repeatedly for a busy cgroup which is staying active. 
++ */ ++ if (runnable) { ++ if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) ++ return; ++ stat_inc(FCG_STAT_ACT); ++ } else { ++ if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) ++ return; ++ stat_inc(FCG_STAT_DEACT); ++ } ++ ++ /* ++ * If @cgrp is becoming runnable, its hweight should be refreshed after ++ * it's added to the weight tree so that enqueue has the up-to-date ++ * value. If @cgrp is becoming quiescent, the hweight should be ++ * refreshed before it's removed from the weight tree so that the usage ++ * charging which happens afterwards has access to the latest value. ++ */ ++ if (!runnable) ++ cgrp_refresh_hweight(cgrp, cgc); ++ ++ /* propagate upwards */ ++ bpf_for(idx, 0, cgrp->level) { ++ int level = cgrp->level - idx; ++ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; ++ bool propagate = false; ++ ++ cgc = find_ancestor_cgrp_ctx(cgrp, level); ++ if (!cgc) ++ break; ++ if (level) { ++ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); ++ if (!pcgc) ++ break; ++ } ++ ++ /* ++ * We need the propagation protected by a lock to synchronize ++ * against weight changes. There's no reason to drop the lock at ++ * each level but bpf_spin_lock() doesn't want any function ++ * calls while locked. ++ */ ++ bpf_spin_lock(&cgv_tree_lock); ++ ++ if (runnable) { ++ if (!cgc->nr_active++) { ++ updated = true; ++ if (pcgc) { ++ propagate = true; ++ pcgc->child_weight_sum += cgc->weight; ++ } ++ } ++ } else { ++ if (!--cgc->nr_active) { ++ updated = true; ++ if (pcgc) { ++ propagate = true; ++ pcgc->child_weight_sum -= cgc->weight; ++ } ++ } ++ } ++ ++ bpf_spin_unlock(&cgv_tree_lock); ++ ++ if (!propagate) ++ break; ++ } ++ ++ if (updated) ++ __sync_fetch_and_add(&hweight_gen, 1); ++ ++ if (runnable) ++ cgrp_refresh_hweight(cgrp, cgc); ++} ++ ++void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) ++{ ++ struct cgroup *cgrp; ++ ++ cgrp = scx_bpf_task_cgroup(p); ++ update_active_weight_sums(cgrp, true); ++ bpf_cgroup_release(cgrp); ++} ++ ++void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) ++{ ++ struct cgroup *cgrp; ++ struct fcg_cgrp_ctx *cgc; ++ ++ if (fifo_sched) ++ return; ++ ++ cgrp = scx_bpf_task_cgroup(p); ++ cgc = find_cgrp_ctx(cgrp); ++ if (cgc) { ++ /* ++ * @cgc->tvtime_now always progresses forward as tasks start ++ * executing. The test and update can be performed concurrently ++ * from multiple CPUs and thus racy. Any error should be ++ * contained and temporary. Let's just live with it. ++ */ ++ if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) ++ cgc->tvtime_now = p->scx.dsq_vtime; ++ } ++ bpf_cgroup_release(cgrp); ++} ++ ++void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) ++{ ++ struct fcg_task_ctx *taskc; ++ struct cgroup *cgrp; ++ struct fcg_cgrp_ctx *cgc; ++ ++ /* ++ * Scale the execution time by the inverse of the weight and charge. ++ * ++ * Note that the default yield implementation yields by setting ++ * @p->scx.slice to zero and the following would treat the yielding task ++ * as if it has consumed all its slice. If this penalizes yielding tasks ++ * too much, determine the execution time by taking explicit timestamps ++ * instead of depending on @p->scx.slice. 
++ */ ++ if (!fifo_sched) ++ p->scx.dsq_vtime += ++ (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; ++ ++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); ++ if (!taskc) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return; ++ } ++ ++ if (!taskc->bypassed_at) ++ return; ++ ++ cgrp = scx_bpf_task_cgroup(p); ++ cgc = find_cgrp_ctx(cgrp); ++ if (cgc) { ++ __sync_fetch_and_add(&cgc->cvtime_delta, ++ p->se.sum_exec_runtime - taskc->bypassed_at); ++ taskc->bypassed_at = 0; ++ } ++ bpf_cgroup_release(cgrp); ++} ++ ++void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) ++{ ++ struct cgroup *cgrp; ++ ++ cgrp = scx_bpf_task_cgroup(p); ++ update_active_weight_sums(cgrp, false); ++ bpf_cgroup_release(cgrp); ++} ++ ++void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) ++{ ++ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; ++ ++ cgc = find_cgrp_ctx(cgrp); ++ if (!cgc) ++ return; ++ ++ if (cgrp->level) { ++ pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); ++ if (!pcgc) ++ return; ++ } ++ ++ bpf_spin_lock(&cgv_tree_lock); ++ if (pcgc && cgc->nr_active) ++ pcgc->child_weight_sum += (s64)weight - cgc->weight; ++ cgc->weight = weight; ++ bpf_spin_unlock(&cgv_tree_lock); ++} ++ ++static bool try_pick_next_cgroup(u64 *cgidp) ++{ ++ struct bpf_rb_node *rb_node; ++ struct cgv_node_stash *stash; ++ struct cgv_node *cgv_node; ++ struct fcg_cgrp_ctx *cgc; ++ struct cgroup *cgrp; ++ u64 cgid; ++ ++ /* pop the front cgroup and wind cvtime_now accordingly */ ++ bpf_spin_lock(&cgv_tree_lock); ++ ++ rb_node = bpf_rbtree_first(&cgv_tree); ++ if (!rb_node) { ++ bpf_spin_unlock(&cgv_tree_lock); ++ stat_inc(FCG_STAT_PNC_NO_CGRP); ++ *cgidp = 0; ++ return true; ++ } ++ ++ rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); ++ bpf_spin_unlock(&cgv_tree_lock); ++ ++ if (!rb_node) { ++ /* ++ * This should never happen. bpf_rbtree_first() was called ++ * above while the tree lock was held, so the node should ++ * always be present. ++ */ ++ scx_bpf_error("node could not be removed"); ++ return true; ++ } ++ ++ cgv_node = container_of(rb_node, struct cgv_node, rb_node); ++ cgid = cgv_node->cgid; ++ ++ if (vtime_before(cvtime_now, cgv_node->cvtime)) ++ cvtime_now = cgv_node->cvtime; ++ ++ /* ++ * If lookup fails, the cgroup's gone. Free and move on. See ++ * fcg_cgroup_exit(). ++ */ ++ cgrp = bpf_cgroup_from_id(cgid); ++ if (!cgrp) { ++ stat_inc(FCG_STAT_PNC_GONE); ++ goto out_free; ++ } ++ ++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); ++ if (!cgc) { ++ bpf_cgroup_release(cgrp); ++ stat_inc(FCG_STAT_PNC_GONE); ++ goto out_free; ++ } ++ ++ if (!scx_bpf_consume(cgid)) { ++ bpf_cgroup_release(cgrp); ++ stat_inc(FCG_STAT_PNC_EMPTY); ++ goto out_stash; ++ } ++ ++ /* ++ * Successfully consumed from the cgroup. This will be our current ++ * cgroup for the new slice. Refresh its hweight. ++ */ ++ cgrp_refresh_hweight(cgrp, cgc); ++ ++ bpf_cgroup_release(cgrp); ++ ++ /* ++ * As the cgroup may have more tasks, add it back to the rbtree. Note ++ * that here we charge the full slice upfront and then exact later ++ * according to the actual consumption. This prevents lowpri thundering ++ * herd from saturating the machine. 
++ */ ++ bpf_spin_lock(&cgv_tree_lock); ++ cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1); ++ cgrp_cap_budget(cgv_node, cgc); ++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); ++ bpf_spin_unlock(&cgv_tree_lock); ++ ++ *cgidp = cgid; ++ stat_inc(FCG_STAT_PNC_NEXT); ++ return true; ++ ++out_stash: ++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); ++ if (!stash) { ++ stat_inc(FCG_STAT_PNC_GONE); ++ goto out_free; ++ } ++ ++ /* ++ * Paired with cmpxchg in cgrp_enqueued(). If they see the following ++ * transition, they'll enqueue the cgroup. If they are earlier, we'll ++ * see their task in the dq below and requeue the cgroup. ++ */ ++ __sync_val_compare_and_swap(&cgc->queued, 1, 0); ++ ++ if (scx_bpf_dsq_nr_queued(cgid)) { ++ bpf_spin_lock(&cgv_tree_lock); ++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); ++ bpf_spin_unlock(&cgv_tree_lock); ++ stat_inc(FCG_STAT_PNC_RACE); ++ } else { ++ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); ++ if (cgv_node) { ++ scx_bpf_error("unexpected !NULL cgv_node stash"); ++ goto out_free; ++ } ++ } ++ ++ return false; ++ ++out_free: ++ bpf_obj_drop(cgv_node); ++ return false; ++} ++ ++void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) ++{ ++ struct fcg_cpu_ctx *cpuc; ++ struct fcg_cgrp_ctx *cgc; ++ struct cgroup *cgrp; ++ u64 now = bpf_ktime_get_ns(); ++ bool picked_next = false; ++ ++ cpuc = find_cpu_ctx(); ++ if (!cpuc) ++ return; ++ ++ if (!cpuc->cur_cgid) ++ goto pick_next_cgroup; ++ ++ if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { ++ if (scx_bpf_consume(cpuc->cur_cgid)) { ++ stat_inc(FCG_STAT_CNS_KEEP); ++ return; ++ } ++ stat_inc(FCG_STAT_CNS_EMPTY); ++ } else { ++ stat_inc(FCG_STAT_CNS_EXPIRE); ++ } ++ ++ /* ++ * The current cgroup is expiring. It was already charged a full slice. ++ * Calculate the actual usage and accumulate the delta. ++ */ ++ cgrp = bpf_cgroup_from_id(cpuc->cur_cgid); ++ if (!cgrp) { ++ stat_inc(FCG_STAT_CNS_GONE); ++ goto pick_next_cgroup; ++ } ++ ++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); ++ if (cgc) { ++ /* ++ * We want to update the vtime delta and then look for the next ++ * cgroup to execute but the latter needs to be done in a loop ++ * and we can't keep the lock held. Oh well... ++ */ ++ bpf_spin_lock(&cgv_tree_lock); ++ __sync_fetch_and_add(&cgc->cvtime_delta, ++ (cpuc->cur_at + cgrp_slice_ns - now) * ++ FCG_HWEIGHT_ONE / (cgc->hweight ?: 1)); ++ bpf_spin_unlock(&cgv_tree_lock); ++ } else { ++ stat_inc(FCG_STAT_CNS_GONE); ++ } ++ ++ bpf_cgroup_release(cgrp); ++ ++pick_next_cgroup: ++ cpuc->cur_at = now; ++ ++ if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { ++ cpuc->cur_cgid = 0; ++ return; ++ } ++ ++ bpf_repeat(CGROUP_MAX_RETRIES) { ++ if (try_pick_next_cgroup(&cpuc->cur_cgid)) { ++ picked_next = true; ++ break; ++ } ++ } ++ ++ /* ++ * This only happens if try_pick_next_cgroup() races against enqueue ++ * path for more than CGROUP_MAX_RETRIES times, which is extremely ++ * unlikely and likely indicates an underlying bug. There shouldn't be ++ * any stall risk as the race is against enqueue. ++ */ ++ if (!picked_next) ++ stat_inc(FCG_STAT_PNC_FAIL); ++} ++ ++s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, ++ struct scx_init_task_args *args) ++{ ++ struct fcg_task_ctx *taskc; ++ struct fcg_cgrp_ctx *cgc; ++ ++ /* ++ * @p is new. Let's ensure that its task_ctx is available. We can sleep ++ * in this function and the following will automatically use GFP_KERNEL. 
++ */ ++ taskc = bpf_task_storage_get(&task_ctx, p, 0, ++ BPF_LOCAL_STORAGE_GET_F_CREATE); ++ if (!taskc) ++ return -ENOMEM; ++ ++ taskc->bypassed_at = 0; ++ ++ if (!(cgc = find_cgrp_ctx(args->cgroup))) ++ return -ENOENT; ++ ++ p->scx.dsq_vtime = cgc->tvtime_now; ++ ++ return 0; ++} ++ ++int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, ++ struct scx_cgroup_init_args *args) ++{ ++ struct fcg_cgrp_ctx *cgc; ++ struct cgv_node *cgv_node; ++ struct cgv_node_stash empty_stash = {}, *stash; ++ u64 cgid = cgrp->kn->id; ++ int ret; ++ ++ /* ++ * Technically incorrect as cgroup ID is full 64bit while dq ID is ++ * 63bit. Should not be a problem in practice and easy to spot in the ++ * unlikely case that it breaks. ++ */ ++ ret = scx_bpf_create_dsq(cgid, -1); ++ if (ret) ++ return ret; ++ ++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, ++ BPF_LOCAL_STORAGE_GET_F_CREATE); ++ if (!cgc) { ++ ret = -ENOMEM; ++ goto err_destroy_dsq; ++ } ++ ++ cgc->weight = args->weight; ++ cgc->hweight = FCG_HWEIGHT_ONE; ++ ++ ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash, ++ BPF_NOEXIST); ++ if (ret) { ++ if (ret != -ENOMEM) ++ scx_bpf_error("unexpected stash creation error (%d)", ++ ret); ++ goto err_destroy_dsq; ++ } ++ ++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); ++ if (!stash) { ++ scx_bpf_error("unexpected cgv_node stash lookup failure"); ++ ret = -ENOENT; ++ goto err_destroy_dsq; ++ } ++ ++ cgv_node = bpf_obj_new(struct cgv_node); ++ if (!cgv_node) { ++ ret = -ENOMEM; ++ goto err_del_cgv_node; ++ } ++ ++ cgv_node->cgid = cgid; ++ cgv_node->cvtime = cvtime_now; ++ ++ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); ++ if (cgv_node) { ++ scx_bpf_error("unexpected !NULL cgv_node stash"); ++ ret = -EBUSY; ++ goto err_drop; ++ } ++ ++ return 0; ++ ++err_drop: ++ bpf_obj_drop(cgv_node); ++err_del_cgv_node: ++ bpf_map_delete_elem(&cgv_node_stash, &cgid); ++err_destroy_dsq: ++ scx_bpf_destroy_dsq(cgid); ++ return ret; ++} ++ ++void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) ++{ ++ u64 cgid = cgrp->kn->id; ++ ++ /* ++ * For now, there's no way find and remove the cgv_node if it's on the ++ * cgv_tree. Let's drain them in the dispatch path as they get popped ++ * off the front of the tree. 
++ */ ++ bpf_map_delete_elem(&cgv_node_stash, &cgid); ++ scx_bpf_destroy_dsq(cgid); ++} ++ ++void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, ++ struct cgroup *from, struct cgroup *to) ++{ ++ struct fcg_cgrp_ctx *from_cgc, *to_cgc; ++ s64 vtime_delta; ++ ++ /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */ ++ if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to))) ++ return; ++ ++ vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now; ++ p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; ++} ++ ++void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++SCX_OPS_DEFINE(flatcg_ops, ++ .select_cpu = (void *)fcg_select_cpu, ++ .enqueue = (void *)fcg_enqueue, ++ .dispatch = (void *)fcg_dispatch, ++ .runnable = (void *)fcg_runnable, ++ .running = (void *)fcg_running, ++ .stopping = (void *)fcg_stopping, ++ .quiescent = (void *)fcg_quiescent, ++ .init_task = (void *)fcg_init_task, ++ .cgroup_set_weight = (void *)fcg_cgroup_set_weight, ++ .cgroup_init = (void *)fcg_cgroup_init, ++ .cgroup_exit = (void *)fcg_cgroup_exit, ++ .cgroup_move = (void *)fcg_cgroup_move, ++ .exit = (void *)fcg_exit, ++ .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, ++ .name = "flatcg"); +diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c +new file mode 100644 +index 000000000000..5d24ca9c29d9 +--- /dev/null ++++ b/tools/sched_ext/scx_flatcg.c +@@ -0,0 +1,233 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2023 Tejun Heo ++ * Copyright (c) 2023 David Vernet ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "scx_flatcg.h" ++#include "scx_flatcg.bpf.skel.h" ++ ++#ifndef FILEID_KERNFS ++#define FILEID_KERNFS 0xfe ++#endif ++ ++const char help_fmt[] = ++"A flattened cgroup hierarchy sched_ext scheduler.\n" ++"\n" ++"See the top-level comment in .bpf.c for more details.\n" ++"\n" ++"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n" ++"\n" ++" -s SLICE_US Override slice duration\n" ++" -i INTERVAL Report interval\n" ++" -f Use FIFO scheduling instead of weighted vtime scheduling\n" ++" -v Print libbpf debug messages\n" ++" -h Display this help and exit\n"; ++ ++static bool verbose; ++static volatile int exit_req; ++ ++static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) ++{ ++ if (level == LIBBPF_DEBUG && !verbose) ++ return 0; ++ return vfprintf(stderr, format, args); ++} ++ ++static void sigint_handler(int dummy) ++{ ++ exit_req = 1; ++} ++ ++static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) ++{ ++ FILE *fp; ++ char buf[4096]; ++ char *line, *cur = NULL, *tok; ++ __u64 sum = 0, idle = 0; ++ __u64 delta_sum, delta_idle; ++ int idx; ++ ++ fp = fopen("/proc/stat", "r"); ++ if (!fp) { ++ perror("fopen(\"/proc/stat\")"); ++ return 0.0; ++ } ++ ++ if (!fgets(buf, sizeof(buf), fp)) { ++ perror("fgets(\"/proc/stat\")"); ++ fclose(fp); ++ return 0.0; ++ } ++ fclose(fp); ++ ++ line = buf; ++ for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) { ++ char *endp = NULL; ++ __u64 v; ++ ++ if (idx == 0) { ++ line = NULL; ++ continue; ++ } ++ v = strtoull(tok, &endp, 0); ++ if (!endp || *endp != '\0') { ++ fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n", ++ idx, tok); ++ continue; ++ } ++ sum += v; ++ if (idx == 4) ++ idle = v; ++ } ++ ++ delta_sum = sum - *last_sum; ++ delta_idle = idle - 
*last_idle; ++ *last_sum = sum; ++ *last_idle = idle; ++ ++ return delta_sum ? (float)(delta_sum - delta_idle) / delta_sum : 0.0; ++} ++ ++static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats) ++{ ++ __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; ++ __u32 idx; ++ ++ memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS); ++ ++ for (idx = 0; idx < FCG_NR_STATS; idx++) { ++ int ret, cpu; ++ ++ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), ++ &idx, cnts[idx]); ++ if (ret < 0) ++ continue; ++ for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++) ++ stats[idx] += cnts[idx][cpu]; ++ } ++} ++ ++int main(int argc, char **argv) ++{ ++ struct scx_flatcg *skel; ++ struct bpf_link *link; ++ struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; ++ bool dump_cgrps = false; ++ __u64 last_cpu_sum = 0, last_cpu_idle = 0; ++ __u64 last_stats[FCG_NR_STATS] = {}; ++ unsigned long seq = 0; ++ __s32 opt; ++ __u64 ecode; ++ ++ libbpf_set_print(libbpf_print_fn); ++ signal(SIGINT, sigint_handler); ++ signal(SIGTERM, sigint_handler); ++restart: ++ skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); ++ ++ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); ++ ++ while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { ++ double v; ++ ++ switch (opt) { ++ case 's': ++ v = strtod(optarg, NULL); ++ skel->rodata->cgrp_slice_ns = v * 1000; ++ break; ++ case 'i': ++ v = strtod(optarg, NULL); ++ intv_ts.tv_sec = v; ++ intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; ++ break; ++ case 'd': ++ dump_cgrps = true; ++ break; ++ case 'f': ++ skel->rodata->fifo_sched = true; ++ break; ++ case 'v': ++ verbose = true; ++ break; ++ case 'h': ++ default: ++ fprintf(stderr, help_fmt, basename(argv[0])); ++ return opt != 'h'; ++ } ++ } ++ ++ printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", ++ (double)skel->rodata->cgrp_slice_ns / 1000000.0, ++ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, ++ dump_cgrps); ++ ++ SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei); ++ link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg); ++ ++ while (!exit_req && !UEI_EXITED(skel, uei)) { ++ __u64 acc_stats[FCG_NR_STATS]; ++ __u64 stats[FCG_NR_STATS]; ++ float cpu_util; ++ int i; ++ ++ cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle); ++ ++ fcg_read_stats(skel, acc_stats); ++ for (i = 0; i < FCG_NR_STATS; i++) ++ stats[i] = acc_stats[i] - last_stats[i]; ++ ++ memcpy(last_stats, acc_stats, sizeof(acc_stats)); ++ ++ printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", ++ seq++, cpu_util * 100.0, skel->data->hweight_gen); ++ printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n", ++ stats[FCG_STAT_ACT], ++ stats[FCG_STAT_DEACT], ++ stats[FCG_STAT_GLOBAL], ++ stats[FCG_STAT_LOCAL]); ++ printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n", ++ stats[FCG_STAT_HWT_CACHE], ++ stats[FCG_STAT_HWT_UPDATES], ++ stats[FCG_STAT_HWT_SKIP], ++ stats[FCG_STAT_HWT_RACE]); ++ printf("ENQ skip:%6llu race:%6llu\n", ++ stats[FCG_STAT_ENQ_SKIP], ++ stats[FCG_STAT_ENQ_RACE]); ++ printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", ++ stats[FCG_STAT_CNS_KEEP], ++ stats[FCG_STAT_CNS_EXPIRE], ++ stats[FCG_STAT_CNS_EMPTY], ++ stats[FCG_STAT_CNS_GONE]); ++ printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n", ++ stats[FCG_STAT_PNC_NEXT], ++ stats[FCG_STAT_PNC_EMPTY], ++ stats[FCG_STAT_PNC_NO_CGRP], ++ stats[FCG_STAT_PNC_GONE], ++ stats[FCG_STAT_PNC_RACE], ++ stats[FCG_STAT_PNC_FAIL]); ++ printf("BAD remove:%6llu\n", ++ acc_stats[FCG_STAT_BAD_REMOVAL]); ++ fflush(stdout); ++ ++ 
nanosleep(&intv_ts, NULL); ++ } ++ ++ bpf_link__destroy(link); ++ ecode = UEI_REPORT(skel, uei); ++ scx_flatcg__destroy(skel); ++ ++ if (UEI_ECODE_RESTART(ecode)) ++ goto restart; ++ return 0; ++} +diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h +new file mode 100644 +index 000000000000..6f2ea50acb1c +--- /dev/null ++++ b/tools/sched_ext/scx_flatcg.h +@@ -0,0 +1,51 @@ ++#ifndef __SCX_EXAMPLE_FLATCG_H ++#define __SCX_EXAMPLE_FLATCG_H ++ ++enum { ++ FCG_HWEIGHT_ONE = 1LLU << 16, ++}; ++ ++enum fcg_stat_idx { ++ FCG_STAT_ACT, ++ FCG_STAT_DEACT, ++ FCG_STAT_LOCAL, ++ FCG_STAT_GLOBAL, ++ ++ FCG_STAT_HWT_UPDATES, ++ FCG_STAT_HWT_CACHE, ++ FCG_STAT_HWT_SKIP, ++ FCG_STAT_HWT_RACE, ++ ++ FCG_STAT_ENQ_SKIP, ++ FCG_STAT_ENQ_RACE, ++ ++ FCG_STAT_CNS_KEEP, ++ FCG_STAT_CNS_EXPIRE, ++ FCG_STAT_CNS_EMPTY, ++ FCG_STAT_CNS_GONE, ++ ++ FCG_STAT_PNC_NO_CGRP, ++ FCG_STAT_PNC_NEXT, ++ FCG_STAT_PNC_EMPTY, ++ FCG_STAT_PNC_GONE, ++ FCG_STAT_PNC_RACE, ++ FCG_STAT_PNC_FAIL, ++ ++ FCG_STAT_BAD_REMOVAL, ++ ++ FCG_NR_STATS, ++}; ++ ++struct fcg_cgrp_ctx { ++ u32 nr_active; ++ u32 nr_runnable; ++ u32 queued; ++ u32 weight; ++ u32 hweight; ++ u64 child_weight_sum; ++ u64 hweight_gen; ++ s64 cvtime_delta; ++ u64 tvtime_now; ++}; ++ ++#endif /* __SCX_EXAMPLE_FLATCG_H */ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c new file mode 100644 -index 000000000000..892278f12dce +index 000000000000..391d80b4ac8e --- /dev/null +++ b/tools/sched_ext/scx_qmap.bpf.c -@@ -0,0 +1,706 @@ +@@ -0,0 +1,813 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple five-level FIFO queue scheduler. @@ -10563,6 +12935,8 @@ index 000000000000..892278f12dce +enum consts { + ONE_SEC_IN_NS = 1000000000, + SHARED_DSQ = 0, ++ HIGHPRI_DSQ = 1, ++ HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */ +}; + +char _license[] SEC("license") = "GPL"; @@ -10572,10 +12946,12 @@ index 000000000000..892278f12dce +const volatile u32 stall_kernel_nth; +const volatile u32 dsp_inf_loop_after; +const volatile u32 dsp_batch; ++const volatile bool highpri_boosting; +const volatile bool print_shared_dsq; +const volatile s32 disallow_tgid; +const volatile bool suppress_dump; + ++u64 nr_highpri_queued; +u32 test_error_cnt; + +UEI_DEFINE(uei); @@ -10631,6 +13007,7 @@ index 000000000000..892278f12dce +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local_dsq */ ++ bool highpri; + u64 core_sched_seq; +}; + @@ -10658,6 +13035,7 @@ index 000000000000..892278f12dce +/* Statistics */ +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq; +u64 nr_core_sched_execed; ++u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer; +u32 cpuperf_min, cpuperf_avg, cpuperf_max; +u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; + @@ -10676,17 +13054,25 @@ index 000000000000..892278f12dce + return -1; +} + ++static struct task_ctx *lookup_task_ctx(struct task_struct *p) ++{ ++ struct task_ctx *tctx; ++ ++ if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return NULL; ++ } ++ return tctx; ++} ++ +s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); ++ if (!(tctx = lookup_task_ctx(p))) + return -ESRCH; -+ } + + cpu = pick_direct_dispatch_cpu(p, prev_cpu); + @@ -10733,11 
+13119,8 @@ index 000000000000..892278f12dce + if (test_error_cnt && !--test_error_cnt) + scx_bpf_error("test triggering error"); + -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); ++ if (!(tctx = lookup_task_ctx(p))) + return; -+ } + + /* + * All enqueued tasks must have their core_sched_seq updated for correct @@ -10792,6 +13175,10 @@ index 000000000000..892278f12dce + return; + } + ++ if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) { ++ tctx->highpri = true; ++ __sync_fetch_and_add(&nr_highpri_queued, 1); ++ } + __sync_fetch_and_add(&nr_enqueued, 1); +} + @@ -10808,13 +13195,80 @@ index 000000000000..892278f12dce + +static void update_core_sched_head_seq(struct task_struct *p) +{ -+ struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + int idx = weight_to_idx(p->scx.weight); ++ struct task_ctx *tctx; + -+ if (tctx) ++ if ((tctx = lookup_task_ctx(p))) + core_sched_head_seqs[idx] = tctx->core_sched_seq; -+ else -+ scx_bpf_error("task_ctx lookup failed"); ++} ++ ++/* ++ * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly ++ * selective priority boosting mechanism by scanning SHARED_DSQ looking for ++ * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This ++ * makes minor difference only when dsp_batch is larger than 1. ++ * ++ * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and ++ * non-rq-lock holding BPF programs. As demonstration, this function is called ++ * from qmap_dispatch() and monitor_timerfn(). ++ */ ++static bool dispatch_highpri(bool from_timer) ++{ ++ struct task_struct *p; ++ s32 this_cpu = bpf_get_smp_processor_id(); ++ ++ /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */ ++ bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) { ++ static u64 highpri_seq; ++ struct task_ctx *tctx; ++ ++ if (!(tctx = lookup_task_ctx(p))) ++ return false; ++ ++ if (tctx->highpri) { ++ /* exercise the set_*() and vtime interface too */ ++ scx_bpf_dispatch_from_dsq_set_slice( ++ BPF_FOR_EACH_ITER, slice_ns * 2); ++ scx_bpf_dispatch_from_dsq_set_vtime( ++ BPF_FOR_EACH_ITER, highpri_seq++); ++ scx_bpf_dispatch_vtime_from_dsq( ++ BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); ++ } ++ } ++ ++ /* ++ * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU ++ * is found. ++ */ ++ bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) { ++ bool dispatched = false; ++ s32 cpu; ++ ++ if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr)) ++ cpu = this_cpu; ++ else ++ cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); ++ ++ if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, ++ SCX_DSQ_LOCAL_ON | cpu, ++ SCX_ENQ_PREEMPT)) { ++ if (cpu == this_cpu) { ++ dispatched = true; ++ __sync_fetch_and_add(&nr_expedited_local, 1); ++ } else { ++ __sync_fetch_and_add(&nr_expedited_remote, 1); ++ } ++ if (from_timer) ++ __sync_fetch_and_add(&nr_expedited_from_timer, 1); ++ } else { ++ __sync_fetch_and_add(&nr_expedited_lost, 1); ++ } ++ ++ if (dispatched) ++ return true; ++ } ++ ++ return false; +} + +void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) @@ -10825,7 +13279,10 @@ index 000000000000..892278f12dce + void *fifo; + s32 i, pid; + -+ if (scx_bpf_consume(SHARED_DSQ)) ++ if (dispatch_highpri(false)) ++ return; ++ ++ if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ)) + return; + + if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { @@ -10862,6 +13319,8 @@ index 000000000000..892278f12dce + + /* Dispatch or advance. 
*/ + bpf_repeat(BPF_MAX_LOOPS) { ++ struct task_ctx *tctx; ++ + if (bpf_map_pop_elem(fifo, &pid)) + break; + @@ -10869,13 +13328,25 @@ index 000000000000..892278f12dce + if (!p) + continue; + ++ if (!(tctx = lookup_task_ctx(p))) { ++ bpf_task_release(p); ++ return; ++ } ++ ++ if (tctx->highpri) ++ __sync_fetch_and_sub(&nr_highpri_queued, 1); ++ + update_core_sched_head_seq(p); + __sync_fetch_and_add(&nr_dispatched, 1); ++ + scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); + bpf_task_release(p); ++ + batch--; + cpuc->dsp_cnt--; + if (!batch || !scx_bpf_dispatch_nr_slots()) { ++ if (dispatch_highpri(false)) ++ return; + scx_bpf_consume(SHARED_DSQ); + return; + } @@ -11185,6 +13656,10 @@ index 000000000000..892278f12dce + +static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) +{ ++ bpf_rcu_read_lock(); ++ dispatch_highpri(true); ++ bpf_rcu_read_unlock(); ++ + monitor_cpuperf(); + + if (print_shared_dsq) @@ -11206,6 +13681,10 @@ index 000000000000..892278f12dce + if (ret) + return ret; + ++ ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1); ++ if (ret) ++ return ret; ++ + timer = bpf_map_lookup_elem(&monitor_timer, &key); + if (!timer) + return -ESRCH; @@ -11242,10 +13721,10 @@ index 000000000000..892278f12dce + .name = "qmap"); diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c new file mode 100644 -index 000000000000..c9ca30d62b2b +index 000000000000..ac45a02b4055 --- /dev/null +++ b/tools/sched_ext/scx_qmap.c -@@ -0,0 +1,144 @@ +@@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. @@ -11277,6 +13756,7 @@ index 000000000000..c9ca30d62b2b +" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" +" -b COUNT Dispatch upto COUNT tasks together\n" +" -P Print out DSQ content to trace_pipe every second, use with -b\n" ++" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" +" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" +" -D LEN Set scx_exit_info.dump buffer length\n" +" -S Suppress qmap-specific debug dump\n" @@ -11311,7 +13791,7 @@ index 000000000000..c9ca30d62b2b + + skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); + -+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) { ++ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -11334,6 +13814,9 @@ index 000000000000..c9ca30d62b2b + case 'P': + skel->rodata->print_shared_dsq = true; + break; ++ case 'H': ++ skel->rodata->highpri_boosting = true; ++ break; + case 'd': + skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); + if (skel->rodata->disallow_tgid < 0) @@ -11369,6 +13852,11 @@ index 000000000000..c9ca30d62b2b + skel->bss->nr_reenqueued, skel->bss->nr_dequeued, + skel->bss->nr_core_sched_execed, + skel->bss->nr_ddsp_from_enq); ++ printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n", ++ skel->bss->nr_expedited_local, ++ skel->bss->nr_expedited_remote, ++ skel->bss->nr_expedited_from_timer, ++ skel->bss->nr_expedited_lost); + if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) + printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", + skel->bss->cpuperf_min, @@ -13322,10 +15810,10 @@ index 000000000000..97d45f1e5597 +REGISTER_SCX_TEST(&init_enable_count) diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c new file mode 100644 -index 000000000000..44612fdaf399 +index 
000000000000..00bfa9cb95d3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c -@@ -0,0 +1,132 @@ +@@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler with every callback defined. @@ -13423,6 +15911,32 @@ index 000000000000..44612fdaf399 +void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) +{} + ++s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp, ++ struct scx_cgroup_init_args *args) ++{ ++ return 0; ++} ++ ++void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp) ++{} ++ ++s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p, ++ struct cgroup *from, struct cgroup *to) ++{ ++ return 0; ++} ++ ++void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p, ++ struct cgroup *from, struct cgroup *to) ++{} ++ ++void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, ++ struct cgroup *from, struct cgroup *to) ++{} ++ ++void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) ++{} ++ +s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) +{ + return 0; @@ -13454,6 +15968,12 @@ index 000000000000..44612fdaf399 + .enable = maximal_enable, + .exit_task = maximal_exit_task, + .disable = maximal_disable, ++ .cgroup_init = maximal_cgroup_init, ++ .cgroup_exit = maximal_cgroup_exit, ++ .cgroup_prep_move = maximal_cgroup_prep_move, ++ .cgroup_move = maximal_cgroup_move, ++ .cgroup_cancel_move = maximal_cgroup_cancel_move, ++ .cgroup_set_weight = maximal_cgroup_set_weight, + .init = maximal_init, + .exit = maximal_exit, + .name = "maximal", @@ -13770,10 +16290,10 @@ index 000000000000..6c5db8ebbf8a +REGISTER_SCX_TEST(&minimal) diff --git a/tools/testing/selftests/sched_ext/prog_run.bpf.c b/tools/testing/selftests/sched_ext/prog_run.bpf.c new file mode 100644 -index 000000000000..fd2c8f12af16 +index 000000000000..6a4d7c48e3f2 --- /dev/null +++ b/tools/testing/selftests/sched_ext/prog_run.bpf.c -@@ -0,0 +1,32 @@ +@@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that we can invoke sched_ext kfuncs in @@ -13792,6 +16312,7 @@ index 000000000000..fd2c8f12af16 +SEC("syscall") +int BPF_PROG(prog_run_syscall) +{ ++ scx_bpf_create_dsq(0, -1); + scx_bpf_exit(0xdeadbeef, "Exited from PROG_RUN"); + return 0; +} @@ -15261,4 +17782,4 @@ index 000000000000..bc13dfec1267 + +#endif // __SCX_TEST_H__ -- -2.46.0.rc1 +2.45.2.606.g9005149a4a diff --git a/patches/0003-bore-cachy-ext.patch b/patches/0003-bore-cachy-ext.patch index 111734a..6a54611 100644 --- a/patches/0003-bore-cachy-ext.patch +++ b/patches/0003-bore-cachy-ext.patch @@ -1,6 +1,6 @@ -From 8748a0aa2fc51c5a22d17c4da434f12e91b4d211 Mon Sep 17 00:00:00 2001 +From 35259c1c06596a086582bb3c63461b039e1e517d Mon Sep 17 00:00:00 2001 From: Piotr Gorski -Date: Mon, 12 Aug 2024 13:44:47 +0200 +Date: Fri, 13 Sep 2024 14:15:05 +0200 Subject: [PATCH] bore-cachy-ext Signed-off-by: Piotr Gorski @@ -8,12 +8,12 @@ Signed-off-by: Piotr Gorski include/linux/sched.h | 10 ++ init/Kconfig | 17 ++ kernel/Kconfig.hz | 16 ++ - kernel/sched/core.c | 143 +++++++++++++++ + kernel/sched/core.c | 141 +++++++++++++++ kernel/sched/debug.c | 60 ++++++- - kernel/sched/fair.c | 388 +++++++++++++++++++++++++++++++++++++--- + kernel/sched/fair.c | 379 +++++++++++++++++++++++++++++++++++++--- kernel/sched/features.h | 20 ++- kernel/sched/sched.h | 7 + - 8 files changed, 634 insertions(+), 27 deletions(-) + 8 files changed, 623 insertions(+), 27 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5b4f78fe3..b9e5ea2aa 100644 @@ 
-37,10 +37,10 @@ index 5b4f78fe3..b9e5ea2aa 100644 u64 slice; diff --git a/init/Kconfig b/init/Kconfig -index e24741512..511a13dcd 100644 +index e1a88d48d..3aea8e43c 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -1317,6 +1317,23 @@ config CHECKPOINT_RESTORE +@@ -1327,6 +1327,23 @@ config CHECKPOINT_RESTORE If unsure, say N here. @@ -91,10 +91,10 @@ index 0f78364ef..b50189ee5 100644 config SCHED_HRTICK def_bool HIGH_RES_TIMERS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 7ba808949..8c010f1f5 100644 +index c792a6feb..dfb93c5f7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4336,6 +4336,138 @@ int wake_up_state(struct task_struct *p, unsigned int state) +@@ -4336,6 +4336,136 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } @@ -125,7 +125,7 @@ index 7ba808949..8c010f1f5 100644 + return cnt; +} + -+static inline bool task_is_inheritable(struct task_struct *p) { ++static inline bool task_burst_inheritable(struct task_struct *p) { + return (p->sched_class == &fair_sched_class); +} + @@ -136,7 +136,7 @@ index 7ba808949..8c010f1f5 100644 +} + +static void __update_child_burst_cache( -+ struct task_struct *p, u32 cnt, u32 sum, u64 now) { ++ struct task_struct *p, u32 cnt, u32 sum, u64 now) { + u8 avg = 0; + if (cnt) avg = sum / cnt; + p->se.child_burst = max(avg, p->se.burst_penalty); @@ -146,11 +146,10 @@ index 7ba808949..8c010f1f5 100644 + +static inline void update_child_burst_direct(struct task_struct *p, u64 now) { + struct task_struct *child; -+ u32 cnt = 0; -+ u32 sum = 0; ++ u32 cnt = 0, sum = 0; + + list_for_each_entry(child, &p->children, sibling) { -+ if (!task_is_inheritable(child)) continue; ++ if (!task_burst_inheritable(child)) continue; + cnt++; + sum += child->se.burst_penalty; + } @@ -169,8 +168,7 @@ index 7ba808949..8c010f1f5 100644 +static void update_child_burst_topological( + struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { + struct task_struct *child, *dec; -+ u32 cnt = 0, dcnt = 0; -+ u32 sum = 0; ++ u32 cnt = 0, dcnt = 0, sum = 0; + + list_for_each_entry(child, &p->children, sibling) { + dec = child; @@ -178,7 +176,7 @@ index 7ba808949..8c010f1f5 100644 + dec = list_first_entry(&dec->children, struct task_struct, sibling); + + if (!dcnt || !depth) { -+ if (!task_is_inheritable(dec)) continue; ++ if (!task_burst_inheritable(dec)) continue; + cnt++; + sum += dec->se.burst_penalty; + continue; @@ -224,7 +222,7 @@ index 7ba808949..8c010f1f5 100644 +} + +static void sched_post_fork_bore(struct task_struct *p) { -+ if (p->sched_class == &fair_sched_class) ++ if (task_burst_inheritable(p)) + inherit_burst(p); + p->se.burst_penalty = p->se.prev_burst_penalty; +} @@ -233,7 +231,7 @@ index 7ba808949..8c010f1f5 100644 /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
-@@ -4352,6 +4484,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4352,6 +4482,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; @@ -243,7 +241,7 @@ index 7ba808949..8c010f1f5 100644 p->se.vlag = 0; p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node); -@@ -4686,6 +4821,9 @@ void sched_cancel_fork(struct task_struct *p) +@@ -4686,6 +4819,9 @@ void sched_cancel_fork(struct task_struct *p) void sched_post_fork(struct task_struct *p) { @@ -253,13 +251,13 @@ index 7ba808949..8c010f1f5 100644 uclamp_post_fork(p); scx_post_fork(p); } -@@ -8285,6 +8423,11 @@ void __init sched_init(void) +@@ -8283,6 +8419,11 @@ void __init sched_init(void) BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.2.10 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.2.11 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); @@ -373,7 +371,7 @@ index c057ef46c..3cab39e34 100644 P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 2928026d7..64987a5d1 100644 +index 2928026d7..f7040962b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -386,7 +384,7 @@ index 2928026d7..64987a5d1 100644 */ #include #include -@@ -64,28 +67,182 @@ +@@ -64,28 +67,174 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * @@ -447,8 +445,7 @@ index 2928026d7..64987a5d1 100644 + +static inline u32 log2plus1_u64_u32f8(u64 v) { + u32 msb = fls64(v); -+ s32 excess_bits = msb - 9; -+ u8 fractional = (0 <= excess_bits)? v >> excess_bits: v << -excess_bits; ++ u8 fractional = (v << (64 - msb) >> 55); + return msb << 8 | fractional; +} + @@ -487,7 +484,6 @@ index 2928026d7..64987a5d1 100644 + +static inline u8 effective_prio(struct task_struct *p) { + u8 prio = p->static_prio - MAX_RT_PRIO; -+ + if (likely(sched_bore)) + prio += p->se.burst_score; + return min(39, prio); @@ -499,9 +495,8 @@ index 2928026d7..64987a5d1 100644 + u8 prev_prio = effective_prio(p); + + u8 burst_score = 0; -+ if (!(sched_burst_exclude_kthreads && (p->flags & PF_KTHREAD))) ++ if (!((p->flags & PF_KTHREAD) && likely(sched_burst_exclude_kthreads))) + burst_score = se->burst_penalty >> 2; -+ + se->burst_score = burst_score; + + u8 new_prio = effective_prio(p); @@ -516,10 +511,10 @@ index 2928026d7..64987a5d1 100644 +} + +static inline u32 binary_smooth(u32 new, u32 old) { -+ int increment = new - old; -+ return (0 <= increment)? -+ old + ( increment >> (int)sched_burst_smoothness_long): -+ old - (-increment >> (int)sched_burst_smoothness_short); ++ int increment = new - old; ++ return (0 <= increment)? 
++ old + ( increment >> (int)sched_burst_smoothness_long): ++ old - (-increment >> (int)sched_burst_smoothness_short); +} + +static void restart_burst(struct sched_entity *se) { @@ -546,23 +541,18 @@ index 2928026d7..64987a5d1 100644 +} + +static void reset_task_weights_bore(void) { -+ struct task_struct *task; -+ struct rq *rq; -+ struct rq_flags rf; ++ struct task_struct *task; ++ struct rq *rq; ++ struct rq_flags rf; + -+ write_lock_irq(&tasklist_lock); -+ -+ for_each_process(task) { -+ rq = task_rq(task); -+ -+ rq_lock_irqsave(rq, &rf); -+ -+ reweight_task_by_prio(task, effective_prio(task)); -+ -+ rq_unlock_irqrestore(rq, &rf); -+ } -+ -+ write_unlock_irq(&tasklist_lock); ++ write_lock_irq(&tasklist_lock); ++ for_each_process(task) { ++ rq = task_rq(task); ++ rq_lock_irqsave(rq, &rf); ++ reweight_task_by_prio(task, effective_prio(task)); ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ write_unlock_irq(&tasklist_lock); +} + +int sched_bore_update_handler(const struct ctl_table *table, int write, @@ -580,7 +570,7 @@ index 2928026d7..64987a5d1 100644 static int __init setup_sched_thermal_decay_shift(char *str) { -@@ -130,12 +287,8 @@ int __weak arch_asym_cpu_priority(int cpu) +@@ -130,12 +279,8 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ @@ -593,7 +583,7 @@ index 2928026d7..64987a5d1 100644 #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ -@@ -144,6 +297,92 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +@@ -144,6 +289,92 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { @@ -686,7 +676,7 @@ index 2928026d7..64987a5d1 100644 #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", -@@ -201,6 +440,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) +@@ -201,6 +432,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ @@ -700,7 +690,7 @@ index 2928026d7..64987a5d1 100644 static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); -@@ -231,6 +477,7 @@ static void update_sysctl(void) +@@ -231,6 +469,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } @@ -708,18 +698,17 @@ index 2928026d7..64987a5d1 100644 void __init sched_init_granularity(void) { -@@ -708,6 +955,10 @@ static s64 entity_lag(u64 avruntime, struct sched_entity *se) +@@ -708,6 +947,9 @@ static s64 entity_lag(u64 avruntime, struct sched_entity *se) vlag = avruntime - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); +#ifdef CONFIG_SCHED_BORE -+ if (likely(sched_bore)) -+ limit >>= 1; ++ limit >>= !!sched_bore; +#endif // CONFIG_SCHED_BORE return clamp(vlag, -limit, limit); } -@@ -868,6 +1119,39 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +@@ -868,6 +1110,39 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) return __node_2_se(left); } @@ -759,7 +748,7 @@ index 2928026d7..64987a5d1 100644 /* * Earliest Eligible Virtual Deadline First * -@@ -887,28 +1171,27 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +@@ -887,28 +1162,27 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) * * Which allows tree pruning through eligibility. 
*/ @@ -795,7 +784,7 @@ index 2928026d7..64987a5d1 100644 return curr; /* Pick the leftmost entity if it's eligible */ -@@ -967,6 +1250,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +@@ -967,6 +1241,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ #ifdef CONFIG_SMP @@ -803,7 +792,7 @@ index 2928026d7..64987a5d1 100644 int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); -@@ -978,6 +1262,7 @@ int sched_update_scaling(void) +@@ -978,6 +1253,7 @@ int sched_update_scaling(void) return 0; } @@ -811,7 +800,7 @@ index 2928026d7..64987a5d1 100644 #endif #endif -@@ -1178,6 +1463,10 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -1178,6 +1454,10 @@ static void update_curr(struct cfs_rq *cfs_rq) if (unlikely(delta_exec <= 0)) return; @@ -822,7 +811,7 @@ index 2928026d7..64987a5d1 100644 curr->vruntime += calc_delta_fair(delta_exec, curr); update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); -@@ -5193,6 +5482,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5193,6 +5473,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) s64 lag = 0; se->slice = sysctl_sched_base_slice; @@ -835,7 +824,7 @@ index 2928026d7..64987a5d1 100644 vslice = calc_delta_fair(se->slice, se); /* -@@ -5203,6 +5498,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5203,6 +5489,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ @@ -845,7 +834,7 @@ index 2928026d7..64987a5d1 100644 if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; -@@ -5278,6 +5576,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5278,6 +5567,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * on average, halfway through their slice, as such start tasks * off with half a slice to ease into the competition. */ @@ -859,7 +848,7 @@ index 2928026d7..64987a5d1 100644 if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) vslice /= 2; -@@ -5492,7 +5797,7 @@ pick_next_entity(struct cfs_rq *cfs_rq) +@@ -5492,7 +5788,7 @@ pick_next_entity(struct cfs_rq *cfs_rq) cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) return cfs_rq->next; @@ -868,7 +857,7 @@ index 2928026d7..64987a5d1 100644 } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -6860,6 +7165,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6860,6 +7156,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) bool was_sched_idle = sched_idle_rq(rq); util_est_dequeue(&rq->cfs, p); @@ -883,7 +872,7 @@ index 2928026d7..64987a5d1 100644 for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); -@@ -8428,7 +8741,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int +@@ -8428,7 +8732,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int /* * XXX pick_eevdf(cfs_rq) != se ? */ @@ -892,7 +881,7 @@ index 2928026d7..64987a5d1 100644 goto preempt; return; -@@ -8646,16 +8959,25 @@ static void yield_task_fair(struct rq *rq) +@@ -8646,16 +8950,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? 
*/ @@ -918,7 +907,7 @@ index 2928026d7..64987a5d1 100644 /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() -@@ -12720,6 +13042,9 @@ static void task_fork_fair(struct task_struct *p) +@@ -12720,6 +13033,9 @@ static void task_fork_fair(struct task_struct *p) curr = cfs_rq->curr; if (curr) update_curr(cfs_rq); @@ -928,7 +917,7 @@ index 2928026d7..64987a5d1 100644 place_entity(cfs_rq, se, ENQUEUE_INITIAL); rq_unlock(rq, &rf); } -@@ -13303,3 +13628,16 @@ __init void init_sched_fair_class(void) +@@ -13303,3 +13619,16 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } @@ -978,10 +967,10 @@ index 143f55df8..bfeb9f653 100644 /* * Prefer to schedule the task we woke last (assuming it failed diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 71c346fb9..c30cb4a7c 100644 +index 207a04f02..c99430161 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -2050,7 +2050,11 @@ static inline void update_sched_domain_debugfs(void) { } +@@ -2063,7 +2063,11 @@ static inline void update_sched_domain_debugfs(void) { } static inline void dirty_sched_domain_sysctl(int cpu) { } #endif @@ -993,7 +982,7 @@ index 71c346fb9..c30cb4a7c 100644 static inline const struct cpumask *task_user_cpus(struct task_struct *p) { -@@ -2705,6 +2709,9 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; +@@ -2736,6 +2740,9 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_base_slice;