Update patches/0002-sched-ext.patch

This commit is contained in:
ferreo 2024-10-10 18:22:29 +02:00
parent 20339b16e1
commit a280828878

View File

@ -1,6 +1,6 @@
From 11276ed2c72c57624c1214e980efd24648be015c Mon Sep 17 00:00:00 2001 From c0d9f38dcc2b6bb16e54e7f438c9c449319ebef4 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev> From: Peter Jung <admin@ptr1337.dev>
Date: Fri, 4 Oct 2024 17:12:13 +0200 Date: Thu, 10 Oct 2024 12:47:12 +0200
Subject: [PATCH] sched-ext Subject: [PATCH] sched-ext
Signed-off-by: Peter Jung <admin@ptr1337.dev> Signed-off-by: Peter Jung <admin@ptr1337.dev>
@ -24,7 +24,7 @@ Signed-off-by: Peter Jung <admin@ptr1337.dev>
kernel/sched/core.c | 288 +- kernel/sched/core.c | 288 +-
kernel/sched/cpufreq_schedutil.c | 50 +- kernel/sched/cpufreq_schedutil.c | 50 +-
kernel/sched/debug.c | 3 + kernel/sched/debug.c | 3 +
kernel/sched/ext.c | 7262 +++++++++++++++++ kernel/sched/ext.c | 7281 +++++++++++++++++
kernel/sched/ext.h | 91 + kernel/sched/ext.h | 91 +
kernel/sched/fair.c | 21 +- kernel/sched/fair.c | 21 +-
kernel/sched/idle.c | 2 + kernel/sched/idle.c | 2 +
@ -102,7 +102,7 @@ Signed-off-by: Peter Jung <admin@ptr1337.dev>
.../selftests/sched_ext/test_example.c | 49 + .../selftests/sched_ext/test_example.c | 49 +
tools/testing/selftests/sched_ext/util.c | 71 + tools/testing/selftests/sched_ext/util.c | 71 +
tools/testing/selftests/sched_ext/util.h | 13 + tools/testing/selftests/sched_ext/util.h | 13 +
97 files changed, 16174 insertions(+), 130 deletions(-) 97 files changed, 16193 insertions(+), 130 deletions(-)
create mode 100644 Documentation/scheduler/sched-ext.rst create mode 100644 Documentation/scheduler/sched-ext.rst
create mode 100644 include/linux/sched/ext.h create mode 100644 include/linux/sched/ext.h
create mode 100644 include/trace/events/sched_ext.h create mode 100644 include/trace/events/sched_ext.h
@ -524,10 +524,10 @@ index 000000000000..6c0d70e2e27d
+possible, they are subject to change without warning between kernel +possible, they are subject to change without warning between kernel
+versions. +versions.
diff --git a/MAINTAINERS b/MAINTAINERS diff --git a/MAINTAINERS b/MAINTAINERS
index c2a7363e86fe..bcfe36daf67a 100644 index 16df466c205d..3345a15afded 100644
--- a/MAINTAINERS --- a/MAINTAINERS
+++ b/MAINTAINERS +++ b/MAINTAINERS
@@ -20364,6 +20364,19 @@ F: include/linux/wait.h @@ -20353,6 +20353,19 @@ F: include/linux/wait.h
F: include/uapi/linux/sched.h F: include/uapi/linux/sched.h
F: kernel/sched/ F: kernel/sched/
@ -594,7 +594,7 @@ index c60ba0ab1462..7139b33cb104 100644
CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */
CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */
diff --git a/include/linux/sched.h b/include/linux/sched.h diff --git a/include/linux/sched.h b/include/linux/sched.h
index f8d150343d42..5b4f78fe379d 100644 index 1c771ea4481d..c5a7901b2580 100644
--- a/include/linux/sched.h --- a/include/linux/sched.h
+++ b/include/linux/sched.h +++ b/include/linux/sched.h
@@ -82,6 +82,8 @@ struct task_group; @@ -82,6 +82,8 @@ struct task_group;
@ -606,7 +606,7 @@ index f8d150343d42..5b4f78fe379d 100644
/* /*
* Task state bitmask. NOTE! These bits are also * Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state(). * encoded in fs/proc/array.c: get_task_state().
@@ -810,6 +812,9 @@ struct task_struct { @@ -812,6 +814,9 @@ struct task_struct {
struct sched_rt_entity rt; struct sched_rt_entity rt;
struct sched_dl_entity dl; struct sched_dl_entity dl;
struct sched_dl_entity *dl_server; struct sched_dl_entity *dl_server;
@ -1011,7 +1011,7 @@ index c2f1fd95a821..fe782cd77388 100644
+ Documentation/scheduler/sched-ext.rst + Documentation/scheduler/sched-ext.rst
+ https://github.com/sched-ext/scx + https://github.com/sched-ext/scx
diff --git a/kernel/fork.c b/kernel/fork.c diff --git a/kernel/fork.c b/kernel/fork.c
index 238695afc630..69a0a7210060 100644 index 003de4829c15..eb290420d926 100644
--- a/kernel/fork.c --- a/kernel/fork.c
+++ b/kernel/fork.c +++ b/kernel/fork.c
@@ -23,6 +23,7 @@ @@ -23,6 +23,7 @@
@ -1030,7 +1030,7 @@ index 238695afc630..69a0a7210060 100644
io_uring_free(tsk); io_uring_free(tsk);
cgroup_free(tsk); cgroup_free(tsk);
task_numa_free(tsk, true); task_numa_free(tsk, true);
@@ -2355,7 +2357,7 @@ __latent_entropy struct task_struct *copy_process( @@ -2352,7 +2354,7 @@ __latent_entropy struct task_struct *copy_process(
retval = perf_event_init_task(p, clone_flags); retval = perf_event_init_task(p, clone_flags);
if (retval) if (retval)
@ -1039,7 +1039,7 @@ index 238695afc630..69a0a7210060 100644
retval = audit_alloc(p); retval = audit_alloc(p);
if (retval) if (retval)
goto bad_fork_cleanup_perf; goto bad_fork_cleanup_perf;
@@ -2488,7 +2490,9 @@ __latent_entropy struct task_struct *copy_process( @@ -2485,7 +2487,9 @@ __latent_entropy struct task_struct *copy_process(
* cgroup specific, it unconditionally needs to place the task on a * cgroup specific, it unconditionally needs to place the task on a
* runqueue. * runqueue.
*/ */
@ -1050,7 +1050,7 @@ index 238695afc630..69a0a7210060 100644
/* /*
* From this point on we must avoid any synchronous user-space * From this point on we must avoid any synchronous user-space
@@ -2534,13 +2538,13 @@ __latent_entropy struct task_struct *copy_process( @@ -2531,13 +2535,13 @@ __latent_entropy struct task_struct *copy_process(
/* Don't start children in a dying pid namespace */ /* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM; retval = -ENOMEM;
@ -1066,7 +1066,7 @@ index 238695afc630..69a0a7210060 100644
} }
/* No more failure paths after this point. */ /* No more failure paths after this point. */
@@ -2614,10 +2618,11 @@ __latent_entropy struct task_struct *copy_process( @@ -2611,10 +2615,11 @@ __latent_entropy struct task_struct *copy_process(
return p; return p;
@ -1079,7 +1079,7 @@ index 238695afc630..69a0a7210060 100644
cgroup_cancel_fork(p, args); cgroup_cancel_fork(p, args);
bad_fork_put_pidfd: bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) { if (clone_flags & CLONE_PIDFD) {
@@ -2656,6 +2661,8 @@ __latent_entropy struct task_struct *copy_process( @@ -2653,6 +2658,8 @@ __latent_entropy struct task_struct *copy_process(
audit_free(p); audit_free(p);
bad_fork_cleanup_perf: bad_fork_cleanup_perf:
perf_event_free_task(p); perf_event_free_task(p);
@ -1128,7 +1128,7 @@ index 39c315182b35..fae1f5c921eb 100644
+ +
#include "syscalls.c" #include "syscalls.c"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f3951e4a55e5..c792a6feb7a9 100644 index 1af59cf714cd..8ae04bd4a5a4 100644
--- a/kernel/sched/core.c --- a/kernel/sched/core.c
+++ b/kernel/sched/core.c +++ b/kernel/sched/core.c
@@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p) @@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p)
@ -1353,8 +1353,8 @@ index f3951e4a55e5..c792a6feb7a9 100644
-#endif -#endif
put_prev_task(rq, prev); put_prev_task(rq, prev);
}
@@ -5800,6 +5864,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) @@ -5808,6 +5872,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
const struct sched_class *class; const struct sched_class *class;
struct task_struct *p; struct task_struct *p;
@ -1364,9 +1364,9 @@ index f3951e4a55e5..c792a6feb7a9 100644
/* /*
* Optimization: we know that if all tasks are in the fair class we can * Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a * call that function directly, but only if the @prev task wasn't of a
@@ -5840,10 +5907,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) @@ -5847,10 +5914,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (prev->dl_server) restart:
prev->dl_server = NULL; put_prev_task_balance(rq, prev, rf);
- for_each_class(class) { - for_each_class(class) {
+ for_each_active_class(class) { + for_each_active_class(class) {
@ -1382,7 +1382,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
} }
BUG(); /* The idle class should always have a runnable task. */ BUG(); /* The idle class should always have a runnable task. */
@@ -5873,7 +5945,7 @@ static inline struct task_struct *pick_task(struct rq *rq) @@ -5880,7 +5952,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
const struct sched_class *class; const struct sched_class *class;
struct task_struct *p; struct task_struct *p;
@ -1391,7 +1391,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
p = class->pick_task(rq); p = class->pick_task(rq);
if (p) if (p)
return p; return p;
@@ -6870,6 +6942,10 @@ void __setscheduler_prio(struct task_struct *p, int prio) @@ -6877,6 +6949,10 @@ void __setscheduler_prio(struct task_struct *p, int prio)
p->sched_class = &dl_sched_class; p->sched_class = &dl_sched_class;
else if (rt_prio(prio)) else if (rt_prio(prio))
p->sched_class = &rt_sched_class; p->sched_class = &rt_sched_class;
@ -1402,7 +1402,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
else else
p->sched_class = &fair_sched_class; p->sched_class = &fair_sched_class;
@@ -7015,6 +7091,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) @@ -7022,6 +7098,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
} }
__setscheduler_prio(p, prio); __setscheduler_prio(p, prio);
@ -1410,7 +1410,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
if (queued) if (queued)
enqueue_task(rq, p, queue_flag); enqueue_task(rq, p, queue_flag);
@@ -7429,6 +7506,7 @@ void sched_show_task(struct task_struct *p) @@ -7436,6 +7513,7 @@ void sched_show_task(struct task_struct *p)
print_worker_info(KERN_INFO, p); print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p); print_stop_info(KERN_INFO, p);
@ -1418,7 +1418,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
show_stack(p, NULL, KERN_INFO); show_stack(p, NULL, KERN_INFO);
put_task_stack(p); put_task_stack(p);
} }
@@ -7957,6 +8035,8 @@ int sched_cpu_activate(unsigned int cpu) @@ -7964,6 +8042,8 @@ int sched_cpu_activate(unsigned int cpu)
cpuset_cpu_active(); cpuset_cpu_active();
} }
@ -1427,7 +1427,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
/* /*
* Put the rq online, if not already. This happens: * Put the rq online, if not already. This happens:
* *
@@ -8006,6 +8086,8 @@ int sched_cpu_deactivate(unsigned int cpu) @@ -8013,6 +8093,8 @@ int sched_cpu_deactivate(unsigned int cpu)
sched_set_rq_offline(rq, cpu); sched_set_rq_offline(rq, cpu);
@ -1436,7 +1436,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
/* /*
* When going down, decrement the number of cores with SMT present. * When going down, decrement the number of cores with SMT present.
*/ */
@@ -8190,11 +8272,15 @@ void __init sched_init(void) @@ -8197,11 +8279,15 @@ void __init sched_init(void)
int i; int i;
/* Make sure the linker didn't screw up */ /* Make sure the linker didn't screw up */
@ -1456,7 +1456,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
#endif #endif
wait_bit_init(); wait_bit_init();
@@ -8218,6 +8304,9 @@ void __init sched_init(void) @@ -8225,6 +8311,9 @@ void __init sched_init(void)
root_task_group.shares = ROOT_TASK_GROUP_LOAD; root_task_group.shares = ROOT_TASK_GROUP_LOAD;
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
@ -1466,7 +1466,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
#ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr; root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **); ptr += nr_cpu_ids * sizeof(void **);
@@ -8363,6 +8452,7 @@ void __init sched_init(void) @@ -8370,6 +8459,7 @@ void __init sched_init(void)
balance_push_set(smp_processor_id(), false); balance_push_set(smp_processor_id(), false);
#endif #endif
init_sched_fair_class(); init_sched_fair_class();
@ -1474,7 +1474,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
psi_init(); psi_init();
@@ -8648,6 +8738,7 @@ struct task_group *sched_create_group(struct task_group *parent) @@ -8655,6 +8745,7 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent)) if (!alloc_rt_sched_group(tg, parent))
goto err; goto err;
@ -1482,7 +1482,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
alloc_uclamp_sched_group(tg, parent); alloc_uclamp_sched_group(tg, parent);
return tg; return tg;
@@ -8775,6 +8866,7 @@ void sched_move_task(struct task_struct *tsk) @@ -8782,6 +8873,7 @@ void sched_move_task(struct task_struct *tsk)
put_prev_task(rq, tsk); put_prev_task(rq, tsk);
sched_change_group(tsk, group); sched_change_group(tsk, group);
@ -1490,7 +1490,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
if (queued) if (queued)
enqueue_task(rq, tsk, queue_flags); enqueue_task(rq, tsk, queue_flags);
@@ -8789,11 +8881,6 @@ void sched_move_task(struct task_struct *tsk) @@ -8796,11 +8888,6 @@ void sched_move_task(struct task_struct *tsk)
} }
} }
@ -1502,7 +1502,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
static struct cgroup_subsys_state * static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{ {
@@ -8817,6 +8904,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) @@ -8824,6 +8911,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{ {
struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css);
struct task_group *parent = css_tg(css->parent); struct task_group *parent = css_tg(css->parent);
@ -1514,7 +1514,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
if (parent) if (parent)
sched_online_group(tg, parent); sched_online_group(tg, parent);
@@ -8831,6 +8923,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) @@ -8838,6 +8930,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
return 0; return 0;
} }
@ -1528,7 +1528,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{ {
struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css);
@@ -8848,9 +8947,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) @@ -8855,9 +8954,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_unregister_group(tg); sched_unregister_group(tg);
} }
@ -1539,7 +1539,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
struct task_struct *task; struct task_struct *task;
struct cgroup_subsys_state *css; struct cgroup_subsys_state *css;
@@ -8858,9 +8957,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) @@ -8865,9 +8964,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (!sched_rt_can_attach(css_tg(css), task)) if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL; return -EINVAL;
} }
@ -1551,7 +1551,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
static void cpu_cgroup_attach(struct cgroup_taskset *tset) static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{ {
@@ -8869,6 +8968,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) @@ -8876,6 +8975,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, css, tset) cgroup_taskset_for_each(task, css, tset)
sched_move_task(task); sched_move_task(task);
@ -1565,7 +1565,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
} }
#ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP
@@ -9045,22 +9151,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) @@ -9052,22 +9158,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
} }
#endif /* CONFIG_UCLAMP_TASK_GROUP */ #endif /* CONFIG_UCLAMP_TASK_GROUP */
@ -1606,7 +1606,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
#ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex); static DEFINE_MUTEX(cfs_constraints_mutex);
@@ -9406,7 +9526,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) @@ -9413,7 +9533,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
return 0; return 0;
} }
#endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_CFS_BANDWIDTH */
@ -1614,7 +1614,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
#ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -9434,7 +9553,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, @@ -9441,7 +9560,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
} }
#endif /* CONFIG_RT_GROUP_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */
@ -1623,7 +1623,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft) struct cftype *cft)
{ {
@@ -9444,12 +9563,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, @@ -9451,12 +9570,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
static int cpu_idle_write_s64(struct cgroup_subsys_state *css, static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
struct cftype *cft, s64 idle) struct cftype *cft, s64 idle)
{ {
@ -1643,7 +1643,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
{ {
.name = "shares", .name = "shares",
.read_u64 = cpu_shares_read_u64, .read_u64 = cpu_shares_read_u64,
@@ -9559,38 +9683,35 @@ static int cpu_local_stat_show(struct seq_file *sf, @@ -9566,38 +9690,35 @@ static int cpu_local_stat_show(struct seq_file *sf,
return 0; return 0;
} }
@ -1696,7 +1696,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
int last_delta = INT_MAX; int last_delta = INT_MAX;
int prio, delta; int prio, delta;
@@ -9609,7 +9730,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, @@ -9616,7 +9737,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
struct cftype *cft, s64 nice) struct cftype *cft, s64 nice)
{ {
unsigned long weight; unsigned long weight;
@ -1705,7 +1705,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
if (nice < MIN_NICE || nice > MAX_NICE) if (nice < MIN_NICE || nice > MAX_NICE)
return -ERANGE; return -ERANGE;
@@ -9618,9 +9739,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, @@ -9625,9 +9746,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
idx = array_index_nospec(idx, 40); idx = array_index_nospec(idx, 40);
weight = sched_prio_to_weight[idx]; weight = sched_prio_to_weight[idx];
@ -1721,7 +1721,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
long period, long quota) long period, long quota)
@@ -9680,7 +9805,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, @@ -9687,7 +9812,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
#endif #endif
static struct cftype cpu_files[] = { static struct cftype cpu_files[] = {
@ -1730,7 +1730,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
{ {
.name = "weight", .name = "weight",
.flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT,
@@ -9734,14 +9859,14 @@ static struct cftype cpu_files[] = { @@ -9741,14 +9866,14 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = { struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc, .css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online, .css_online = cpu_cgroup_css_online,
@ -1747,7 +1747,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
.legacy_cftypes = cpu_legacy_files, .legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files, .dfl_cftypes = cpu_files,
.early_init = true, .early_init = true,
@@ -10331,3 +10456,38 @@ void sched_mm_cid_fork(struct task_struct *t) @@ -10338,3 +10463,38 @@ void sched_mm_cid_fork(struct task_struct *t)
t->mm_cid_active = 1; t->mm_cid_active = 1;
} }
#endif #endif
@ -1891,10 +1891,10 @@ index c1eb9a1afd13..c057ef46c5f8 100644
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
new file mode 100644 new file mode 100644
index 000000000000..25fadfaace33 index 000000000000..5fae2292ec29
--- /dev/null --- /dev/null
+++ b/kernel/sched/ext.c +++ b/kernel/sched/ext.c
@@ -0,0 +1,7262 @@ @@ -0,0 +1,7281 @@
+/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 */
+/* +/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
@ -1915,6 +1915,12 @@ index 000000000000..25fadfaace33
+ SCX_EXIT_DUMP_DFL_LEN = 32768, + SCX_EXIT_DUMP_DFL_LEN = 32768,
+ +
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+
+ /*
+ * Iterating all tasks may take a while. Periodically drop
+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+ */
+ SCX_OPS_TASK_ITER_BATCH = 32,
+}; +};
+ +
+enum scx_exit_kind { +enum scx_exit_kind {
@ -3168,86 +3174,105 @@ index 000000000000..25fadfaace33
+ struct task_struct *locked; + struct task_struct *locked;
+ struct rq *rq; + struct rq *rq;
+ struct rq_flags rf; + struct rq_flags rf;
+ u32 cnt;
+}; +};
+ +
+/** +/**
+ * scx_task_iter_init - Initialize a task iterator + * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
+ * @iter: iterator to init + * @iter: iterator to init
+ * + *
+ * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, + * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
+ * @iter must eventually be exited with scx_task_iter_exit(). + * must eventually be stopped with scx_task_iter_stop().
+ * + *
+ * scx_tasks_lock may be released between this and the first next() call or + * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
+ * between any two next() calls. If scx_tasks_lock is released between two + * between this and the first next() call or between any two next() calls. If
+ * next() calls, the caller is responsible for ensuring that the task being + * the locks are released between two next() calls, the caller is responsible
+ * iterated remains accessible either through RCU read lock or obtaining a + * for ensuring that the task being iterated remains accessible either through
+ * reference count. + * RCU read lock or obtaining a reference count.
+ * + *
+ * All tasks which existed when the iteration started are guaranteed to be + * All tasks which existed when the iteration started are guaranteed to be
+ * visited as long as they still exist. + * visited as long as they still exist.
+ */ + */
+static void scx_task_iter_init(struct scx_task_iter *iter) +static void scx_task_iter_start(struct scx_task_iter *iter)
+{ +{
+ lockdep_assert_held(&scx_tasks_lock);
+
+ BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & + BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
+ ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
+ +
+ spin_lock_irq(&scx_tasks_lock);
+
+ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
+ list_add(&iter->cursor.tasks_node, &scx_tasks); + list_add(&iter->cursor.tasks_node, &scx_tasks);
+ iter->locked = NULL; + iter->locked = NULL;
+ iter->cnt = 0;
+} +}
+ +
+/** +static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
+ * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator
+ * @iter: iterator to unlock rq for
+ *
+ * If @iter is in the middle of a locked iteration, it may be locking the rq of
+ * the task currently being visited. Unlock the rq if so. This function can be
+ * safely called anytime during an iteration.
+ *
+ * Returns %true if the rq @iter was locking is unlocked. %false if @iter was
+ * not locking an rq.
+ */
+static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
+{ +{
+ if (iter->locked) { + if (iter->locked) {
+ task_rq_unlock(iter->rq, iter->locked, &iter->rf); + task_rq_unlock(iter->rq, iter->locked, &iter->rf);
+ iter->locked = NULL; + iter->locked = NULL;
+ return true;
+ } else {
+ return false;
+ } + }
+} +}
+ +
+/** +/**
+ * scx_task_iter_exit - Exit a task iterator + * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
+ * @iter: iterator to unlock
+ *
+ * If @iter is in the middle of a locked iteration, it may be locking the rq of
+ * the task currently being visited in addition to scx_tasks_lock. Unlock both.
+ * This function can be safely called anytime during an iteration.
+ */
+static void scx_task_iter_unlock(struct scx_task_iter *iter)
+{
+ __scx_task_iter_rq_unlock(iter);
+ spin_unlock_irq(&scx_tasks_lock);
+}
+
+/**
+ * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
+ * @iter: iterator to re-lock
+ *
+ * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
+ * doesn't re-lock the rq lock. Must be called before other iterator operations.
+ */
+static void scx_task_iter_relock(struct scx_task_iter *iter)
+{
+ spin_lock_irq(&scx_tasks_lock);
+}
+
+/**
+ * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
+ * @iter: iterator to exit + * @iter: iterator to exit
+ * + *
+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held
+ * If the iterator holds a task's rq lock, that rq lock is released. See + * which is released on return. If the iterator holds a task's rq lock, that rq
+ * scx_task_iter_init() for details. + * lock is also released. See scx_task_iter_start() for details.
+ */ + */
+static void scx_task_iter_exit(struct scx_task_iter *iter) +static void scx_task_iter_stop(struct scx_task_iter *iter)
+{ +{
+ lockdep_assert_held(&scx_tasks_lock);
+
+ scx_task_iter_rq_unlock(iter);
+ list_del_init(&iter->cursor.tasks_node); + list_del_init(&iter->cursor.tasks_node);
+ scx_task_iter_unlock(iter);
+} +}
+ +
+/** +/**
+ * scx_task_iter_next - Next task + * scx_task_iter_next - Next task
+ * @iter: iterator to walk + * @iter: iterator to walk
+ * + *
+ * Visit the next task. See scx_task_iter_init() for details. + * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
+ * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
+ * stalls by holding scx_tasks_lock for too long.
+ */ + */
+static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) +static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
+{ +{
+ struct list_head *cursor = &iter->cursor.tasks_node; + struct list_head *cursor = &iter->cursor.tasks_node;
+ struct sched_ext_entity *pos; + struct sched_ext_entity *pos;
+ +
+ lockdep_assert_held(&scx_tasks_lock); + if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
+ scx_task_iter_unlock(iter);
+ cpu_relax();
+ cond_resched();
+ scx_task_iter_relock(iter);
+ }
+ +
+ list_for_each_entry(pos, cursor, tasks_node) { + list_for_each_entry(pos, cursor, tasks_node) {
+ if (&pos->tasks_node == &scx_tasks) + if (&pos->tasks_node == &scx_tasks)
@ -3268,14 +3293,14 @@ index 000000000000..25fadfaace33
+ * @include_dead: Whether we should include dead tasks in the iteration + * @include_dead: Whether we should include dead tasks in the iteration
+ * + *
+ * Visit the non-idle task with its rq lock held. Allows callers to specify + * Visit the non-idle task with its rq lock held. Allows callers to specify
+ * whether they would like to filter out dead tasks. See scx_task_iter_init() + * whether they would like to filter out dead tasks. See scx_task_iter_start()
+ * for details. + * for details.
+ */ + */
+static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) +static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
+{ +{
+ struct task_struct *p; + struct task_struct *p;
+ +
+ scx_task_iter_rq_unlock(iter); + __scx_task_iter_rq_unlock(iter);
+ +
+ while ((p = scx_task_iter_next(iter))) { + while ((p = scx_task_iter_next(iter))) {
+ /* + /*
@ -4989,11 +5014,6 @@ index 000000000000..25fadfaace33
+ +
+ *found = false; + *found = false;
+ +
+ if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+ scx_ops_error("built-in idle tracking is disabled");
+ return prev_cpu;
+ }
+
+ /* + /*
+ * If WAKE_SYNC, the waker's local DSQ is empty, and the system is + * If WAKE_SYNC, the waker's local DSQ is empty, and the system is
+ * under utilized, wake up @p to the local DSQ of the waker. Checking + * under utilized, wake up @p to the local DSQ of the waker. Checking
@ -5067,7 +5087,7 @@ index 000000000000..25fadfaace33
+ if (unlikely(wake_flags & WF_EXEC)) + if (unlikely(wake_flags & WF_EXEC))
+ return prev_cpu; + return prev_cpu;
+ +
+ if (SCX_HAS_OP(select_cpu)) { + if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
+ s32 cpu; + s32 cpu;
+ struct task_struct **ddsp_taskp; + struct task_struct **ddsp_taskp;
+ +
@ -5132,7 +5152,7 @@ index 000000000000..25fadfaace33
+{ +{
+ int cpu = cpu_of(rq); + int cpu = cpu_of(rq);
+ +
+ if (SCX_HAS_OP(update_idle)) { + if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+ if (!static_branch_unlikely(&scx_builtin_idle_enabled)) + if (!static_branch_unlikely(&scx_builtin_idle_enabled))
+ return; + return;
@ -6201,20 +6221,22 @@ index 000000000000..25fadfaace33
+ * the DISABLING state and then cycling the queued tasks through dequeue/enqueue + * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
+ * to force global FIFO scheduling. + * to force global FIFO scheduling.
+ * + *
+ * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * - ops.select_cpu() is ignored and the default select_cpu() is used.
+ * + *
+ * b. ops.dispatch() is ignored. + * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
+ * + *
+ * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be + * - ops.dispatch() is ignored.
+ * trusted. Whenever a tick triggers, the running task is rotated to the tail
+ * of the queue with core_sched_at touched.
+ * + *
+ * d. pick_next_task() suppresses zero slice warning. + * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
+ * can't be trusted. Whenever a tick triggers, the running task is rotated to
+ * the tail of the queue with core_sched_at touched.
+ * + *
+ * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * - pick_next_task() suppresses zero slice warning.
+ * operations.
+ * + *
+ * f. scx_prio_less() reverts to the default core_sched_at order. + * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
+ * operations.
+ *
+ * - scx_prio_less() reverts to the default core_sched_at order.
+ */ + */
+static void scx_ops_bypass(bool bypass) +static void scx_ops_bypass(bool bypass)
+{ +{
@ -6284,7 +6306,7 @@ index 000000000000..25fadfaace33
+ +
+ rq_unlock_irqrestore(rq, &rf); + rq_unlock_irqrestore(rq, &rf);
+ +
+ /* kick to restore ticks */ + /* resched to restore ticks and idle state */
+ resched_cpu(cpu); + resched_cpu(cpu);
+ } + }
+} +}
@ -6406,15 +6428,13 @@ index 000000000000..25fadfaace33
+ +
+ scx_ops_init_task_enabled = false; + scx_ops_init_task_enabled = false;
+ +
+ spin_lock_irq(&scx_tasks_lock); + scx_task_iter_start(&sti);
+ scx_task_iter_init(&sti);
+ while ((p = scx_task_iter_next_locked(&sti))) { + while ((p = scx_task_iter_next_locked(&sti))) {
+ const struct sched_class *old_class = p->sched_class; + const struct sched_class *old_class = p->sched_class;
+ struct sched_enq_and_set_ctx ctx; + struct sched_enq_and_set_ctx ctx;
+ +
+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ +
+ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
+ __setscheduler_prio(p, p->prio); + __setscheduler_prio(p, p->prio);
+ check_class_changing(task_rq(p), p, old_class); + check_class_changing(task_rq(p), p, old_class);
+ +
@ -6423,8 +6443,7 @@ index 000000000000..25fadfaace33
+ check_class_changed(task_rq(p), p, old_class, p->prio); + check_class_changed(task_rq(p), p, old_class, p->prio);
+ scx_ops_exit_task(p); + scx_ops_exit_task(p);
+ } + }
+ scx_task_iter_exit(&sti); + scx_task_iter_stop(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
+ percpu_up_write(&scx_fork_rwsem); + percpu_up_write(&scx_fork_rwsem);
+ +
+ /* no task is on scx, turn off all the switches and flush in-progress calls */ + /* no task is on scx, turn off all the switches and flush in-progress calls */
@ -7074,8 +7093,7 @@ index 000000000000..25fadfaace33
+ if (ret) + if (ret)
+ goto err_disable_unlock_all; + goto err_disable_unlock_all;
+ +
+ spin_lock_irq(&scx_tasks_lock); + scx_task_iter_start(&sti);
+ scx_task_iter_init(&sti);
+ while ((p = scx_task_iter_next_locked(&sti))) { + while ((p = scx_task_iter_next_locked(&sti))) {
+ /* + /*
+ * @p may already be dead, have lost all its usages counts and + * @p may already be dead, have lost all its usages counts and
@ -7085,15 +7103,13 @@ index 000000000000..25fadfaace33
+ if (!tryget_task_struct(p)) + if (!tryget_task_struct(p))
+ continue; + continue;
+ +
+ scx_task_iter_rq_unlock(&sti); + scx_task_iter_unlock(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
+ +
+ ret = scx_ops_init_task(p, task_group(p), false); + ret = scx_ops_init_task(p, task_group(p), false);
+ if (ret) { + if (ret) {
+ put_task_struct(p); + put_task_struct(p);
+ spin_lock_irq(&scx_tasks_lock); + scx_task_iter_relock(&sti);
+ scx_task_iter_exit(&sti); + scx_task_iter_stop(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
+ pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", + pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
+ ret, p->comm, p->pid); + ret, p->comm, p->pid);
+ goto err_disable_unlock_all; + goto err_disable_unlock_all;
@ -7102,10 +7118,9 @@ index 000000000000..25fadfaace33
+ scx_set_task_state(p, SCX_TASK_READY); + scx_set_task_state(p, SCX_TASK_READY);
+ +
+ put_task_struct(p); + put_task_struct(p);
+ spin_lock_irq(&scx_tasks_lock); + scx_task_iter_relock(&sti);
+ } + }
+ scx_task_iter_exit(&sti); + scx_task_iter_stop(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
+ scx_cgroup_unlock(); + scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem); + percpu_up_write(&scx_fork_rwsem);
+ +
@ -7122,14 +7137,14 @@ index 000000000000..25fadfaace33
+ * scx_tasks_lock. + * scx_tasks_lock.
+ */ + */
+ percpu_down_write(&scx_fork_rwsem); + percpu_down_write(&scx_fork_rwsem);
+ spin_lock_irq(&scx_tasks_lock); + scx_task_iter_start(&sti);
+ scx_task_iter_init(&sti);
+ while ((p = scx_task_iter_next_locked(&sti))) { + while ((p = scx_task_iter_next_locked(&sti))) {
+ const struct sched_class *old_class = p->sched_class; + const struct sched_class *old_class = p->sched_class;
+ struct sched_enq_and_set_ctx ctx; + struct sched_enq_and_set_ctx ctx;
+ +
+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ +
+ p->scx.slice = SCX_SLICE_DFL;
+ __setscheduler_prio(p, p->prio); + __setscheduler_prio(p, p->prio);
+ check_class_changing(task_rq(p), p, old_class); + check_class_changing(task_rq(p), p, old_class);
+ +
@ -7137,8 +7152,7 @@ index 000000000000..25fadfaace33
+ +
+ check_class_changed(task_rq(p), p, old_class, p->prio); + check_class_changed(task_rq(p), p, old_class, p->prio);
+ } + }
+ scx_task_iter_exit(&sti); + scx_task_iter_stop(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
+ percpu_up_write(&scx_fork_rwsem); + percpu_up_write(&scx_fork_rwsem);
+ +
+ scx_ops_bypass(false); + scx_ops_bypass(false);
@ -7808,16 +7822,21 @@ index 000000000000..25fadfaace33
+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
+ u64 wake_flags, bool *is_idle) + u64 wake_flags, bool *is_idle)
+{ +{
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { + if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+ *is_idle = false; + scx_ops_error("built-in idle tracking is disabled");
+ return prev_cpu; + goto prev_cpu;
+ } + }
+
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
+ goto prev_cpu;
+
+#ifdef CONFIG_SMP +#ifdef CONFIG_SMP
+ return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); + return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
+#else +#endif
+
+prev_cpu:
+ *is_idle = false; + *is_idle = false;
+ return prev_cpu; + return prev_cpu;
+#endif
+} +}
+ +
+__bpf_kfunc_end_defs(); +__bpf_kfunc_end_defs();