From a280828878d8c3c7b7bd125208cde207019aa6b2 Mon Sep 17 00:00:00 2001
From: ferreo <ferreo@noreply.pika>
Date: Thu, 10 Oct 2024 18:22:29 +0200
Subject: [PATCH] Update patches/0002-sched-ext.patch

---
 patches/0002-sched-ext.patch | 275 +++++++++++++++++++----------------
 1 file changed, 147 insertions(+), 128 deletions(-)

diff --git a/patches/0002-sched-ext.patch b/patches/0002-sched-ext.patch
index a44827e..3d1b009 100644
--- a/patches/0002-sched-ext.patch
+++ b/patches/0002-sched-ext.patch
@@ -1,6 +1,6 @@
-From 11276ed2c72c57624c1214e980efd24648be015c Mon Sep 17 00:00:00 2001
+From c0d9f38dcc2b6bb16e54e7f438c9c449319ebef4 Mon Sep 17 00:00:00 2001
 From: Peter Jung <admin@ptr1337.dev>
-Date: Fri, 4 Oct 2024 17:12:13 +0200
+Date: Thu, 10 Oct 2024 12:47:12 +0200
 Subject: [PATCH] sched-ext
 
 Signed-off-by: Peter Jung <admin@ptr1337.dev>
@@ -24,7 +24,7 @@ Signed-off-by: Peter Jung <admin@ptr1337.dev>
  kernel/sched/core.c                           |  288 +-
  kernel/sched/cpufreq_schedutil.c              |   50 +-
  kernel/sched/debug.c                          |    3 +
- kernel/sched/ext.c                            | 7262 +++++++++++++++++
+ kernel/sched/ext.c                            | 7281 +++++++++++++++++
  kernel/sched/ext.h                            |   91 +
  kernel/sched/fair.c                           |   21 +-
  kernel/sched/idle.c                           |    2 +
@@ -102,7 +102,7 @@ Signed-off-by: Peter Jung <admin@ptr1337.dev>
  .../selftests/sched_ext/test_example.c        |   49 +
  tools/testing/selftests/sched_ext/util.c      |   71 +
  tools/testing/selftests/sched_ext/util.h      |   13 +
- 97 files changed, 16174 insertions(+), 130 deletions(-)
+ 97 files changed, 16193 insertions(+), 130 deletions(-)
  create mode 100644 Documentation/scheduler/sched-ext.rst
  create mode 100644 include/linux/sched/ext.h
  create mode 100644 include/trace/events/sched_ext.h
@@ -524,10 +524,10 @@ index 000000000000..6c0d70e2e27d
 +possible, they are subject to change without warning between kernel
 +versions.
 diff --git a/MAINTAINERS b/MAINTAINERS
-index c2a7363e86fe..bcfe36daf67a 100644
+index 16df466c205d..3345a15afded 100644
 --- a/MAINTAINERS
 +++ b/MAINTAINERS
-@@ -20364,6 +20364,19 @@ F:	include/linux/wait.h
+@@ -20353,6 +20353,19 @@ F:	include/linux/wait.h
  F:	include/uapi/linux/sched.h
  F:	kernel/sched/
  
@@ -594,7 +594,7 @@ index c60ba0ab1462..7139b33cb104 100644
  	CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
  	CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
 diff --git a/include/linux/sched.h b/include/linux/sched.h
-index f8d150343d42..5b4f78fe379d 100644
+index 1c771ea4481d..c5a7901b2580 100644
 --- a/include/linux/sched.h
 +++ b/include/linux/sched.h
 @@ -82,6 +82,8 @@ struct task_group;
@@ -606,7 +606,7 @@ index f8d150343d42..5b4f78fe379d 100644
  /*
   * Task state bitmask. NOTE! These bits are also
   * encoded in fs/proc/array.c: get_task_state().
-@@ -810,6 +812,9 @@ struct task_struct {
+@@ -812,6 +814,9 @@ struct task_struct {
  	struct sched_rt_entity		rt;
  	struct sched_dl_entity		dl;
  	struct sched_dl_entity		*dl_server;
@@ -1011,7 +1011,7 @@ index c2f1fd95a821..fe782cd77388 100644
 +	    Documentation/scheduler/sched-ext.rst
 +	    https://github.com/sched-ext/scx
 diff --git a/kernel/fork.c b/kernel/fork.c
-index 238695afc630..69a0a7210060 100644
+index 003de4829c15..eb290420d926 100644
 --- a/kernel/fork.c
 +++ b/kernel/fork.c
 @@ -23,6 +23,7 @@
@@ -1030,7 +1030,7 @@ index 238695afc630..69a0a7210060 100644
  	io_uring_free(tsk);
  	cgroup_free(tsk);
  	task_numa_free(tsk, true);
-@@ -2355,7 +2357,7 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2352,7 +2354,7 @@ __latent_entropy struct task_struct *copy_process(
  
  	retval = perf_event_init_task(p, clone_flags);
  	if (retval)
@@ -1039,7 +1039,7 @@ index 238695afc630..69a0a7210060 100644
  	retval = audit_alloc(p);
  	if (retval)
  		goto bad_fork_cleanup_perf;
-@@ -2488,7 +2490,9 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2485,7 +2487,9 @@ __latent_entropy struct task_struct *copy_process(
  	 * cgroup specific, it unconditionally needs to place the task on a
  	 * runqueue.
  	 */
@@ -1050,7 +1050,7 @@ index 238695afc630..69a0a7210060 100644
  
  	/*
  	 * From this point on we must avoid any synchronous user-space
-@@ -2534,13 +2538,13 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2531,13 +2535,13 @@ __latent_entropy struct task_struct *copy_process(
  	/* Don't start children in a dying pid namespace */
  	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
  		retval = -ENOMEM;
@@ -1066,7 +1066,7 @@ index 238695afc630..69a0a7210060 100644
  	}
  
  	/* No more failure paths after this point. */
-@@ -2614,10 +2618,11 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2611,10 +2615,11 @@ __latent_entropy struct task_struct *copy_process(
  
  	return p;
  
@@ -1079,7 +1079,7 @@ index 238695afc630..69a0a7210060 100644
  	cgroup_cancel_fork(p, args);
  bad_fork_put_pidfd:
  	if (clone_flags & CLONE_PIDFD) {
-@@ -2656,6 +2661,8 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2653,6 +2658,8 @@ __latent_entropy struct task_struct *copy_process(
  	audit_free(p);
  bad_fork_cleanup_perf:
  	perf_event_free_task(p);
@@ -1128,7 +1128,7 @@ index 39c315182b35..fae1f5c921eb 100644
 +
  #include "syscalls.c"
 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index f3951e4a55e5..c792a6feb7a9 100644
+index 1af59cf714cd..8ae04bd4a5a4 100644
 --- a/kernel/sched/core.c
 +++ b/kernel/sched/core.c
 @@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p)
@@ -1353,8 +1353,8 @@ index f3951e4a55e5..c792a6feb7a9 100644
 -#endif
  
  	put_prev_task(rq, prev);
- }
-@@ -5800,6 +5864,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ 
+@@ -5808,6 +5872,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  	const struct sched_class *class;
  	struct task_struct *p;
  
@@ -1364,9 +1364,9 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	/*
  	 * Optimization: we know that if all tasks are in the fair class we can
  	 * call that function directly, but only if the @prev task wasn't of a
-@@ -5840,10 +5907,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
- 	if (prev->dl_server)
- 		prev->dl_server = NULL;
+@@ -5847,10 +5914,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ restart:
+ 	put_prev_task_balance(rq, prev, rf);
  
 -	for_each_class(class) {
 +	for_each_active_class(class) {
@@ -1382,7 +1382,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	}
  
  	BUG(); /* The idle class should always have a runnable task. */
-@@ -5873,7 +5945,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
+@@ -5880,7 +5952,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
  	const struct sched_class *class;
  	struct task_struct *p;
  
@@ -1391,7 +1391,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  		p = class->pick_task(rq);
  		if (p)
  			return p;
-@@ -6870,6 +6942,10 @@ void __setscheduler_prio(struct task_struct *p, int prio)
+@@ -6877,6 +6949,10 @@ void __setscheduler_prio(struct task_struct *p, int prio)
  		p->sched_class = &dl_sched_class;
  	else if (rt_prio(prio))
  		p->sched_class = &rt_sched_class;
@@ -1402,7 +1402,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	else
  		p->sched_class = &fair_sched_class;
  
-@@ -7015,6 +7091,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
+@@ -7022,6 +7098,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
  	}
  
  	__setscheduler_prio(p, prio);
@@ -1410,7 +1410,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  
  	if (queued)
  		enqueue_task(rq, p, queue_flag);
-@@ -7429,6 +7506,7 @@ void sched_show_task(struct task_struct *p)
+@@ -7436,6 +7513,7 @@ void sched_show_task(struct task_struct *p)
  
  	print_worker_info(KERN_INFO, p);
  	print_stop_info(KERN_INFO, p);
@@ -1418,7 +1418,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	show_stack(p, NULL, KERN_INFO);
  	put_task_stack(p);
  }
-@@ -7957,6 +8035,8 @@ int sched_cpu_activate(unsigned int cpu)
+@@ -7964,6 +8042,8 @@ int sched_cpu_activate(unsigned int cpu)
  		cpuset_cpu_active();
  	}
  
@@ -1427,7 +1427,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	/*
  	 * Put the rq online, if not already. This happens:
  	 *
-@@ -8006,6 +8086,8 @@ int sched_cpu_deactivate(unsigned int cpu)
+@@ -8013,6 +8093,8 @@ int sched_cpu_deactivate(unsigned int cpu)
  
  	sched_set_rq_offline(rq, cpu);
  
@@ -1436,7 +1436,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	/*
  	 * When going down, decrement the number of cores with SMT present.
  	 */
-@@ -8190,11 +8272,15 @@ void __init sched_init(void)
+@@ -8197,11 +8279,15 @@ void __init sched_init(void)
  	int i;
  
  	/* Make sure the linker didn't screw up */
@@ -1456,7 +1456,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  #endif
  
  	wait_bit_init();
-@@ -8218,6 +8304,9 @@ void __init sched_init(void)
+@@ -8225,6 +8311,9 @@ void __init sched_init(void)
  		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
  		init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -1466,7 +1466,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  #ifdef CONFIG_RT_GROUP_SCHED
  		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
  		ptr += nr_cpu_ids * sizeof(void **);
-@@ -8363,6 +8452,7 @@ void __init sched_init(void)
+@@ -8370,6 +8459,7 @@ void __init sched_init(void)
  	balance_push_set(smp_processor_id(), false);
  #endif
  	init_sched_fair_class();
@@ -1474,7 +1474,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  
  	psi_init();
  
-@@ -8648,6 +8738,7 @@ struct task_group *sched_create_group(struct task_group *parent)
+@@ -8655,6 +8745,7 @@ struct task_group *sched_create_group(struct task_group *parent)
  	if (!alloc_rt_sched_group(tg, parent))
  		goto err;
  
@@ -1482,7 +1482,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	alloc_uclamp_sched_group(tg, parent);
  
  	return tg;
-@@ -8775,6 +8866,7 @@ void sched_move_task(struct task_struct *tsk)
+@@ -8782,6 +8873,7 @@ void sched_move_task(struct task_struct *tsk)
  		put_prev_task(rq, tsk);
  
  	sched_change_group(tsk, group);
@@ -1490,7 +1490,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  
  	if (queued)
  		enqueue_task(rq, tsk, queue_flags);
-@@ -8789,11 +8881,6 @@ void sched_move_task(struct task_struct *tsk)
+@@ -8796,11 +8888,6 @@ void sched_move_task(struct task_struct *tsk)
  	}
  }
  
@@ -1502,7 +1502,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  static struct cgroup_subsys_state *
  cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
  {
-@@ -8817,6 +8904,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+@@ -8824,6 +8911,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
  {
  	struct task_group *tg = css_tg(css);
  	struct task_group *parent = css_tg(css->parent);
@@ -1514,7 +1514,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  
  	if (parent)
  		sched_online_group(tg, parent);
-@@ -8831,6 +8923,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+@@ -8838,6 +8930,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
  	return 0;
  }
  
@@ -1528,7 +1528,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
  {
  	struct task_group *tg = css_tg(css);
-@@ -8848,9 +8947,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
+@@ -8855,9 +8954,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
  	sched_unregister_group(tg);
  }
  
@@ -1539,7 +1539,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	struct task_struct *task;
  	struct cgroup_subsys_state *css;
  
-@@ -8858,9 +8957,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
+@@ -8865,9 +8964,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
  		if (!sched_rt_can_attach(css_tg(css), task))
  			return -EINVAL;
  	}
@@ -1551,7 +1551,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  
  static void cpu_cgroup_attach(struct cgroup_taskset *tset)
  {
-@@ -8869,6 +8968,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
+@@ -8876,6 +8975,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
  
  	cgroup_taskset_for_each(task, css, tset)
  		sched_move_task(task);
@@ -1565,7 +1565,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  }
  
  #ifdef CONFIG_UCLAMP_TASK_GROUP
-@@ -9045,22 +9151,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+@@ -9052,22 +9158,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
  }
  #endif /* CONFIG_UCLAMP_TASK_GROUP */
  
@@ -1606,7 +1606,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  
  #ifdef CONFIG_CFS_BANDWIDTH
  static DEFINE_MUTEX(cfs_constraints_mutex);
-@@ -9406,7 +9526,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
+@@ -9413,7 +9533,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
  	return 0;
  }
  #endif /* CONFIG_CFS_BANDWIDTH */
@@ -1614,7 +1614,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
-@@ -9434,7 +9553,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+@@ -9441,7 +9560,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
  }
  #endif /* CONFIG_RT_GROUP_SCHED */
  
@@ -1623,7 +1623,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
  			       struct cftype *cft)
  {
-@@ -9444,12 +9563,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+@@ -9451,12 +9570,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
  static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
  				struct cftype *cft, s64 idle)
  {
@@ -1643,7 +1643,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	{
  		.name = "shares",
  		.read_u64 = cpu_shares_read_u64,
-@@ -9559,38 +9683,35 @@ static int cpu_local_stat_show(struct seq_file *sf,
+@@ -9566,38 +9690,35 @@ static int cpu_local_stat_show(struct seq_file *sf,
  	return 0;
  }
  
@@ -1696,7 +1696,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	int last_delta = INT_MAX;
  	int prio, delta;
  
-@@ -9609,7 +9730,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+@@ -9616,7 +9737,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
  				     struct cftype *cft, s64 nice)
  {
  	unsigned long weight;
@@ -1705,7 +1705,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  
  	if (nice < MIN_NICE || nice > MAX_NICE)
  		return -ERANGE;
-@@ -9618,9 +9739,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+@@ -9625,9 +9746,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
  	idx = array_index_nospec(idx, 40);
  	weight = sched_prio_to_weight[idx];
  
@@ -1721,7 +1721,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  
  static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
  						  long period, long quota)
-@@ -9680,7 +9805,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
+@@ -9687,7 +9812,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
  #endif
  
  static struct cftype cpu_files[] = {
@@ -1730,7 +1730,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	{
  		.name = "weight",
  		.flags = CFTYPE_NOT_ON_ROOT,
-@@ -9734,14 +9859,14 @@ static struct cftype cpu_files[] = {
+@@ -9741,14 +9866,14 @@ static struct cftype cpu_files[] = {
  struct cgroup_subsys cpu_cgrp_subsys = {
  	.css_alloc	= cpu_cgroup_css_alloc,
  	.css_online	= cpu_cgroup_css_online,
@@ -1747,7 +1747,7 @@ index f3951e4a55e5..c792a6feb7a9 100644
  	.legacy_cftypes	= cpu_legacy_files,
  	.dfl_cftypes	= cpu_files,
  	.early_init	= true,
-@@ -10331,3 +10456,38 @@ void sched_mm_cid_fork(struct task_struct *t)
+@@ -10338,3 +10463,38 @@ void sched_mm_cid_fork(struct task_struct *t)
  	t->mm_cid_active = 1;
  }
  #endif
@@ -1891,10 +1891,10 @@ index c1eb9a1afd13..c057ef46c5f8 100644
  
 diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
 new file mode 100644
-index 000000000000..25fadfaace33
+index 000000000000..5fae2292ec29
 --- /dev/null
 +++ b/kernel/sched/ext.c
-@@ -0,0 +1,7262 @@
+@@ -0,0 +1,7281 @@
 +/* SPDX-License-Identifier: GPL-2.0 */
 +/*
 + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
@@ -1915,6 +1915,12 @@ index 000000000000..25fadfaace33
 +	SCX_EXIT_DUMP_DFL_LEN		= 32768,
 +
 +	SCX_CPUPERF_ONE			= SCHED_CAPACITY_SCALE,
++
++	/*
++	 * Iterating all tasks may take a while. Periodically drop
++	 * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
++	 */
++	SCX_OPS_TASK_ITER_BATCH		= 32,
 +};
 +
 +enum scx_exit_kind {
@@ -3168,86 +3174,105 @@ index 000000000000..25fadfaace33
 +	struct task_struct		*locked;
 +	struct rq			*rq;
 +	struct rq_flags			rf;
++	u32				cnt;
 +};
 +
 +/**
-+ * scx_task_iter_init - Initialize a task iterator
++ * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
 + * @iter: iterator to init
 + *
-+ * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized,
-+ * @iter must eventually be exited with scx_task_iter_exit().
++ * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
++ * must eventually be stopped with scx_task_iter_stop().
 + *
-+ * scx_tasks_lock may be released between this and the first next() call or
-+ * between any two next() calls. If scx_tasks_lock is released between two
-+ * next() calls, the caller is responsible for ensuring that the task being
-+ * iterated remains accessible either through RCU read lock or obtaining a
-+ * reference count.
++ * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
++ * between this and the first next() call or between any two next() calls. If
++ * the locks are released between two next() calls, the caller is responsible
++ * for ensuring that the task being iterated remains accessible either through
++ * RCU read lock or obtaining a reference count.
 + *
 + * All tasks which existed when the iteration started are guaranteed to be
 + * visited as long as they still exist.
 + */
-+static void scx_task_iter_init(struct scx_task_iter *iter)
++static void scx_task_iter_start(struct scx_task_iter *iter)
 +{
-+	lockdep_assert_held(&scx_tasks_lock);
-+
 +	BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
 +		     ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
 +
++	spin_lock_irq(&scx_tasks_lock);
++
 +	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
 +	list_add(&iter->cursor.tasks_node, &scx_tasks);
 +	iter->locked = NULL;
++	iter->cnt = 0;
 +}
 +
-+/**
-+ * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator
-+ * @iter: iterator to unlock rq for
-+ *
-+ * If @iter is in the middle of a locked iteration, it may be locking the rq of
-+ * the task currently being visited. Unlock the rq if so. This function can be
-+ * safely called anytime during an iteration.
-+ *
-+ * Returns %true if the rq @iter was locking is unlocked. %false if @iter was
-+ * not locking an rq.
-+ */
-+static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
++static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
 +{
 +	if (iter->locked) {
 +		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
 +		iter->locked = NULL;
-+		return true;
-+	} else {
-+		return false;
 +	}
 +}
 +
 +/**
-+ * scx_task_iter_exit - Exit a task iterator
++ * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
++ * @iter: iterator to unlock
++ *
++ * If @iter is in the middle of a locked iteration, it may be locking the rq of
++ * the task currently being visited in addition to scx_tasks_lock. Unlock both.
++ * This function can be safely called anytime during an iteration.
++ */
++static void scx_task_iter_unlock(struct scx_task_iter *iter)
++{
++	__scx_task_iter_rq_unlock(iter);
++	spin_unlock_irq(&scx_tasks_lock);
++}
++
++/**
++ * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
++ * @iter: iterator to re-lock
++ *
++ * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
++ * doesn't re-lock the rq lock. Must be called before other iterator operations.
++ */
++static void scx_task_iter_relock(struct scx_task_iter *iter)
++{
++	spin_lock_irq(&scx_tasks_lock);
++}
++
++/**
++ * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
 + * @iter: iterator to exit
 + *
-+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held.
-+ * If the iterator holds a task's rq lock, that rq lock is released. See
-+ * scx_task_iter_init() for details.
++ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held
++ * which is released on return. If the iterator holds a task's rq lock, that rq
++ * lock is also released. See scx_task_iter_start() for details.
 + */
-+static void scx_task_iter_exit(struct scx_task_iter *iter)
++static void scx_task_iter_stop(struct scx_task_iter *iter)
 +{
-+	lockdep_assert_held(&scx_tasks_lock);
-+
-+	scx_task_iter_rq_unlock(iter);
 +	list_del_init(&iter->cursor.tasks_node);
++	scx_task_iter_unlock(iter);
 +}
 +
 +/**
 + * scx_task_iter_next - Next task
 + * @iter: iterator to walk
 + *
-+ * Visit the next task. See scx_task_iter_init() for details.
++ * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
++ * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
++ * stalls by holding scx_tasks_lock for too long.
 + */
 +static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
 +{
 +	struct list_head *cursor = &iter->cursor.tasks_node;
 +	struct sched_ext_entity *pos;
 +
-+	lockdep_assert_held(&scx_tasks_lock);
++	if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
++		scx_task_iter_unlock(iter);
++		cpu_relax();
++		cond_resched();
++		scx_task_iter_relock(iter);
++	}
 +
 +	list_for_each_entry(pos, cursor, tasks_node) {
 +		if (&pos->tasks_node == &scx_tasks)
@@ -3268,14 +3293,14 @@ index 000000000000..25fadfaace33
 + * @include_dead: Whether we should include dead tasks in the iteration
 + *
 + * Visit the non-idle task with its rq lock held. Allows callers to specify
-+ * whether they would like to filter out dead tasks. See scx_task_iter_init()
++ * whether they would like to filter out dead tasks. See scx_task_iter_start()
 + * for details.
 + */
 +static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
 +{
 +	struct task_struct *p;
 +
-+	scx_task_iter_rq_unlock(iter);
++	__scx_task_iter_rq_unlock(iter);
 +
 +	while ((p = scx_task_iter_next(iter))) {
 +		/*
@@ -4989,11 +5014,6 @@ index 000000000000..25fadfaace33
 +
 +	*found = false;
 +
-+	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
-+		scx_ops_error("built-in idle tracking is disabled");
-+		return prev_cpu;
-+	}
-+
 +	/*
 +	 * If WAKE_SYNC, the waker's local DSQ is empty, and the system is
 +	 * under utilized, wake up @p to the local DSQ of the waker. Checking
@@ -5067,7 +5087,7 @@ index 000000000000..25fadfaace33
 +	if (unlikely(wake_flags & WF_EXEC))
 +		return prev_cpu;
 +
-+	if (SCX_HAS_OP(select_cpu)) {
++	if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
 +		s32 cpu;
 +		struct task_struct **ddsp_taskp;
 +
@@ -5132,7 +5152,7 @@ index 000000000000..25fadfaace33
 +{
 +	int cpu = cpu_of(rq);
 +
-+	if (SCX_HAS_OP(update_idle)) {
++	if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
 +		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
 +		if (!static_branch_unlikely(&scx_builtin_idle_enabled))
 +			return;
@@ -6201,20 +6221,22 @@ index 000000000000..25fadfaace33
 + * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
 + * to force global FIFO scheduling.
 + *
-+ * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
++ * - ops.select_cpu() is ignored and the default select_cpu() is used.
 + *
-+ * b. ops.dispatch() is ignored.
++ * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
 + *
-+ * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be
-+ *    trusted. Whenever a tick triggers, the running task is rotated to the tail
-+ *    of the queue with core_sched_at touched.
++ * - ops.dispatch() is ignored.
 + *
-+ * d. pick_next_task() suppresses zero slice warning.
++ * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
++ *   can't be trusted. Whenever a tick triggers, the running task is rotated to
++ *   the tail of the queue with core_sched_at touched.
 + *
-+ * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
-+ *    operations.
++ * - pick_next_task() suppresses zero slice warning.
 + *
-+ * f. scx_prio_less() reverts to the default core_sched_at order.
++ * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
++ *   operations.
++ *
++ * - scx_prio_less() reverts to the default core_sched_at order.
 + */
 +static void scx_ops_bypass(bool bypass)
 +{
@@ -6284,7 +6306,7 @@ index 000000000000..25fadfaace33
 +
 +		rq_unlock_irqrestore(rq, &rf);
 +
-+		/* kick to restore ticks */
++		/* resched to restore ticks and idle state */
 +		resched_cpu(cpu);
 +	}
 +}
@@ -6406,15 +6428,13 @@ index 000000000000..25fadfaace33
 +
 +	scx_ops_init_task_enabled = false;
 +
-+	spin_lock_irq(&scx_tasks_lock);
-+	scx_task_iter_init(&sti);
++	scx_task_iter_start(&sti);
 +	while ((p = scx_task_iter_next_locked(&sti))) {
 +		const struct sched_class *old_class = p->sched_class;
 +		struct sched_enq_and_set_ctx ctx;
 +
 +		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 +
-+		p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
 +		__setscheduler_prio(p, p->prio);
 +		check_class_changing(task_rq(p), p, old_class);
 +
@@ -6423,8 +6443,7 @@ index 000000000000..25fadfaace33
 +		check_class_changed(task_rq(p), p, old_class, p->prio);
 +		scx_ops_exit_task(p);
 +	}
-+	scx_task_iter_exit(&sti);
-+	spin_unlock_irq(&scx_tasks_lock);
++	scx_task_iter_stop(&sti);
 +	percpu_up_write(&scx_fork_rwsem);
 +
 +	/* no task is on scx, turn off all the switches and flush in-progress calls */
@@ -7074,8 +7093,7 @@ index 000000000000..25fadfaace33
 +	if (ret)
 +		goto err_disable_unlock_all;
 +
-+	spin_lock_irq(&scx_tasks_lock);
-+	scx_task_iter_init(&sti);
++	scx_task_iter_start(&sti);
 +	while ((p = scx_task_iter_next_locked(&sti))) {
 +		/*
 +		 * @p may already be dead, have lost all its usages counts and
@@ -7085,15 +7103,13 @@ index 000000000000..25fadfaace33
 +		if (!tryget_task_struct(p))
 +			continue;
 +
-+		scx_task_iter_rq_unlock(&sti);
-+		spin_unlock_irq(&scx_tasks_lock);
++		scx_task_iter_unlock(&sti);
 +
 +		ret = scx_ops_init_task(p, task_group(p), false);
 +		if (ret) {
 +			put_task_struct(p);
-+			spin_lock_irq(&scx_tasks_lock);
-+			scx_task_iter_exit(&sti);
-+			spin_unlock_irq(&scx_tasks_lock);
++			scx_task_iter_relock(&sti);
++			scx_task_iter_stop(&sti);
 +			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
 +			       ret, p->comm, p->pid);
 +			goto err_disable_unlock_all;
@@ -7102,10 +7118,9 @@ index 000000000000..25fadfaace33
 +		scx_set_task_state(p, SCX_TASK_READY);
 +
 +		put_task_struct(p);
-+		spin_lock_irq(&scx_tasks_lock);
++		scx_task_iter_relock(&sti);
 +	}
-+	scx_task_iter_exit(&sti);
-+	spin_unlock_irq(&scx_tasks_lock);
++	scx_task_iter_stop(&sti);
 +	scx_cgroup_unlock();
 +	percpu_up_write(&scx_fork_rwsem);
 +
@@ -7122,14 +7137,14 @@ index 000000000000..25fadfaace33
 +	 * scx_tasks_lock.
 +	 */
 +	percpu_down_write(&scx_fork_rwsem);
-+	spin_lock_irq(&scx_tasks_lock);
-+	scx_task_iter_init(&sti);
++	scx_task_iter_start(&sti);
 +	while ((p = scx_task_iter_next_locked(&sti))) {
 +		const struct sched_class *old_class = p->sched_class;
 +		struct sched_enq_and_set_ctx ctx;
 +
 +		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 +
++		p->scx.slice = SCX_SLICE_DFL;
 +		__setscheduler_prio(p, p->prio);
 +		check_class_changing(task_rq(p), p, old_class);
 +
@@ -7137,8 +7152,7 @@ index 000000000000..25fadfaace33
 +
 +		check_class_changed(task_rq(p), p, old_class, p->prio);
 +	}
-+	scx_task_iter_exit(&sti);
-+	spin_unlock_irq(&scx_tasks_lock);
++	scx_task_iter_stop(&sti);
 +	percpu_up_write(&scx_fork_rwsem);
 +
 +	scx_ops_bypass(false);
@@ -7808,16 +7822,21 @@ index 000000000000..25fadfaace33
 +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 +				       u64 wake_flags, bool *is_idle)
 +{
-+	if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) {
-+		*is_idle = false;
-+		return prev_cpu;
++	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
++		scx_ops_error("built-in idle tracking is disabled");
++		goto prev_cpu;
 +	}
++
++	if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
++		goto prev_cpu;
++
 +#ifdef CONFIG_SMP
 +	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
-+#else
++#endif
++
++prev_cpu:
 +	*is_idle = false;
 +	return prev_cpu;
-+#endif
 +}
 +
 +__bpf_kfunc_end_defs();