diff --git a/patches/0002-sched-ext.patch b/patches/0002-sched-ext.patch deleted file mode 100644 index 3d1b009..0000000 --- a/patches/0002-sched-ext.patch +++ /dev/null @@ -1,17925 +0,0 @@ -From c0d9f38dcc2b6bb16e54e7f438c9c449319ebef4 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Thu, 10 Oct 2024 12:47:12 +0200 -Subject: [PATCH] sched-ext - -Signed-off-by: Peter Jung ---- - Documentation/scheduler/index.rst | 1 + - Documentation/scheduler/sched-ext.rst | 326 + - MAINTAINERS | 13 + - drivers/tty/sysrq.c | 1 + - include/asm-generic/vmlinux.lds.h | 1 + - include/linux/cgroup.h | 4 +- - include/linux/sched.h | 5 + - include/linux/sched/ext.h | 216 + - include/linux/sched/task.h | 8 +- - include/trace/events/sched_ext.h | 32 + - include/uapi/linux/sched.h | 1 + - init/Kconfig | 10 + - init/init_task.c | 12 + - kernel/Kconfig.preempt | 27 +- - kernel/fork.c | 17 +- - kernel/sched/build_policy.c | 11 + - kernel/sched/core.c | 288 +- - kernel/sched/cpufreq_schedutil.c | 50 +- - kernel/sched/debug.c | 3 + - kernel/sched/ext.c | 7281 +++++++++++++++++ - kernel/sched/ext.h | 91 + - kernel/sched/fair.c | 21 +- - kernel/sched/idle.c | 2 + - kernel/sched/sched.h | 203 +- - kernel/sched/syscalls.c | 26 + - lib/dump_stack.c | 1 + - tools/Makefile | 10 +- - tools/sched_ext/.gitignore | 2 + - tools/sched_ext/Makefile | 246 + - tools/sched_ext/README.md | 270 + - .../sched_ext/include/bpf-compat/gnu/stubs.h | 11 + - tools/sched_ext/include/scx/common.bpf.h | 427 + - tools/sched_ext/include/scx/common.h | 75 + - tools/sched_ext/include/scx/compat.bpf.h | 47 + - tools/sched_ext/include/scx/compat.h | 186 + - tools/sched_ext/include/scx/user_exit_info.h | 115 + - tools/sched_ext/scx_central.bpf.c | 361 + - tools/sched_ext/scx_central.c | 135 + - tools/sched_ext/scx_flatcg.bpf.c | 957 +++ - tools/sched_ext/scx_flatcg.c | 233 + - tools/sched_ext/scx_flatcg.h | 51 + - tools/sched_ext/scx_qmap.bpf.c | 813 ++ - tools/sched_ext/scx_qmap.c | 153 + - tools/sched_ext/scx_show_state.py | 40 
+ - tools/sched_ext/scx_simple.bpf.c | 156 + - tools/sched_ext/scx_simple.c | 107 + - tools/testing/selftests/sched_ext/.gitignore | 6 + - tools/testing/selftests/sched_ext/Makefile | 218 + - tools/testing/selftests/sched_ext/config | 9 + - .../selftests/sched_ext/create_dsq.bpf.c | 58 + - .../testing/selftests/sched_ext/create_dsq.c | 57 + - .../sched_ext/ddsp_bogus_dsq_fail.bpf.c | 42 + - .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 57 + - .../sched_ext/ddsp_vtimelocal_fail.bpf.c | 39 + - .../sched_ext/ddsp_vtimelocal_fail.c | 56 + - .../selftests/sched_ext/dsp_local_on.bpf.c | 65 + - .../selftests/sched_ext/dsp_local_on.c | 58 + - .../sched_ext/enq_last_no_enq_fails.bpf.c | 21 + - .../sched_ext/enq_last_no_enq_fails.c | 60 + - .../sched_ext/enq_select_cpu_fails.bpf.c | 43 + - .../sched_ext/enq_select_cpu_fails.c | 61 + - tools/testing/selftests/sched_ext/exit.bpf.c | 84 + - tools/testing/selftests/sched_ext/exit.c | 55 + - tools/testing/selftests/sched_ext/exit_test.h | 20 + - .../testing/selftests/sched_ext/hotplug.bpf.c | 61 + - tools/testing/selftests/sched_ext/hotplug.c | 168 + - .../selftests/sched_ext/hotplug_test.h | 15 + - .../sched_ext/init_enable_count.bpf.c | 53 + - .../selftests/sched_ext/init_enable_count.c | 166 + - .../testing/selftests/sched_ext/maximal.bpf.c | 164 + - tools/testing/selftests/sched_ext/maximal.c | 51 + - .../selftests/sched_ext/maybe_null.bpf.c | 36 + - .../testing/selftests/sched_ext/maybe_null.c | 49 + - .../sched_ext/maybe_null_fail_dsp.bpf.c | 25 + - .../sched_ext/maybe_null_fail_yld.bpf.c | 28 + - .../testing/selftests/sched_ext/minimal.bpf.c | 21 + - tools/testing/selftests/sched_ext/minimal.c | 58 + - .../selftests/sched_ext/prog_run.bpf.c | 33 + - tools/testing/selftests/sched_ext/prog_run.c | 78 + - .../testing/selftests/sched_ext/reload_loop.c | 75 + - tools/testing/selftests/sched_ext/runner.c | 201 + - tools/testing/selftests/sched_ext/scx_test.h | 131 + - .../selftests/sched_ext/select_cpu_dfl.bpf.c | 40 + - 
.../selftests/sched_ext/select_cpu_dfl.c | 72 + - .../sched_ext/select_cpu_dfl_nodispatch.bpf.c | 89 + - .../sched_ext/select_cpu_dfl_nodispatch.c | 72 + - .../sched_ext/select_cpu_dispatch.bpf.c | 41 + - .../selftests/sched_ext/select_cpu_dispatch.c | 70 + - .../select_cpu_dispatch_bad_dsq.bpf.c | 37 + - .../sched_ext/select_cpu_dispatch_bad_dsq.c | 56 + - .../select_cpu_dispatch_dbl_dsp.bpf.c | 38 + - .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 56 + - .../sched_ext/select_cpu_vtime.bpf.c | 92 + - .../selftests/sched_ext/select_cpu_vtime.c | 59 + - .../selftests/sched_ext/test_example.c | 49 + - tools/testing/selftests/sched_ext/util.c | 71 + - tools/testing/selftests/sched_ext/util.h | 13 + - 97 files changed, 16193 insertions(+), 130 deletions(-) - create mode 100644 Documentation/scheduler/sched-ext.rst - create mode 100644 include/linux/sched/ext.h - create mode 100644 include/trace/events/sched_ext.h - create mode 100644 kernel/sched/ext.c - create mode 100644 kernel/sched/ext.h - create mode 100644 tools/sched_ext/.gitignore - create mode 100644 tools/sched_ext/Makefile - create mode 100644 tools/sched_ext/README.md - create mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h - create mode 100644 tools/sched_ext/include/scx/common.bpf.h - create mode 100644 tools/sched_ext/include/scx/common.h - create mode 100644 tools/sched_ext/include/scx/compat.bpf.h - create mode 100644 tools/sched_ext/include/scx/compat.h - create mode 100644 tools/sched_ext/include/scx/user_exit_info.h - create mode 100644 tools/sched_ext/scx_central.bpf.c - create mode 100644 tools/sched_ext/scx_central.c - create mode 100644 tools/sched_ext/scx_flatcg.bpf.c - create mode 100644 tools/sched_ext/scx_flatcg.c - create mode 100644 tools/sched_ext/scx_flatcg.h - create mode 100644 tools/sched_ext/scx_qmap.bpf.c - create mode 100644 tools/sched_ext/scx_qmap.c - create mode 100644 tools/sched_ext/scx_show_state.py - create mode 100644 tools/sched_ext/scx_simple.bpf.c - create mode 
100644 tools/sched_ext/scx_simple.c - create mode 100644 tools/testing/selftests/sched_ext/.gitignore - create mode 100644 tools/testing/selftests/sched_ext/Makefile - create mode 100644 tools/testing/selftests/sched_ext/config - create mode 100644 tools/testing/selftests/sched_ext/create_dsq.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/create_dsq.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c - create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.c - create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c - create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.c - create mode 100644 tools/testing/selftests/sched_ext/exit.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/exit.c - create mode 100644 tools/testing/selftests/sched_ext/exit_test.h - create mode 100644 tools/testing/selftests/sched_ext/hotplug.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/hotplug.c - create mode 100644 tools/testing/selftests/sched_ext/hotplug_test.h - create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.c - create mode 100644 tools/testing/selftests/sched_ext/maximal.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/maximal.c - create mode 100644 tools/testing/selftests/sched_ext/maybe_null.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/maybe_null.c - create mode 100644 
tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/minimal.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/minimal.c - create mode 100644 tools/testing/selftests/sched_ext/prog_run.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/prog_run.c - create mode 100644 tools/testing/selftests/sched_ext/reload_loop.c - create mode 100644 tools/testing/selftests/sched_ext/runner.c - create mode 100644 tools/testing/selftests/sched_ext/scx_test.h - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.c - create mode 100644 tools/testing/selftests/sched_ext/test_example.c - create mode 100644 tools/testing/selftests/sched_ext/util.c - create mode 100644 tools/testing/selftests/sched_ext/util.h - -diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst -index 43bd8a145b7a..0611dc3dda8e 100644 ---- a/Documentation/scheduler/index.rst -+++ b/Documentation/scheduler/index.rst -@@ -20,6 
+20,7 @@ Scheduler - sched-nice-design - sched-rt-group - sched-stats -+ sched-ext - sched-debug - - text_files -diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst -new file mode 100644 -index 000000000000..6c0d70e2e27d ---- /dev/null -+++ b/Documentation/scheduler/sched-ext.rst -@@ -0,0 +1,326 @@ -+========================== -+Extensible Scheduler Class -+========================== -+ -+sched_ext is a scheduler class whose behavior can be defined by a set of BPF -+programs - the BPF scheduler. -+ -+* sched_ext exports a full scheduling interface so that any scheduling -+ algorithm can be implemented on top. -+ -+* The BPF scheduler can group CPUs however it sees fit and schedule them -+ together, as tasks aren't tied to specific CPUs at the time of wakeup. -+ -+* The BPF scheduler can be turned on and off dynamically anytime. -+ -+* The system integrity is maintained no matter what the BPF scheduler does. -+ The default scheduling behavior is restored anytime an error is detected, -+ a runnable task stalls, or on invoking the SysRq key sequence -+ :kbd:`SysRq-S`. -+ -+* When the BPF scheduler triggers an error, debug information is dumped to -+ aid debugging. The debug dump is passed to and printed out by the -+ scheduler binary. The debug dump can also be accessed through the -+ `sched_ext_dump` tracepoint. The SysRq key sequence :kbd:`SysRq-D` -+ triggers a debug dump. This doesn't terminate the BPF scheduler and can -+ only be read through the tracepoint. -+ -+Switching to and from sched_ext -+=============================== -+ -+``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and -+``tools/sched_ext`` contains the example schedulers. The following config -+options should be enabled to use sched_ext: -+ -+.. 
code-block:: none -+ -+ CONFIG_BPF=y -+ CONFIG_SCHED_CLASS_EXT=y -+ CONFIG_BPF_SYSCALL=y -+ CONFIG_BPF_JIT=y -+ CONFIG_DEBUG_INFO_BTF=y -+ CONFIG_BPF_JIT_ALWAYS_ON=y -+ CONFIG_BPF_JIT_DEFAULT_ON=y -+ CONFIG_PAHOLE_HAS_SPLIT_BTF=y -+ CONFIG_PAHOLE_HAS_BTF_TAG=y -+ -+sched_ext is used only when the BPF scheduler is loaded and running. -+ -+If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be -+treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is -+loaded. -+ -+When the BPF scheduler is loaded and ``SCX_OPS_SWITCH_PARTIAL`` is not set -+in ``ops->flags``, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE``, and -+``SCHED_EXT`` tasks are scheduled by sched_ext. -+ -+However, when the BPF scheduler is loaded and ``SCX_OPS_SWITCH_PARTIAL`` is -+set in ``ops->flags``, only tasks with the ``SCHED_EXT`` policy are scheduled -+by sched_ext, while tasks with ``SCHED_NORMAL``, ``SCHED_BATCH`` and -+``SCHED_IDLE`` policies are scheduled by CFS. -+ -+Terminating the sched_ext scheduler program, triggering :kbd:`SysRq-S`, or -+detection of any internal error including stalled runnable tasks aborts the -+BPF scheduler and reverts all tasks back to CFS. -+ -+.. code-block:: none -+ -+ # make -j16 -C tools/sched_ext -+ # tools/sched_ext/scx_simple -+ local=0 global=3 -+ local=5 global=24 -+ local=9 global=44 -+ local=13 global=56 -+ local=17 global=72 -+ ^CEXIT: BPF scheduler unregistered -+ -+The current status of the BPF scheduler can be determined as follows: -+ -+.. code-block:: none -+ -+ # cat /sys/kernel/sched_ext/state -+ enabled -+ # cat /sys/kernel/sched_ext/root/ops -+ simple -+ -+You can check if any BPF scheduler has ever been loaded since boot by examining -+this monotonically incrementing counter (a value of zero indicates that no BPF -+scheduler has been loaded): -+ -+.. 
code-block:: none -+ -+ # cat /sys/kernel/sched_ext/enable_seq -+ 1 -+ -+``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more -+detailed information: -+ -+.. code-block:: none -+ -+ # tools/sched_ext/scx_show_state.py -+ ops : simple -+ enabled : 1 -+ switching_all : 1 -+ switched_all : 1 -+ enable_state : enabled (2) -+ bypass_depth : 0 -+ nr_rejected : 0 -+ enable_seq : 1 -+ -+If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can -+be determined as follows: -+ -+.. code-block:: none -+ -+ # grep ext /proc/self/sched -+ ext.enabled : 1 -+ -+The Basics -+========== -+ -+Userspace can implement an arbitrary BPF scheduler by loading a set of BPF -+programs that implement ``struct sched_ext_ops``. The only mandatory field -+is ``ops.name`` which must be a valid BPF object name. All operations are -+optional. The following modified excerpt is from -+``tools/sched_ext/scx_simple.bpf.c`` showing a minimal global FIFO scheduler. -+ -+.. code-block:: c -+ -+ /* -+ * Decide which CPU a task should be migrated to before being -+ * enqueued (either at wakeup, fork time, or exec time). If an -+ * idle core is found by the default ops.select_cpu() implementation, -+ * then dispatch the task directly to SCX_DSQ_LOCAL and skip the -+ * ops.enqueue() callback. -+ * -+ * Note that this implementation has exactly the same behavior as the -+ * default ops.select_cpu implementation. The behavior of the scheduler -+ * would be exactly same if the implementation just didn't define the -+ * simple_select_cpu() struct_ops prog. 
-+ */ -+ s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+ { -+ s32 cpu; -+ /* Need to initialize or the BPF verifier will reject the program */ -+ bool direct = false; -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &direct); -+ -+ if (direct) -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); -+ -+ return cpu; -+ } -+ -+ /* -+ * Do a direct dispatch of a task to the global DSQ. This ops.enqueue() -+ * callback will only be invoked if we failed to find a core to dispatch -+ * to in ops.select_cpu() above. -+ * -+ * Note that this implementation has exactly the same behavior as the -+ * default ops.enqueue implementation, which just dispatches the task -+ * to SCX_DSQ_GLOBAL. The behavior of the scheduler would be exactly same -+ * if the implementation just didn't define the simple_enqueue struct_ops -+ * prog. -+ */ -+ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) -+ { -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } -+ -+ s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) -+ { -+ /* -+ * By default, all SCHED_EXT, SCHED_OTHER, SCHED_IDLE, and -+ * SCHED_BATCH tasks should use sched_ext. -+ */ -+ return 0; -+ } -+ -+ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) -+ { -+ exit_type = ei->type; -+ } -+ -+ SEC(".struct_ops") -+ struct sched_ext_ops simple_ops = { -+ .select_cpu = (void *)simple_select_cpu, -+ .enqueue = (void *)simple_enqueue, -+ .init = (void *)simple_init, -+ .exit = (void *)simple_exit, -+ .name = "simple", -+ }; -+ -+Dispatch Queues -+--------------- -+ -+To match the impedance between the scheduler core and the BPF scheduler, -+sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a -+priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``), -+and one local dsq per CPU (``SCX_DSQ_LOCAL``). 
The BPF scheduler can manage -+an arbitrary number of dsq's using ``scx_bpf_create_dsq()`` and -+``scx_bpf_destroy_dsq()``. -+ -+A CPU always executes a task from its local DSQ. A task is "dispatched" to a -+DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's -+local DSQ. -+ -+When a CPU is looking for the next task to run, if the local DSQ is not -+empty, the first task is picked. Otherwise, the CPU tries to consume the -+global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()`` -+is invoked. -+ -+Scheduling Cycle -+---------------- -+ -+The following briefly shows how a waking task is scheduled and executed. -+ -+1. When a task is waking up, ``ops.select_cpu()`` is the first operation -+ invoked. This serves two purposes. First, CPU selection optimization -+ hint. Second, waking up the selected CPU if idle. -+ -+ The CPU selected by ``ops.select_cpu()`` is an optimization hint and not -+ binding. The actual decision is made at the last step of scheduling. -+ However, there is a small performance gain if the CPU -+ ``ops.select_cpu()`` returns matches the CPU the task eventually runs on. -+ -+ A side-effect of selecting a CPU is waking it up from idle. While a BPF -+ scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, -+ using ``ops.select_cpu()`` judiciously can be simpler and more efficient. -+ -+ A task can be immediately dispatched to a DSQ from ``ops.select_cpu()`` by -+ calling ``scx_bpf_dispatch()``. If the task is dispatched to -+ ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be dispatched to the -+ local DSQ of whichever CPU is returned from ``ops.select_cpu()``. -+ Additionally, dispatching directly from ``ops.select_cpu()`` will cause the -+ ``ops.enqueue()`` callback to be skipped. -+ -+ Note that the scheduler core will ignore an invalid CPU selection, for -+ example, if it's outside the allowed cpumask of the task. -+ -+2. 
Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the -+ task was dispatched directly from ``ops.select_cpu()``). ``ops.enqueue()`` -+ can make one of the following decisions: -+ -+ * Immediately dispatch the task to either the global or local DSQ by -+ calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or -+ ``SCX_DSQ_LOCAL``, respectively. -+ -+ * Immediately dispatch the task to a custom DSQ by calling -+ ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63. -+ -+ * Queue the task on the BPF side. -+ -+3. When a CPU is ready to schedule, it first looks at its local DSQ. If -+ empty, it then looks at the global DSQ. If there still isn't a task to -+ run, ``ops.dispatch()`` is invoked which can use the following two -+ functions to populate the local DSQ. -+ -+ * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can -+ be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``, -+ ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()`` -+ currently can't be called with BPF locks held, this is being worked on -+ and will be supported. ``scx_bpf_dispatch()`` schedules dispatching -+ rather than performing them immediately. There can be up to -+ ``ops.dispatch_max_batch`` pending tasks. -+ -+ * ``scx_bpf_consume()`` transfers a task from the specified non-local DSQ -+ to the dispatching DSQ. This function cannot be called with any BPF -+ locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks -+ before trying to consume the specified DSQ. -+ -+4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ, -+ the CPU runs the first one. If empty, the following steps are taken: -+ -+ * Try to consume the global DSQ. If successful, run the task. -+ -+ * If ``ops.dispatch()`` has dispatched any tasks, retry #3. -+ -+ * If the previous task is an SCX task and still runnable, keep executing -+ it (see ``SCX_OPS_ENQ_LAST``). -+ -+ * Go idle. 
-+ -+Note that the BPF scheduler can always choose to dispatch tasks immediately -+in ``ops.enqueue()`` as illustrated in the above simple example. If only the -+built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as -+a task is never queued on the BPF scheduler and both the local and global -+DSQs are consumed automatically. -+ -+``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use -+``scx_bpf_dispatch_vtime()`` for the priority queue. Internal DSQs such as -+``SCX_DSQ_LOCAL`` and ``SCX_DSQ_GLOBAL`` do not support priority-queue -+dispatching, and must be dispatched to with ``scx_bpf_dispatch()``. See the -+function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for -+more information. -+ -+Where to Look -+============= -+ -+* ``include/linux/sched/ext.h`` defines the core data structures, ops table -+ and constants. -+ -+* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers. -+ The functions prefixed with ``scx_bpf_`` can be called from the BPF -+ scheduler. -+ -+* ``tools/sched_ext/`` hosts example BPF scheduler implementations. -+ -+ * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a -+ custom DSQ. -+ -+ * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five -+ levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. -+ -+ABI Instability -+=============== -+ -+The APIs provided by sched_ext to BPF scheduler programs have no stability -+guarantees. This includes the ops table callbacks and constants defined in -+``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in -+``kernel/sched/ext.c``. -+ -+While we will attempt to provide a relatively stable API surface when -+possible, they are subject to change without warning between kernel -+versions. 
-diff --git a/MAINTAINERS b/MAINTAINERS -index 16df466c205d..3345a15afded 100644 ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -20353,6 +20353,19 @@ F: include/linux/wait.h - F: include/uapi/linux/sched.h - F: kernel/sched/ - -+SCHEDULER - SCHED_EXT -+R: Tejun Heo -+R: David Vernet -+L: linux-kernel@vger.kernel.org -+S: Maintained -+W: https://github.com/sched-ext/scx -+T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git -+F: include/linux/sched/ext.h -+F: kernel/sched/ext.h -+F: kernel/sched/ext.c -+F: tools/sched_ext/ -+F: tools/testing/selftests/sched_ext -+ - SCIOSENSE ENS160 MULTI-GAS SENSOR DRIVER - M: Gustavo Silva - S: Maintained -diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c -index 14f8f00fdcf9..930b04e3d148 100644 ---- a/drivers/tty/sysrq.c -+++ b/drivers/tty/sysrq.c -@@ -531,6 +531,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = { - NULL, /* P */ - NULL, /* Q */ - &sysrq_replay_logs_op, /* R */ -+ /* S: May be registered by sched_ext for resetting */ - NULL, /* S */ - NULL, /* T */ - NULL, /* U */ -diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index 1ae44793132a..19ec49a9179b 100644 ---- a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -133,6 +133,7 @@ - *(__dl_sched_class) \ - *(__rt_sched_class) \ - *(__fair_sched_class) \ -+ *(__ext_sched_class) \ - *(__idle_sched_class) \ - __sched_class_lowest = .; - -diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h -index c60ba0ab1462..7139b33cb104 100644 ---- a/include/linux/cgroup.h -+++ b/include/linux/cgroup.h -@@ -28,8 +28,6 @@ - - struct kernel_clone_args; - --#ifdef CONFIG_CGROUPS -- - /* - * All weight knobs on the default hierarchy should use the following min, - * default and max values. 
The default value is the logarithmic center of -@@ -39,6 +37,8 @@ struct kernel_clone_args; - #define CGROUP_WEIGHT_DFL 100 - #define CGROUP_WEIGHT_MAX 10000 - -+#ifdef CONFIG_CGROUPS -+ - enum { - CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ - CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 1c771ea4481d..c5a7901b2580 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -82,6 +82,8 @@ struct task_group; - struct task_struct; - struct user_event_mm; - -+#include -+ - /* - * Task state bitmask. NOTE! These bits are also - * encoded in fs/proc/array.c: get_task_state(). -@@ -812,6 +814,9 @@ struct task_struct { - struct sched_rt_entity rt; - struct sched_dl_entity dl; - struct sched_dl_entity *dl_server; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ struct sched_ext_entity scx; -+#endif - const struct sched_class *sched_class; - - #ifdef CONFIG_SCHED_CORE -diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h -new file mode 100644 -index 000000000000..76166d3b14fc ---- /dev/null -+++ b/include/linux/sched/ext.h -@@ -0,0 +1,216 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef _LINUX_SCHED_EXT_H -+#define _LINUX_SCHED_EXT_H -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ -+#include -+#include -+ -+enum scx_public_consts { -+ SCX_OPS_NAME_LEN = 128, -+ -+ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ -+ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ -+}; -+ -+/* -+ * DSQ (dispatch queue) IDs are 64bit of the format: -+ * -+ * Bits: [63] [62 .. 
0] -+ * [ B] [ ID ] -+ * -+ * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs -+ * ID: 63 bit ID -+ * -+ * Built-in IDs: -+ * -+ * Bits: [63] [62] [61..32] [31 .. 0] -+ * [ 1] [ L] [ R ] [ V ] -+ * -+ * 1: 1 for built-in DSQs. -+ * L: 1 for LOCAL_ON DSQ IDs, 0 for others -+ * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. -+ */ -+enum scx_dsq_id_flags { -+ SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, -+ SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, -+ -+ SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, -+ SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, -+ SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, -+ SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, -+ SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, -+}; -+ -+/* -+ * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered -+ * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to -+ * buffer between the scheduler core and the BPF scheduler. See the -+ * documentation for more details. -+ */ -+struct scx_dispatch_q { -+ raw_spinlock_t lock; -+ struct list_head list; /* tasks in dispatch order */ -+ struct rb_root priq; /* used to order by p->scx.dsq_vtime */ -+ u32 nr; -+ u32 seq; /* used by BPF iter */ -+ u64 id; -+ struct rhash_head hash_node; -+ struct llist_node free_node; -+ struct rcu_head rcu; -+}; -+ -+/* scx_entity.flags */ -+enum scx_ent_flags { -+ SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ -+ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ -+ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ -+ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ -+ -+ SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ -+ SCX_TASK_STATE_BITS = 2, -+ SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, -+ -+ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ -+}; -+ -+/* scx_entity.flags & SCX_TASK_STATE_MASK */ -+enum scx_task_state { -+ SCX_TASK_NONE, 
/* ops.init_task() not called yet */ -+ SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ -+ SCX_TASK_READY, /* fully initialized, but not in sched_ext */ -+ SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ -+ -+ SCX_TASK_NR_STATES, -+}; -+ -+/* scx_entity.dsq_flags */ -+enum scx_ent_dsq_flags { -+ SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ -+}; -+ -+/* -+ * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from -+ * everywhere and the following bits track which kfunc sets are currently -+ * allowed for %current. This simple per-task tracking works because SCX ops -+ * nest in a limited way. BPF will likely implement a way to allow and disallow -+ * kfuncs depending on the calling context which will replace this manual -+ * mechanism. See scx_kf_allow(). -+ */ -+enum scx_kf_mask { -+ SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ -+ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ -+ SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ -+ /* ops.dequeue (in REST) may be nested inside DISPATCH */ -+ SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ -+ SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ -+ SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ -+ SCX_KF_REST = 1 << 4, /* other rq-locked operations */ -+ -+ __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | -+ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, -+ __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, -+}; -+ -+enum scx_dsq_lnode_flags { -+ SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0, -+ -+ /* high 16 bits can be for iter cursor flags */ -+ __SCX_DSQ_LNODE_PRIV_SHIFT = 16, -+}; -+ -+struct scx_dsq_list_node { -+ struct list_head node; -+ u32 flags; -+ u32 priv; /* can be used by iter cursor */ -+}; -+ -+/* -+ * The following is embedded in task_struct and contains all fields necessary -+ * for a task to be scheduled by SCX. 
-+ */ -+struct sched_ext_entity { -+ struct scx_dispatch_q *dsq; -+ struct scx_dsq_list_node dsq_list; /* dispatch order */ -+ struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ -+ u32 dsq_seq; -+ u32 dsq_flags; /* protected by DSQ lock */ -+ u32 flags; /* protected by rq lock */ -+ u32 weight; -+ s32 sticky_cpu; -+ s32 holding_cpu; -+ u32 kf_mask; /* see scx_kf_mask above */ -+ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ -+ atomic_long_t ops_state; -+ -+ struct list_head runnable_node; /* rq->scx.runnable_list */ -+ unsigned long runnable_at; -+ -+#ifdef CONFIG_SCHED_CORE -+ u64 core_sched_at; /* see scx_prio_less() */ -+#endif -+ u64 ddsp_dsq_id; -+ u64 ddsp_enq_flags; -+ -+ /* BPF scheduler modifiable fields */ -+ -+ /* -+ * Runtime budget in nsecs. This is usually set through -+ * scx_bpf_dispatch() but can also be modified directly by the BPF -+ * scheduler. Automatically decreased by SCX as the task executes. On -+ * depletion, a scheduling event is triggered. -+ * -+ * This value is cleared to zero if the task is preempted by -+ * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the -+ * task ran. Use p->se.sum_exec_runtime instead. -+ */ -+ u64 slice; -+ -+ /* -+ * Used to order tasks when dispatching to the vtime-ordered priority -+ * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() -+ * but can also be modified directly by the BPF scheduler. Modifying it -+ * while a task is queued on a dsq may mangle the ordering and is not -+ * recommended. -+ */ -+ u64 dsq_vtime; -+ -+ /* -+ * If set, reject future sched_setscheduler(2) calls updating the policy -+ * to %SCHED_EXT with -%EACCES. -+ * -+ * Can be set from ops.init_task() while the BPF scheduler is being -+ * loaded (!scx_init_task_args->fork). If set and the task's policy is -+ * already %SCHED_EXT, the task's policy is rejected and forcefully -+ * reverted to %SCHED_NORMAL. 
The number of such events are reported -+ * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag -+ * during fork is not allowed. -+ */ -+ bool disallow; /* reject switching into SCX */ -+ -+ /* cold fields */ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ struct cgroup *cgrp_moving_from; -+#endif -+ /* must be the last field, see init_scx_entity() */ -+ struct list_head tasks_node; -+}; -+ -+void sched_ext_free(struct task_struct *p); -+void print_scx_info(const char *log_lvl, struct task_struct *p); -+ -+#else /* !CONFIG_SCHED_CLASS_EXT */ -+ -+static inline void sched_ext_free(struct task_struct *p) {} -+static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} -+ -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+#endif /* _LINUX_SCHED_EXT_H */ -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index d362aacf9f89..0f2aeb37bbb0 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); - extern void init_idle(struct task_struct *idle, int cpu); - - extern int sched_fork(unsigned long clone_flags, struct task_struct *p); --extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); -+extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); -+extern void sched_cancel_fork(struct task_struct *p); - extern void sched_post_fork(struct task_struct *p); - extern void sched_dead(struct task_struct *p); - -@@ -119,6 +120,11 @@ static inline struct task_struct *get_task_struct(struct task_struct *t) - return t; - } - -+static inline struct task_struct *tryget_task_struct(struct task_struct *t) -+{ -+ return refcount_inc_not_zero(&t->usage) ? 
t : NULL; -+} -+ - extern void __put_task_struct(struct task_struct *t); - extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); - -diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h -new file mode 100644 -index 000000000000..fe19da7315a9 ---- /dev/null -+++ b/include/trace/events/sched_ext.h -@@ -0,0 +1,32 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#undef TRACE_SYSTEM -+#define TRACE_SYSTEM sched_ext -+ -+#if !defined(_TRACE_SCHED_EXT_H) || defined(TRACE_HEADER_MULTI_READ) -+#define _TRACE_SCHED_EXT_H -+ -+#include -+ -+TRACE_EVENT(sched_ext_dump, -+ -+ TP_PROTO(const char *line), -+ -+ TP_ARGS(line), -+ -+ TP_STRUCT__entry( -+ __string(line, line) -+ ), -+ -+ TP_fast_assign( -+ __assign_str(line); -+ ), -+ -+ TP_printk("%s", -+ __get_str(line) -+ ) -+); -+ -+#endif /* _TRACE_SCHED_EXT_H */ -+ -+/* This part must be outside protection */ -+#include -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab2..359a14cc76a4 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -118,6 +118,7 @@ struct clone_args { - /* SCHED_ISO: reserved but not implemented yet */ - #define SCHED_IDLE 5 - #define SCHED_DEADLINE 6 -+#define SCHED_EXT 7 - - /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ - #define SCHED_RESET_ON_FORK 0x40000000 -diff --git a/init/Kconfig b/init/Kconfig -index 08a0d51afaae..e1a88d48d652 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1028,9 +1028,13 @@ menuconfig CGROUP_SCHED - tasks. - - if CGROUP_SCHED -+config GROUP_SCHED_WEIGHT -+ def_bool n -+ - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -+ select GROUP_SCHED_WEIGHT - default CGROUP_SCHED - - config CFS_BANDWIDTH -@@ -1055,6 +1059,12 @@ config RT_GROUP_SCHED - realtime bandwidth for them. - See Documentation/scheduler/sched-rt-group.rst for more information. 
- -+config EXT_GROUP_SCHED -+ bool -+ depends on SCHED_CLASS_EXT && CGROUP_SCHED -+ select GROUP_SCHED_WEIGHT -+ default y -+ - endif #CGROUP_SCHED - - config SCHED_MM_CID -diff --git a/init/init_task.c b/init/init_task.c -index eeb110c65fe2..e222722e790b 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -6,6 +6,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -98,6 +99,17 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { - #endif - #ifdef CONFIG_CGROUP_SCHED - .sched_task_group = &root_task_group, -+#endif -+#ifdef CONFIG_SCHED_CLASS_EXT -+ .scx = { -+ .dsq_list.node = LIST_HEAD_INIT(init_task.scx.dsq_list.node), -+ .sticky_cpu = -1, -+ .holding_cpu = -1, -+ .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), -+ .runnable_at = INITIAL_JIFFIES, -+ .ddsp_dsq_id = SCX_DSQ_INVALID, -+ .slice = SCX_SLICE_DFL, -+ }, - #endif - .ptraced = LIST_HEAD_INIT(init_task.ptraced), - .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a821..fe782cd77388 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -133,4 +133,29 @@ config SCHED_CORE - which is the likely usage by Linux distributions, there should - be no measurable impact on performance. - -- -+config SCHED_CLASS_EXT -+ bool "Extensible Scheduling Class" -+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF -+ select STACKTRACE if STACKTRACE_SUPPORT -+ help -+ This option enables a new scheduler class sched_ext (SCX), which -+ allows scheduling policies to be implemented as BPF programs to -+ achieve the following: -+ -+ - Ease of experimentation and exploration: Enabling rapid -+ iteration of new scheduling policies. -+ - Customization: Building application-specific schedulers which -+ implement policies that are not applicable to general-purpose -+ schedulers. 
-+ - Rapid scheduler deployments: Non-disruptive swap outs of -+ scheduling policies in production environments. -+ -+ sched_ext leverages BPF struct_ops feature to define a structure -+ which exports function callbacks and flags to BPF programs that -+ wish to implement scheduling policies. The struct_ops structure -+ exported by sched_ext is struct sched_ext_ops, and is conceptually -+ similar to struct sched_class. -+ -+ For more information: -+ Documentation/scheduler/sched-ext.rst -+ https://github.com/sched-ext/scx -diff --git a/kernel/fork.c b/kernel/fork.c -index 003de4829c15..eb290420d926 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -23,6 +23,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -973,6 +974,7 @@ void __put_task_struct(struct task_struct *tsk) - WARN_ON(refcount_read(&tsk->usage)); - WARN_ON(tsk == current); - -+ sched_ext_free(tsk); - io_uring_free(tsk); - cgroup_free(tsk); - task_numa_free(tsk, true); -@@ -2352,7 +2354,7 @@ __latent_entropy struct task_struct *copy_process( - - retval = perf_event_init_task(p, clone_flags); - if (retval) -- goto bad_fork_cleanup_policy; -+ goto bad_fork_sched_cancel_fork; - retval = audit_alloc(p); - if (retval) - goto bad_fork_cleanup_perf; -@@ -2485,7 +2487,9 @@ __latent_entropy struct task_struct *copy_process( - * cgroup specific, it unconditionally needs to place the task on a - * runqueue. 
- */ -- sched_cgroup_fork(p, args); -+ retval = sched_cgroup_fork(p, args); -+ if (retval) -+ goto bad_fork_cancel_cgroup; - - /* - * From this point on we must avoid any synchronous user-space -@@ -2531,13 +2535,13 @@ __latent_entropy struct task_struct *copy_process( - /* Don't start children in a dying pid namespace */ - if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { - retval = -ENOMEM; -- goto bad_fork_cancel_cgroup; -+ goto bad_fork_core_free; - } - - /* Let kill terminate clone/fork in the middle */ - if (fatal_signal_pending(current)) { - retval = -EINTR; -- goto bad_fork_cancel_cgroup; -+ goto bad_fork_core_free; - } - - /* No more failure paths after this point. */ -@@ -2611,10 +2615,11 @@ __latent_entropy struct task_struct *copy_process( - - return p; - --bad_fork_cancel_cgroup: -+bad_fork_core_free: - sched_core_free(p); - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); -+bad_fork_cancel_cgroup: - cgroup_cancel_fork(p, args); - bad_fork_put_pidfd: - if (clone_flags & CLONE_PIDFD) { -@@ -2653,6 +2658,8 @@ __latent_entropy struct task_struct *copy_process( - audit_free(p); - bad_fork_cleanup_perf: - perf_event_free_task(p); -+bad_fork_sched_cancel_fork: -+ sched_cancel_fork(p); - bad_fork_cleanup_policy: - lockdep_free_task(p); - #ifdef CONFIG_NUMA -diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c -index 39c315182b35..fae1f5c921eb 100644 ---- a/kernel/sched/build_policy.c -+++ b/kernel/sched/build_policy.c -@@ -16,18 +16,25 @@ - #include - #include - #include -+#include - #include - #include - - #include - #include -+#include - #include -+#include - #include -+#include -+#include - #include - #include - #include - #include - #include -+#include -+#include - - #include - -@@ -52,4 +59,8 @@ - #include "cputime.c" - #include "deadline.c" - -+#ifdef CONFIG_SCHED_CLASS_EXT -+# include "ext.c" -+#endif -+ - #include "syscalls.c" -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 
1af59cf714cd..8ae04bd4a5a4 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p) - if (p->sched_class == &idle_sched_class) - return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - -- return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ -+ if (task_on_scx(p)) -+ return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ -+ -+ return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ - } - - /* -@@ -198,6 +201,11 @@ static inline bool prio_less(const struct task_struct *a, - if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ - return cfs_prio_less(a, b, in_fi); - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ -+ return scx_prio_less(a, b, in_fi); -+#endif -+ - return false; - } - -@@ -1255,11 +1263,14 @@ bool sched_can_stop_tick(struct rq *rq) - return true; - - /* -- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; -- * if there's more than one we need the tick for involuntary -- * preemption. -+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks -+ * left. For CFS, if there's more than one we need the tick for -+ * involuntary preemption. For SCX, ask. - */ -- if (rq->nr_running > 1) -+ if (scx_enabled() && !scx_can_stop_tick(rq)) -+ return false; -+ -+ if (rq->cfs.nr_running > 1) - return false; - - /* -@@ -1341,8 +1352,8 @@ void set_load_weight(struct task_struct *p, bool update_load) - * SCHED_OTHER tasks have to update their load when changing their - * weight - */ -- if (update_load && p->sched_class == &fair_sched_class) -- reweight_task(p, &lw); -+ if (update_load && p->sched_class->reweight_task) -+ p->sched_class->reweight_task(task_rq(p), p, &lw); - else - p->se.load = lw; - } -@@ -2031,6 +2042,17 @@ inline int task_curr(const struct task_struct *p) - return cpu_curr(task_cpu(p)) == p; - } - -+/* -+ * ->switching_to() is called with the pi_lock and rq_lock held and must not -+ * mess with locking. 
-+ */ -+void check_class_changing(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class) -+{ -+ if (prev_class != p->sched_class && p->sched_class->switching_to) -+ p->sched_class->switching_to(rq, p); -+} -+ - /* - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, - * use the balance_callback list if you want balancing. -@@ -2289,7 +2311,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) - static inline bool is_cpu_allowed(struct task_struct *p, int cpu) - { - /* When not in the task's cpumask, no point in looking further. */ -- if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ if (!task_allowed_on_cpu(p, cpu)) - return false; - - /* migrate_disabled() must be allowed to finish. */ -@@ -2298,7 +2320,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) - - /* Non kernel threads are not allowed during either online or offline. */ - if (!(p->flags & PF_KTHREAD)) -- return cpu_active(cpu) && task_cpu_possible(cpu, p); -+ return cpu_active(cpu); - - /* KTHREAD_IS_PER_CPU is always allowed. */ - if (kthread_is_per_cpu(p)) -@@ -3775,6 +3797,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu) - - static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) - { -+ /* -+ * The BPF scheduler may depend on select_task_rq() being invoked during -+ * wakeups. In addition, @p may end up executing on a different CPU -+ * regardless of what happens in the wakeup path making the ttwu_queue -+ * optimization less meaningful. Skip if on SCX. -+ */ -+ if (task_on_scx(p)) -+ return false; -+ - /* - * Do not complicate things with the async wake_list while the CPU is - * in hotplug state. 
-@@ -4342,6 +4373,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) - p->rt.on_rq = 0; - p->rt.on_list = 0; - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ init_scx_entity(&p->scx); -+#endif -+ - #ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); - #endif -@@ -4582,10 +4617,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - - if (dl_prio(p->prio)) - return -EAGAIN; -- else if (rt_prio(p->prio)) -+ -+ scx_pre_fork(p); -+ -+ if (rt_prio(p->prio)) { - p->sched_class = &rt_sched_class; -- else -+#ifdef CONFIG_SCHED_CLASS_EXT -+ } else if (task_should_scx(p)) { -+ p->sched_class = &ext_sched_class; -+#endif -+ } else { - p->sched_class = &fair_sched_class; -+ } - - init_entity_runnable_average(&p->se); - -@@ -4605,7 +4648,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - return 0; - } - --void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) -+int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) - { - unsigned long flags; - -@@ -4632,11 +4675,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return scx_fork(p); -+} -+ -+void sched_cancel_fork(struct task_struct *p) -+{ -+ scx_cancel_fork(p); - } - - void sched_post_fork(struct task_struct *p) - { - uclamp_post_fork(p); -+ scx_post_fork(p); - } - - unsigned long to_ratio(u64 period, u64 runtime) -@@ -5469,6 +5520,7 @@ void sched_tick(void) - calc_global_load_tick(rq); - sched_core_tick(rq); - task_tick_mm_cid(rq, curr); -+ scx_tick(rq); - - rq_unlock(rq, &rf); - -@@ -5481,8 +5533,10 @@ void sched_tick(void) - wq_worker_tick(curr); - - #ifdef CONFIG_SMP -- rq->idle_balance = idle_cpu(cpu); -- sched_balance_trigger(rq); -+ if (!scx_switched_all()) { -+ rq->idle_balance = idle_cpu(cpu); -+ sched_balance_trigger(rq); -+ } - #endif - } - 
-@@ -5772,8 +5826,19 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) - static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) - { --#ifdef CONFIG_SMP -+ const struct sched_class *start_class = prev->sched_class; - const struct sched_class *class; -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ /* -+ * SCX requires a balance() call before every pick_next_task() including -+ * when waking up from SCHED_IDLE. If @start_class is below SCX, start -+ * from SCX instead. -+ */ -+ if (scx_enabled() && sched_class_above(&ext_sched_class, start_class)) -+ start_class = &ext_sched_class; -+#endif -+ - /* - * We must do the balancing pass before put_prev_task(), such - * that when we release the rq->lock the task is in the same -@@ -5782,11 +5847,10 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. 
- */ -- for_class_range(class, prev->sched_class, &idle_sched_class) { -- if (class->balance(rq, prev, rf)) -+ for_active_class_range(class, start_class, &idle_sched_class) { -+ if (class->balance && class->balance(rq, prev, rf)) - break; - } --#endif - - put_prev_task(rq, prev); - -@@ -5808,6 +5872,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - const struct sched_class *class; - struct task_struct *p; - -+ if (scx_enabled()) -+ goto restart; -+ - /* - * Optimization: we know that if all tasks are in the fair class we can - * call that function directly, but only if the @prev task wasn't of a -@@ -5847,10 +5914,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - restart: - put_prev_task_balance(rq, prev, rf); - -- for_each_class(class) { -+ for_each_active_class(class) { - p = class->pick_next_task(rq); -- if (p) -+ if (p) { -+ const struct sched_class *prev_class = prev->sched_class; -+ -+ if (class != prev_class && prev_class->switch_class) -+ prev_class->switch_class(rq, p); - return p; -+ } - } - - BUG(); /* The idle class should always have a runnable task. 
*/ -@@ -5880,7 +5952,7 @@ static inline struct task_struct *pick_task(struct rq *rq) - const struct sched_class *class; - struct task_struct *p; - -- for_each_class(class) { -+ for_each_active_class(class) { - p = class->pick_task(rq); - if (p) - return p; -@@ -6877,6 +6949,10 @@ void __setscheduler_prio(struct task_struct *p, int prio) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ else if (task_should_scx(p)) -+ p->sched_class = &ext_sched_class; -+#endif - else - p->sched_class = &fair_sched_class; - -@@ -7022,6 +7098,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) - } - - __setscheduler_prio(p, prio); -+ check_class_changing(rq, p, prev_class); - - if (queued) - enqueue_task(rq, p, queue_flag); -@@ -7436,6 +7513,7 @@ void sched_show_task(struct task_struct *p) - - print_worker_info(KERN_INFO, p); - print_stop_info(KERN_INFO, p); -+ print_scx_info(KERN_INFO, p); - show_stack(p, NULL, KERN_INFO); - put_task_stack(p); - } -@@ -7964,6 +8042,8 @@ int sched_cpu_activate(unsigned int cpu) - cpuset_cpu_active(); - } - -+ scx_rq_activate(rq); -+ - /* - * Put the rq online, if not already. This happens: - * -@@ -8013,6 +8093,8 @@ int sched_cpu_deactivate(unsigned int cpu) - - sched_set_rq_offline(rq, cpu); - -+ scx_rq_deactivate(rq); -+ - /* - * When going down, decrement the number of cores with SMT present. 
- */ -@@ -8197,11 +8279,15 @@ void __init sched_init(void) - int i; - - /* Make sure the linker didn't screw up */ -- BUG_ON(&idle_sched_class != &fair_sched_class + 1 || -- &fair_sched_class != &rt_sched_class + 1 || -- &rt_sched_class != &dl_sched_class + 1); - #ifdef CONFIG_SMP -- BUG_ON(&dl_sched_class != &stop_sched_class + 1); -+ BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); -+#endif -+ BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); -+ BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); -+ BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); -+#ifdef CONFIG_SCHED_CLASS_EXT -+ BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); -+ BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); - #endif - - wait_bit_init(); -@@ -8225,6 +8311,9 @@ void __init sched_init(void) - root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); - #endif /* CONFIG_FAIR_GROUP_SCHED */ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ root_task_group.scx_weight = CGROUP_WEIGHT_DFL; -+#endif /* CONFIG_EXT_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -@@ -8370,6 +8459,7 @@ void __init sched_init(void) - balance_push_set(smp_processor_id(), false); - #endif - init_sched_fair_class(); -+ init_sched_ext_class(); - - psi_init(); - -@@ -8655,6 +8745,7 @@ struct task_group *sched_create_group(struct task_group *parent) - if (!alloc_rt_sched_group(tg, parent)) - goto err; - -+ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); - alloc_uclamp_sched_group(tg, parent); - - return tg; -@@ -8782,6 +8873,7 @@ void sched_move_task(struct task_struct *tsk) - put_prev_task(rq, tsk); - - sched_change_group(tsk, group); -+ scx_move_task(tsk); - - if (queued) - enqueue_task(rq, tsk, queue_flags); -@@ -8796,11 +8888,6 @@ void sched_move_task(struct task_struct *tsk) - } - } - --static inline struct 
task_group *css_tg(struct cgroup_subsys_state *css) --{ -- return css ? container_of(css, struct task_group, css) : NULL; --} -- - static struct cgroup_subsys_state * - cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) - { -@@ -8824,6 +8911,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) - { - struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); -+ int ret; -+ -+ ret = scx_tg_online(tg); -+ if (ret) -+ return ret; - - if (parent) - sched_online_group(tg, parent); -@@ -8838,6 +8930,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) - return 0; - } - -+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ scx_tg_offline(tg); -+} -+ - static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) - { - struct task_group *tg = css_tg(css); -@@ -8855,9 +8954,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) - sched_unregister_group(tg); - } - --#ifdef CONFIG_RT_GROUP_SCHED - static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) - { -+#ifdef CONFIG_RT_GROUP_SCHED - struct task_struct *task; - struct cgroup_subsys_state *css; - -@@ -8865,9 +8964,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) - if (!sched_rt_can_attach(css_tg(css), task)) - return -EINVAL; - } -- return 0; --} - #endif -+ return scx_cgroup_can_attach(tset); -+} - - static void cpu_cgroup_attach(struct cgroup_taskset *tset) - { -@@ -8876,6 +8975,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) - - cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); -+ -+ scx_cgroup_finish_attach(); -+} -+ -+static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) -+{ -+ scx_cgroup_cancel_attach(tset); - } - - #ifdef CONFIG_UCLAMP_TASK_GROUP -@@ -9052,22 +9158,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) - } - #endif /* CONFIG_UCLAMP_TASK_GROUP */ - -+#ifdef 
CONFIG_GROUP_SCHED_WEIGHT -+static unsigned long tg_weight(struct task_group *tg) -+{ - #ifdef CONFIG_FAIR_GROUP_SCHED -+ return scale_load_down(tg->shares); -+#else -+ return sched_weight_from_cgroup(tg->scx_weight); -+#endif -+} -+ - static int cpu_shares_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 shareval) - { -+ int ret; -+ - if (shareval > scale_load_down(ULONG_MAX)) - shareval = MAX_SHARES; -- return sched_group_set_shares(css_tg(css), scale_load(shareval)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), -+ sched_weight_to_cgroup(shareval)); -+ return ret; - } - - static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -- struct task_group *tg = css_tg(css); -- -- return (u64) scale_load_down(tg->shares); -+ return tg_weight(css_tg(css)); - } -+#endif /* CONFIG_GROUP_SCHED_WEIGHT */ - - #ifdef CONFIG_CFS_BANDWIDTH - static DEFINE_MUTEX(cfs_constraints_mutex); -@@ -9413,7 +9533,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) - return 0; - } - #endif /* CONFIG_CFS_BANDWIDTH */ --#endif /* CONFIG_FAIR_GROUP_SCHED */ - - #ifdef CONFIG_RT_GROUP_SCHED - static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, -@@ -9441,7 +9560,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, - } - #endif /* CONFIG_RT_GROUP_SCHED */ - --#ifdef CONFIG_FAIR_GROUP_SCHED -+#ifdef CONFIG_GROUP_SCHED_WEIGHT - static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -@@ -9451,12 +9570,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, - static int cpu_idle_write_s64(struct cgroup_subsys_state *css, - struct cftype *cft, s64 idle) - { -- return sched_group_set_idle(css_tg(css), idle); -+ int ret; -+ -+ ret = sched_group_set_idle(css_tg(css), idle); -+ if (!ret) -+ scx_group_set_idle(css_tg(css), idle); -+ return ret; - } - #endif - - static struct cftype 
cpu_legacy_files[] = { --#ifdef CONFIG_FAIR_GROUP_SCHED -+#ifdef CONFIG_GROUP_SCHED_WEIGHT - { - .name = "shares", - .read_u64 = cpu_shares_read_u64, -@@ -9566,38 +9690,35 @@ static int cpu_local_stat_show(struct seq_file *sf, - return 0; - } - --#ifdef CONFIG_FAIR_GROUP_SCHED -+#ifdef CONFIG_GROUP_SCHED_WEIGHT -+ - static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -- struct task_group *tg = css_tg(css); -- u64 weight = scale_load_down(tg->shares); -- -- return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); -+ return sched_weight_to_cgroup(tg_weight(css_tg(css))); - } - - static int cpu_weight_write_u64(struct cgroup_subsys_state *css, -- struct cftype *cft, u64 weight) -+ struct cftype *cft, u64 cgrp_weight) - { -- /* -- * cgroup weight knobs should use the common MIN, DFL and MAX -- * values which are 1, 100 and 10000 respectively. While it loses -- * a bit of range on both ends, it maps pretty well onto the shares -- * value used by scheduler and the round-trip conversions preserve -- * the original value over the entire range. 
-- */ -- if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) -+ unsigned long weight; -+ int ret; -+ -+ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) - return -ERANGE; - -- weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); -+ weight = sched_weight_from_cgroup(cgrp_weight); - -- return sched_group_set_shares(css_tg(css), scale_load(weight)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), cgrp_weight); -+ return ret; - } - - static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -- unsigned long weight = scale_load_down(css_tg(css)->shares); -+ unsigned long weight = tg_weight(css_tg(css)); - int last_delta = INT_MAX; - int prio, delta; - -@@ -9616,7 +9737,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, - struct cftype *cft, s64 nice) - { - unsigned long weight; -- int idx; -+ int idx, ret; - - if (nice < MIN_NICE || nice > MAX_NICE) - return -ERANGE; -@@ -9625,9 +9746,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, - idx = array_index_nospec(idx, 40); - weight = sched_prio_to_weight[idx]; - -- return sched_group_set_shares(css_tg(css), scale_load(weight)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), -+ sched_weight_to_cgroup(weight)); -+ return ret; - } --#endif -+#endif /* CONFIG_GROUP_SCHED_WEIGHT */ - - static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, - long period, long quota) -@@ -9687,7 +9812,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, - #endif - - static struct cftype cpu_files[] = { --#ifdef CONFIG_FAIR_GROUP_SCHED -+#ifdef CONFIG_GROUP_SCHED_WEIGHT - { - .name = "weight", - .flags = CFTYPE_NOT_ON_ROOT, -@@ -9741,14 +9866,14 @@ static struct cftype cpu_files[] = { - struct cgroup_subsys cpu_cgrp_subsys = { - .css_alloc = 
cpu_cgroup_css_alloc, - .css_online = cpu_cgroup_css_online, -+ .css_offline = cpu_cgroup_css_offline, - .css_released = cpu_cgroup_css_released, - .css_free = cpu_cgroup_css_free, - .css_extra_stat_show = cpu_extra_stat_show, - .css_local_stat_show = cpu_local_stat_show, --#ifdef CONFIG_RT_GROUP_SCHED - .can_attach = cpu_cgroup_can_attach, --#endif - .attach = cpu_cgroup_attach, -+ .cancel_attach = cpu_cgroup_cancel_attach, - .legacy_cftypes = cpu_legacy_files, - .dfl_cftypes = cpu_files, - .early_init = true, -@@ -10338,3 +10463,38 @@ void sched_mm_cid_fork(struct task_struct *t) - t->mm_cid_active = 1; - } - #endif -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+void sched_deq_and_put_task(struct task_struct *p, int queue_flags, -+ struct sched_enq_and_set_ctx *ctx) -+{ -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_rq_held(rq); -+ -+ *ctx = (struct sched_enq_and_set_ctx){ -+ .p = p, -+ .queue_flags = queue_flags, -+ .queued = task_on_rq_queued(p), -+ .running = task_current(rq, p), -+ }; -+ -+ update_rq_clock(rq); -+ if (ctx->queued) -+ dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); -+ if (ctx->running) -+ put_prev_task(rq, p); -+} -+ -+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) -+{ -+ struct rq *rq = task_rq(ctx->p); -+ -+ lockdep_assert_rq_held(rq); -+ -+ if (ctx->queued) -+ enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); -+ if (ctx->running) -+ set_next_task(rq, ctx->p); -+} -+#endif /* CONFIG_SCHED_CLASS_EXT */ -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index eece6244f9d2..e683e5d08daa 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -197,8 +197,10 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, - - static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) - { -- unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); -+ unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu); - -+ if 
(!scx_switched_all()) -+ util += cpu_util_cfs_boost(sg_cpu->cpu); - util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); - util = max(util, boost); - sg_cpu->bw_min = min; -@@ -325,16 +327,35 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - } - - #ifdef CONFIG_NO_HZ_COMMON --static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) -+static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) - { -- unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); -- bool ret = idle_calls == sg_cpu->saved_idle_calls; -+ unsigned long idle_calls; -+ bool ret; -+ -+ /* -+ * The heuristics in this function is for the fair class. For SCX, the -+ * performance target comes directly from the BPF scheduler. Let's just -+ * follow it. -+ */ -+ if (scx_switched_all()) -+ return false; -+ -+ /* if capped by uclamp_max, always update to be in compliance */ -+ if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) -+ return false; -+ -+ /* -+ * Maintain the frequency if the CPU has not been idle recently, as -+ * reduction is likely to be premature. -+ */ -+ idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); -+ ret = idle_calls == sg_cpu->saved_idle_calls; - - sg_cpu->saved_idle_calls = idle_calls; - return ret; - } - #else --static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } -+static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } - #endif /* CONFIG_NO_HZ_COMMON */ - - /* -@@ -382,14 +403,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, - return; - - next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); -- /* -- * Do not reduce the frequency if the CPU has not been idle -- * recently, as the reduction is likely to be premature then. -- * -- * Except when the rq is capped by uclamp_max. 
-- */ -- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && -- sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && -+ -+ if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq && - !sg_policy->need_freq_update) { - next_f = sg_policy->next_freq; - -@@ -436,14 +451,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, - if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) - return; - -- /* -- * Do not reduce the target performance level if the CPU has not been -- * idle recently, as the reduction is likely to be premature then. -- * -- * Except when the rq is capped by uclamp_max. -- */ -- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && -- sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) -+ if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) - sg_cpu->util = prev_util; - - cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index c1eb9a1afd13..c057ef46c5f8 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -1090,6 +1090,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, - P(dl.runtime); - P(dl.deadline); - } -+#ifdef CONFIG_SCHED_CLASS_EXT -+ __PS("ext.enabled", task_on_scx(p)); -+#endif - #undef PN_SCHEDSTAT - #undef P_SCHEDSTAT - -diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c -new file mode 100644 -index 000000000000..5fae2292ec29 ---- /dev/null -+++ b/kernel/sched/ext.c -@@ -0,0 +1,7281 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) -+ -+enum scx_consts { -+ SCX_DSP_DFL_MAX_BATCH = 32, -+ SCX_DSP_MAX_LOOPS = 32, -+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, -+ -+ SCX_EXIT_BT_LEN = 64, -+ SCX_EXIT_MSG_LEN = 1024, -+ SCX_EXIT_DUMP_DFL_LEN = 32768, -+ -+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, -+ -+ /* -+ * Iterating all tasks may take a while. Periodically drop -+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. -+ */ -+ SCX_OPS_TASK_ITER_BATCH = 32, -+}; -+ -+enum scx_exit_kind { -+ SCX_EXIT_NONE, -+ SCX_EXIT_DONE, -+ -+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ -+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ -+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ -+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ -+ -+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ -+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ -+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ -+}; -+ -+/* -+ * An exit code can be specified when exiting with scx_bpf_exit() or -+ * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN -+ * respectively. The codes are 64bit of the format: -+ * -+ * Bits: [63 .. 48 47 .. 32 31 .. 0] -+ * [ SYS ACT ] [ SYS RSN ] [ USR ] -+ * -+ * SYS ACT: System-defined exit actions -+ * SYS RSN: System-defined exit reasons -+ * USR : User-defined exit codes and reasons -+ * -+ * Using the above, users may communicate intention and context by ORing system -+ * actions and/or system reasons with a user-defined exit code. -+ */ -+enum scx_exit_code { -+ /* Reasons */ -+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, -+ -+ /* Actions */ -+ SCX_ECODE_ACT_RESTART = 1LLU << 48, -+}; -+ -+/* -+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is -+ * being disabled. 
-+ */ -+struct scx_exit_info { -+ /* %SCX_EXIT_* - broad category of the exit reason */ -+ enum scx_exit_kind kind; -+ -+ /* exit code if gracefully exiting */ -+ s64 exit_code; -+ -+ /* textual representation of the above */ -+ const char *reason; -+ -+ /* backtrace if exiting due to an error */ -+ unsigned long *bt; -+ u32 bt_len; -+ -+ /* informational message */ -+ char *msg; -+ -+ /* debug dump */ -+ char *dump; -+}; -+ -+/* sched_ext_ops.flags */ -+enum scx_ops_flags { -+ /* -+ * Keep built-in idle tracking even if ops.update_idle() is implemented. -+ */ -+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, -+ -+ /* -+ * By default, if there are no other task to run on the CPU, ext core -+ * keeps running the current task even after its slice expires. If this -+ * flag is specified, such tasks are passed to ops.enqueue() with -+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. -+ */ -+ SCX_OPS_ENQ_LAST = 1LLU << 1, -+ -+ /* -+ * An exiting task may schedule after PF_EXITING is set. In such cases, -+ * bpf_task_from_pid() may not be able to find the task and if the BPF -+ * scheduler depends on pid lookup for dispatching, the task will be -+ * lost leading to various issues including RCU grace period stalls. -+ * -+ * To mask this problem, by default, unhashed tasks are automatically -+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't -+ * depend on pid lookups and wants to handle these tasks directly, the -+ * following flag can be used. -+ */ -+ SCX_OPS_ENQ_EXITING = 1LLU << 2, -+ -+ /* -+ * If set, only tasks with policy set to SCHED_EXT are attached to -+ * sched_ext. If clear, SCHED_NORMAL tasks are also included. 
-+ */ -+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, -+ -+ /* -+ * CPU cgroup support flags -+ */ -+ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ -+ -+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | -+ SCX_OPS_ENQ_LAST | -+ SCX_OPS_ENQ_EXITING | -+ SCX_OPS_SWITCH_PARTIAL | -+ SCX_OPS_HAS_CGROUP_WEIGHT, -+}; -+ -+/* argument container for ops.init_task() */ -+struct scx_init_task_args { -+ /* -+ * Set if ops.init_task() is being invoked on the fork path, as opposed -+ * to the scheduler transition path. -+ */ -+ bool fork; -+#ifdef CONFIG_EXT_GROUP_SCHED -+ /* the cgroup the task is joining */ -+ struct cgroup *cgroup; -+#endif -+}; -+ -+/* argument container for ops.exit_task() */ -+struct scx_exit_task_args { -+ /* Whether the task exited before running on sched_ext. */ -+ bool cancelled; -+}; -+ -+/* argument container for ops->cgroup_init() */ -+struct scx_cgroup_init_args { -+ /* the weight of the cgroup [1..10000] */ -+ u32 weight; -+}; -+ -+enum scx_cpu_preempt_reason { -+ /* next task is being scheduled by &sched_class_rt */ -+ SCX_CPU_PREEMPT_RT, -+ /* next task is being scheduled by &sched_class_dl */ -+ SCX_CPU_PREEMPT_DL, -+ /* next task is being scheduled by &sched_class_stop */ -+ SCX_CPU_PREEMPT_STOP, -+ /* unknown reason for SCX being preempted */ -+ SCX_CPU_PREEMPT_UNKNOWN, -+}; -+ -+/* -+ * Argument container for ops->cpu_acquire(). Currently empty, but may be -+ * expanded in the future. -+ */ -+struct scx_cpu_acquire_args {}; -+ -+/* argument container for ops->cpu_release() */ -+struct scx_cpu_release_args { -+ /* the reason the CPU was preempted */ -+ enum scx_cpu_preempt_reason reason; -+ -+ /* the task that's going to be scheduled on the CPU */ -+ struct task_struct *task; -+}; -+ -+/* -+ * Informational context provided to dump operations. 
-+ */ -+struct scx_dump_ctx { -+ enum scx_exit_kind kind; -+ s64 exit_code; -+ const char *reason; -+ u64 at_ns; -+ u64 at_jiffies; -+}; -+ -+/** -+ * struct sched_ext_ops - Operation table for BPF scheduler implementation -+ * -+ * Userland can implement an arbitrary scheduling policy by implementing and -+ * loading operations in this table. -+ */ -+struct sched_ext_ops { -+ /** -+ * select_cpu - Pick the target CPU for a task which is being woken up -+ * @p: task being woken up -+ * @prev_cpu: the cpu @p was on before sleeping -+ * @wake_flags: SCX_WAKE_* -+ * -+ * Decision made here isn't final. @p may be moved to any CPU while it -+ * is getting dispatched for execution later. However, as @p is not on -+ * the rq at this point, getting the eventual execution CPU right here -+ * saves a small bit of overhead down the line. -+ * -+ * If an idle CPU is returned, the CPU is kicked and will try to -+ * dispatch. While an explicit custom mechanism can be added, -+ * select_cpu() serves as the default way to wake up idle CPUs. -+ * -+ * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p -+ * is dispatched, the ops.enqueue() callback will be skipped. Finally, -+ * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the -+ * local DSQ of whatever CPU is returned by this callback. -+ */ -+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); -+ -+ /** -+ * enqueue - Enqueue a task on the BPF scheduler -+ * @p: task being enqueued -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() -+ * or enqueue on the BPF scheduler. If not directly dispatched, the bpf -+ * scheduler owns @p and if it fails to dispatch @p, the task will -+ * stall. -+ * -+ * If @p was dispatched from ops.select_cpu(), this callback is -+ * skipped. 
-+ */ -+ void (*enqueue)(struct task_struct *p, u64 enq_flags); -+ -+ /** -+ * dequeue - Remove a task from the BPF scheduler -+ * @p: task being dequeued -+ * @deq_flags: %SCX_DEQ_* -+ * -+ * Remove @p from the BPF scheduler. This is usually called to isolate -+ * the task while updating its scheduling properties (e.g. priority). -+ * -+ * The ext core keeps track of whether the BPF side owns a given task or -+ * not and can gracefully ignore spurious dispatches from BPF side, -+ * which makes it safe to not implement this method. However, depending -+ * on the scheduling logic, this can lead to confusing behaviors - e.g. -+ * scheduling position not being updated across a priority change. -+ */ -+ void (*dequeue)(struct task_struct *p, u64 deq_flags); -+ -+ /** -+ * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs -+ * @cpu: CPU to dispatch tasks for -+ * @prev: previous task being switched out -+ * -+ * Called when a CPU's local dsq is empty. The operation should dispatch -+ * one or more tasks from the BPF scheduler into the DSQs using -+ * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using -+ * scx_bpf_consume(). -+ * -+ * The maximum number of times scx_bpf_dispatch() can be called without -+ * an intervening scx_bpf_consume() is specified by -+ * ops.dispatch_max_batch. See the comments on top of the two functions -+ * for more details. -+ * -+ * When not %NULL, @prev is an SCX task with its slice depleted. If -+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in -+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after -+ * ops.dispatch() returns. To keep executing @prev, return without -+ * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. -+ */ -+ void (*dispatch)(s32 cpu, struct task_struct *prev); -+ -+ /** -+ * tick - Periodic tick -+ * @p: task running currently -+ * -+ * This operation is called every 1/HZ seconds on CPUs which are -+ * executing an SCX task. 
Setting @p->scx.slice to 0 will trigger an -+ * immediate dispatch cycle on the CPU. -+ */ -+ void (*tick)(struct task_struct *p); -+ -+ /** -+ * runnable - A task is becoming runnable on its associated CPU -+ * @p: task becoming runnable -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * This and the following three functions can be used to track a task's -+ * execution state transitions. A task becomes ->runnable() on a CPU, -+ * and then goes through one or more ->running() and ->stopping() pairs -+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's -+ * done running on the CPU. -+ * -+ * @p is becoming runnable on the CPU because it's -+ * -+ * - waking up (%SCX_ENQ_WAKEUP) -+ * - being moved from another CPU -+ * - being restored after temporarily taken off the queue for an -+ * attribute change. -+ * -+ * This and ->enqueue() are related but not coupled. This operation -+ * notifies @p's state transition and may not be followed by ->enqueue() -+ * e.g. when @p is being dispatched to a remote CPU, or when @p is -+ * being enqueued on a CPU experiencing a hotplug event. Likewise, a -+ * task may be ->enqueue()'d without being preceded by this operation -+ * e.g. after exhausting its slice. -+ */ -+ void (*runnable)(struct task_struct *p, u64 enq_flags); -+ -+ /** -+ * running - A task is starting to run on its associated CPU -+ * @p: task starting to run -+ * -+ * See ->runnable() for explanation on the task state notifiers. -+ */ -+ void (*running)(struct task_struct *p); -+ -+ /** -+ * stopping - A task is stopping execution -+ * @p: task stopping to run -+ * @runnable: is task @p still runnable? -+ * -+ * See ->runnable() for explanation on the task state notifiers. If -+ * !@runnable, ->quiescent() will be invoked after this operation -+ * returns. 
-+ */ -+ void (*stopping)(struct task_struct *p, bool runnable); -+ -+ /** -+ * quiescent - A task is becoming not runnable on its associated CPU -+ * @p: task becoming not runnable -+ * @deq_flags: %SCX_DEQ_* -+ * -+ * See ->runnable() for explanation on the task state notifiers. -+ * -+ * @p is becoming quiescent on the CPU because it's -+ * -+ * - sleeping (%SCX_DEQ_SLEEP) -+ * - being moved to another CPU -+ * - being temporarily taken off the queue for an attribute change -+ * (%SCX_DEQ_SAVE) -+ * -+ * This and ->dequeue() are related but not coupled. This operation -+ * notifies @p's state transition and may not be preceded by ->dequeue() -+ * e.g. when @p is being dispatched to a remote CPU. -+ */ -+ void (*quiescent)(struct task_struct *p, u64 deq_flags); -+ -+ /** -+ * yield - Yield CPU -+ * @from: yielding task -+ * @to: optional yield target task -+ * -+ * If @to is NULL, @from is yielding the CPU to other runnable tasks. -+ * The BPF scheduler should ensure that other available tasks are -+ * dispatched before the yielding task. Return value is ignored in this -+ * case. -+ * -+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf -+ * scheduler can implement the request, return %true; otherwise, %false. -+ */ -+ bool (*yield)(struct task_struct *from, struct task_struct *to); -+ -+ /** -+ * core_sched_before - Task ordering for core-sched -+ * @a: task A -+ * @b: task B -+ * -+ * Used by core-sched to determine the ordering between two tasks. See -+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on -+ * core-sched. -+ * -+ * Both @a and @b are runnable and may or may not currently be queued on -+ * the BPF scheduler. Should return %true if @a should run before @b. -+ * %false if there's no required ordering or @b should run before @a. -+ * -+ * If not specified, the default is ordering them according to when they -+ * became runnable. 
-+ */ -+ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); -+ -+ /** -+ * set_weight - Set task weight -+ * @p: task to set weight for -+ * @weight: new weight [1..10000] -+ * -+ * Update @p's weight to @weight. -+ */ -+ void (*set_weight)(struct task_struct *p, u32 weight); -+ -+ /** -+ * set_cpumask - Set CPU affinity -+ * @p: task to set CPU affinity for -+ * @cpumask: cpumask of cpus that @p can run on -+ * -+ * Update @p's CPU affinity to @cpumask. -+ */ -+ void (*set_cpumask)(struct task_struct *p, -+ const struct cpumask *cpumask); -+ -+ /** -+ * update_idle - Update the idle state of a CPU -+ * @cpu: CPU to udpate the idle state for -+ * @idle: whether entering or exiting the idle state -+ * -+ * This operation is called when @rq's CPU goes or leaves the idle -+ * state. By default, implementing this operation disables the built-in -+ * idle CPU tracking and the following helpers become unavailable: -+ * -+ * - scx_bpf_select_cpu_dfl() -+ * - scx_bpf_test_and_clear_cpu_idle() -+ * - scx_bpf_pick_idle_cpu() -+ * -+ * The user also must implement ops.select_cpu() as the default -+ * implementation relies on scx_bpf_select_cpu_dfl(). -+ * -+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle -+ * tracking. -+ */ -+ void (*update_idle)(s32 cpu, bool idle); -+ -+ /** -+ * cpu_acquire - A CPU is becoming available to the BPF scheduler -+ * @cpu: The CPU being acquired by the BPF scheduler. -+ * @args: Acquire arguments, see the struct definition. -+ * -+ * A CPU that was previously released from the BPF scheduler is now once -+ * again under its control. -+ */ -+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); -+ -+ /** -+ * cpu_release - A CPU is taken away from the BPF scheduler -+ * @cpu: The CPU being released by the BPF scheduler. -+ * @args: Release arguments, see the struct definition. -+ * -+ * The specified CPU is no longer under the control of the BPF -+ * scheduler. 
This could be because it was preempted by a higher -+ * priority sched_class, though there may be other reasons as well. The -+ * caller should consult @args->reason to determine the cause. -+ */ -+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); -+ -+ /** -+ * init_task - Initialize a task to run in a BPF scheduler -+ * @p: task to initialize for BPF scheduling -+ * @args: init arguments, see the struct definition -+ * -+ * Either we're loading a BPF scheduler or a new task is being forked. -+ * Initialize @p for BPF scheduling. This operation may block and can -+ * be used for allocations, and is called exactly once for a task. -+ * -+ * Return 0 for success, -errno for failure. An error return while -+ * loading will abort loading of the BPF scheduler. During a fork, it -+ * will abort that specific fork. -+ */ -+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); -+ -+ /** -+ * exit_task - Exit a previously-running task from the system -+ * @p: task to exit -+ * -+ * @p is exiting or the BPF scheduler is being unloaded. Perform any -+ * necessary cleanup for @p. -+ */ -+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); -+ -+ /** -+ * enable - Enable BPF scheduling for a task -+ * @p: task to enable BPF scheduling for -+ * -+ * Enable @p for BPF scheduling. enable() is called on @p any time it -+ * enters SCX, and is always paired with a matching disable(). -+ */ -+ void (*enable)(struct task_struct *p); -+ -+ /** -+ * disable - Disable BPF scheduling for a task -+ * @p: task to disable BPF scheduling for -+ * -+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. -+ * Disable BPF scheduling for @p. A disable() call is always matched -+ * with a prior enable() call. -+ */ -+ void (*disable)(struct task_struct *p); -+ -+ /** -+ * dump - Dump BPF scheduler state on error -+ * @ctx: debug dump context -+ * -+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. 
-+ */ -+ void (*dump)(struct scx_dump_ctx *ctx); -+ -+ /** -+ * dump_cpu - Dump BPF scheduler state for a CPU on error -+ * @ctx: debug dump context -+ * @cpu: CPU to generate debug dump for -+ * @idle: @cpu is currently idle without any runnable tasks -+ * -+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for -+ * @cpu. If @idle is %true and this operation doesn't produce any -+ * output, @cpu is skipped for dump. -+ */ -+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); -+ -+ /** -+ * dump_task - Dump BPF scheduler state for a runnable task on error -+ * @ctx: debug dump context -+ * @p: runnable task to generate debug dump for -+ * -+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for -+ * @p. -+ */ -+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ /** -+ * cgroup_init - Initialize a cgroup -+ * @cgrp: cgroup being initialized -+ * @args: init arguments, see the struct definition -+ * -+ * Either the BPF scheduler is being loaded or @cgrp created, initialize -+ * @cgrp for sched_ext. This operation may block. -+ * -+ * Return 0 for success, -errno for failure. An error return while -+ * loading will abort loading of the BPF scheduler. During cgroup -+ * creation, it will abort the specific cgroup creation. -+ */ -+ s32 (*cgroup_init)(struct cgroup *cgrp, -+ struct scx_cgroup_init_args *args); -+ -+ /** -+ * cgroup_exit - Exit a cgroup -+ * @cgrp: cgroup being exited -+ * -+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit -+ * @cgrp for sched_ext. This operation my block. -+ */ -+ void (*cgroup_exit)(struct cgroup *cgrp); -+ -+ /** -+ * cgroup_prep_move - Prepare a task to be moved to a different cgroup -+ * @p: task being moved -+ * @from: cgroup @p is being moved from -+ * @to: cgroup @p is being moved to -+ * -+ * Prepare @p for move from cgroup @from to @to. This operation may -+ * block and can be used for allocations. 
-+ * -+ * Return 0 for success, -errno for failure. An error return aborts the -+ * migration. -+ */ -+ s32 (*cgroup_prep_move)(struct task_struct *p, -+ struct cgroup *from, struct cgroup *to); -+ -+ /** -+ * cgroup_move - Commit cgroup move -+ * @p: task being moved -+ * @from: cgroup @p is being moved from -+ * @to: cgroup @p is being moved to -+ * -+ * Commit the move. @p is dequeued during this operation. -+ */ -+ void (*cgroup_move)(struct task_struct *p, -+ struct cgroup *from, struct cgroup *to); -+ -+ /** -+ * cgroup_cancel_move - Cancel cgroup move -+ * @p: task whose cgroup move is being canceled -+ * @from: cgroup @p was being moved from -+ * @to: cgroup @p was being moved to -+ * -+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). -+ * Undo the preparation. -+ */ -+ void (*cgroup_cancel_move)(struct task_struct *p, -+ struct cgroup *from, struct cgroup *to); -+ -+ /** -+ * cgroup_set_weight - A cgroup's weight is being changed -+ * @cgrp: cgroup whose weight is being updated -+ * @weight: new weight [1..10000] -+ * -+ * Update @tg's weight to @weight. -+ */ -+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); -+#endif /* CONFIG_CGROUPS */ -+ -+ /* -+ * All online ops must come before ops.cpu_online(). -+ */ -+ -+ /** -+ * cpu_online - A CPU became online -+ * @cpu: CPU which just came up -+ * -+ * @cpu just came online. @cpu will not call ops.enqueue() or -+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand. -+ */ -+ void (*cpu_online)(s32 cpu); -+ -+ /** -+ * cpu_offline - A CPU is going offline -+ * @cpu: CPU which is going offline -+ * -+ * @cpu is going offline. @cpu will not call ops.enqueue() or -+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards. -+ */ -+ void (*cpu_offline)(s32 cpu); -+ -+ /* -+ * All CPU hotplug ops must come before ops.init(). 
-+ */ -+ -+ /** -+ * init - Initialize the BPF scheduler -+ */ -+ s32 (*init)(void); -+ -+ /** -+ * exit - Clean up after the BPF scheduler -+ * @info: Exit info -+ */ -+ void (*exit)(struct scx_exit_info *info); -+ -+ /** -+ * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch -+ */ -+ u32 dispatch_max_batch; -+ -+ /** -+ * flags - %SCX_OPS_* flags -+ */ -+ u64 flags; -+ -+ /** -+ * timeout_ms - The maximum amount of time, in milliseconds, that a -+ * runnable task should be able to wait before being scheduled. The -+ * maximum timeout may not exceed the default timeout of 30 seconds. -+ * -+ * Defaults to the maximum allowed timeout value of 30 seconds. -+ */ -+ u32 timeout_ms; -+ -+ /** -+ * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default -+ * value of 32768 is used. -+ */ -+ u32 exit_dump_len; -+ -+ /** -+ * hotplug_seq - A sequence number that may be set by the scheduler to -+ * detect when a hotplug event has occurred during the loading process. -+ * If 0, no detection occurs. Otherwise, the scheduler will fail to -+ * load if the sequence number does not match @scx_hotplug_seq on the -+ * enable path. -+ */ -+ u64 hotplug_seq; -+ -+ /** -+ * name - BPF scheduler's name -+ * -+ * Must be a non-zero valid BPF object name including only isalnum(), -+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the -+ * BPF scheduler is enabled. 
-+ */ -+ char name[SCX_OPS_NAME_LEN]; -+}; -+ -+enum scx_opi { -+ SCX_OPI_BEGIN = 0, -+ SCX_OPI_NORMAL_BEGIN = 0, -+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), -+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), -+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), -+ SCX_OPI_END = SCX_OP_IDX(init), -+}; -+ -+enum scx_wake_flags { -+ /* expose select WF_* flags as enums */ -+ SCX_WAKE_FORK = WF_FORK, -+ SCX_WAKE_TTWU = WF_TTWU, -+ SCX_WAKE_SYNC = WF_SYNC, -+}; -+ -+enum scx_enq_flags { -+ /* expose select ENQUEUE_* flags as enums */ -+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, -+ SCX_ENQ_HEAD = ENQUEUE_HEAD, -+ -+ /* high 32bits are SCX specific */ -+ -+ /* -+ * Set the following to trigger preemption when calling -+ * scx_bpf_dispatch() with a local dsq as the target. The slice of the -+ * current task is cleared to zero and the CPU is kicked into the -+ * scheduling path. Implies %SCX_ENQ_HEAD. -+ */ -+ SCX_ENQ_PREEMPT = 1LLU << 32, -+ -+ /* -+ * The task being enqueued was previously enqueued on the current CPU's -+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the -+ * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was -+ * invoked in a ->cpu_release() callback, and the task is again -+ * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the -+ * task will not be scheduled on the CPU until at least the next invocation -+ * of the ->cpu_acquire() callback. -+ */ -+ SCX_ENQ_REENQ = 1LLU << 40, -+ -+ /* -+ * The task being enqueued is the only task available for the cpu. By -+ * default, ext core keeps executing such tasks but when -+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the -+ * %SCX_ENQ_LAST flag set. -+ * -+ * If the BPF scheduler wants to continue executing the task, -+ * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. -+ * If the task gets queued on a different dsq or the BPF side, the BPF -+ * scheduler is responsible for triggering a follow-up scheduling event. 
-+ * Otherwise, Execution may stall. -+ */ -+ SCX_ENQ_LAST = 1LLU << 41, -+ -+ /* high 8 bits are internal */ -+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, -+ -+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56, -+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57, -+}; -+ -+enum scx_deq_flags { -+ /* expose select DEQUEUE_* flags as enums */ -+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP, -+ -+ /* high 32bits are SCX specific */ -+ -+ /* -+ * The generic core-sched layer decided to execute the task even though -+ * it hasn't been dispatched yet. Dequeue from the BPF side. -+ */ -+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, -+}; -+ -+enum scx_pick_idle_cpu_flags { -+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ -+}; -+ -+enum scx_kick_flags { -+ /* -+ * Kick the target CPU if idle. Guarantees that the target CPU goes -+ * through at least one full scheduling cycle before going idle. If the -+ * target CPU can be determined to be currently not idle and going to go -+ * through a scheduling cycle before going idle, noop. -+ */ -+ SCX_KICK_IDLE = 1LLU << 0, -+ -+ /* -+ * Preempt the current task and execute the dispatch path. If the -+ * current task of the target CPU is an SCX task, its ->scx.slice is -+ * cleared to zero before the scheduling path is invoked so that the -+ * task expires and the dispatch path is invoked. -+ */ -+ SCX_KICK_PREEMPT = 1LLU << 1, -+ -+ /* -+ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will -+ * return after the target CPU finishes picking the next task. 
-+ */ -+ SCX_KICK_WAIT = 1LLU << 2, -+}; -+ -+enum scx_tg_flags { -+ SCX_TG_ONLINE = 1U << 0, -+ SCX_TG_INITED = 1U << 1, -+}; -+ -+enum scx_ops_enable_state { -+ SCX_OPS_ENABLING, -+ SCX_OPS_ENABLED, -+ SCX_OPS_DISABLING, -+ SCX_OPS_DISABLED, -+}; -+ -+static const char *scx_ops_enable_state_str[] = { -+ [SCX_OPS_ENABLING] = "enabling", -+ [SCX_OPS_ENABLED] = "enabled", -+ [SCX_OPS_DISABLING] = "disabling", -+ [SCX_OPS_DISABLED] = "disabled", -+}; -+ -+/* -+ * sched_ext_entity->ops_state -+ * -+ * Used to track the task ownership between the SCX core and the BPF scheduler. -+ * State transitions look as follows: -+ * -+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING -+ * ^ | | -+ * | v v -+ * \-------------------------------/ -+ * -+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call -+ * sites for explanations on the conditions being waited upon and why they are -+ * safe. Transitions out of them into NONE or QUEUED must store_release and the -+ * waiters should load_acquire. -+ * -+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether -+ * any given task can be dispatched by the BPF scheduler at all times and thus -+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler -+ * to try to dispatch any task anytime regardless of its state as the SCX core -+ * can safely reject invalid dispatches. -+ */ -+enum scx_ops_state { -+ SCX_OPSS_NONE, /* owned by the SCX core */ -+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ -+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ -+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ -+ -+ /* -+ * QSEQ brands each QUEUED instance so that, when dispatch races -+ * dequeue/requeue, the dispatcher can tell whether it still has a claim -+ * on the task being dispatched. -+ * -+ * As some 32bit archs can't do 64bit store_release/load_acquire, -+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on -+ * 32bit machines. 
The dispatch race window QSEQ protects is very narrow -+ * and runs with IRQ disabled. 30 bits should be sufficient. -+ */ -+ SCX_OPSS_QSEQ_SHIFT = 2, -+}; -+ -+/* Use macros to ensure that the type is unsigned long for the masks */ -+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) -+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) -+ -+/* -+ * During exit, a task may schedule after losing its PIDs. When disabling the -+ * BPF scheduler, we need to be able to iterate tasks in every state to -+ * guarantee system safety. Maintain a dedicated task list which contains every -+ * task between its fork and eventual free. -+ */ -+static DEFINE_SPINLOCK(scx_tasks_lock); -+static LIST_HEAD(scx_tasks); -+ -+/* ops enable/disable */ -+static struct kthread_worker *scx_ops_helper; -+static DEFINE_MUTEX(scx_ops_enable_mutex); -+DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); -+DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); -+static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); -+static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); -+static bool scx_ops_init_task_enabled; -+static bool scx_switching_all; -+DEFINE_STATIC_KEY_FALSE(__scx_switched_all); -+ -+static struct sched_ext_ops scx_ops; -+static bool scx_warned_zero_slice; -+ -+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); -+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); -+static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); -+ -+static struct static_key_false scx_has_op[SCX_OPI_END] = -+ { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; -+ -+static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); -+static struct scx_exit_info *scx_exit_info; -+ -+static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); -+static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); -+ -+/* -+ * A monotically increasing sequence number that is incremented every time a -+ * scheduler is enabled. 
This can be used by to check if any custom sched_ext -+ * scheduler has ever been used in the system. -+ */ -+static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); -+ -+/* -+ * The maximum amount of time in jiffies that a task may be runnable without -+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger -+ * scx_ops_error(). -+ */ -+static unsigned long scx_watchdog_timeout; -+ -+/* -+ * The last time the delayed work was run. This delayed work relies on -+ * ksoftirqd being able to run to service timer interrupts, so it's possible -+ * that this work itself could get wedged. To account for this, we check that -+ * it's not stalled in the timer tick, and trigger an error if it is. -+ */ -+static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; -+ -+static struct delayed_work scx_watchdog_work; -+ -+/* idle tracking */ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_CPUMASK_OFFSTACK -+#define CL_ALIGNED_IF_ONSTACK -+#else -+#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -+#endif -+ -+static struct { -+ cpumask_var_t cpu; -+ cpumask_var_t smt; -+} idle_masks CL_ALIGNED_IF_ONSTACK; -+ -+#endif /* CONFIG_SMP */ -+ -+/* for %SCX_KICK_WAIT */ -+static unsigned long __percpu *scx_kick_cpus_pnt_seqs; -+ -+/* -+ * Direct dispatch marker. -+ * -+ * Non-NULL values are used for direct dispatch from enqueue path. A valid -+ * pointer points to the task currently being enqueued. An ERR_PTR value is used -+ * to indicate that direct dispatch has already happened. -+ */ -+static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -+ -+/* -+ * Dispatch queues. -+ * -+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is -+ * to avoid live-locking in bypass mode where all tasks are dispatched to -+ * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't -+ * sufficient, it can be further split. 
-+ */ -+static struct scx_dispatch_q **global_dsqs; -+ -+static const struct rhashtable_params dsq_hash_params = { -+ .key_len = 8, -+ .key_offset = offsetof(struct scx_dispatch_q, id), -+ .head_offset = offsetof(struct scx_dispatch_q, hash_node), -+}; -+ -+static struct rhashtable dsq_hash; -+static LLIST_HEAD(dsqs_to_free); -+ -+/* dispatch buf */ -+struct scx_dsp_buf_ent { -+ struct task_struct *task; -+ unsigned long qseq; -+ u64 dsq_id; -+ u64 enq_flags; -+}; -+ -+static u32 scx_dsp_max_batch; -+ -+struct scx_dsp_ctx { -+ struct rq *rq; -+ u32 cursor; -+ u32 nr_tasks; -+ struct scx_dsp_buf_ent buf[]; -+}; -+ -+static struct scx_dsp_ctx __percpu *scx_dsp_ctx; -+ -+/* string formatting from BPF */ -+struct scx_bstr_buf { -+ u64 data[MAX_BPRINTF_VARARGS]; -+ char line[SCX_EXIT_MSG_LEN]; -+}; -+ -+static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); -+static struct scx_bstr_buf scx_exit_bstr_buf; -+ -+/* ops debug dump */ -+struct scx_dump_data { -+ s32 cpu; -+ bool first; -+ s32 cursor; -+ struct seq_buf *s; -+ const char *prefix; -+ struct scx_bstr_buf buf; -+}; -+ -+static struct scx_dump_data scx_dump_data = { -+ .cpu = -1, -+}; -+ -+/* /sys/kernel/sched_ext interface */ -+static struct kset *scx_kset; -+static struct kobject *scx_root_kobj; -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+static void process_ddsp_deferred_locals(struct rq *rq); -+static void scx_bpf_kick_cpu(s32 cpu, u64 flags); -+static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, -+ s64 exit_code, -+ const char *fmt, ...); -+ -+#define scx_ops_error_kind(err, fmt, args...) \ -+ scx_ops_exit_kind((err), 0, fmt, ##args) -+ -+#define scx_ops_exit(code, fmt, args...) \ -+ scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args) -+ -+#define scx_ops_error(fmt, args...) 
\ -+ scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) -+ -+#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) -+ -+static long jiffies_delta_msecs(unsigned long at, unsigned long now) -+{ -+ if (time_after(at, now)) -+ return jiffies_to_msecs(at - now); -+ else -+ return -(long)jiffies_to_msecs(now - at); -+} -+ -+/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ -+static u32 higher_bits(u32 flags) -+{ -+ return ~((1 << fls(flags)) - 1); -+} -+ -+/* return the mask with only the highest bit set */ -+static u32 highest_bit(u32 flags) -+{ -+ int bit = fls(flags); -+ return ((u64)1 << bit) >> 1; -+} -+ -+static bool u32_before(u32 a, u32 b) -+{ -+ return (s32)(a - b) < 0; -+} -+ -+static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) -+{ -+ return global_dsqs[cpu_to_node(task_cpu(p))]; -+} -+ -+static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) -+{ -+ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); -+} -+ -+/* -+ * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX -+ * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate -+ * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check -+ * whether it's running from an allowed context. -+ * -+ * @mask is constant, always inline to cull the mask calculations. -+ */ -+static __always_inline void scx_kf_allow(u32 mask) -+{ -+ /* nesting is allowed only in increasing scx_kf_mask order */ -+ WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, -+ "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", -+ current->scx.kf_mask, mask); -+ current->scx.kf_mask |= mask; -+ barrier(); -+} -+ -+static void scx_kf_disallow(u32 mask) -+{ -+ barrier(); -+ current->scx.kf_mask &= ~mask; -+} -+ -+#define SCX_CALL_OP(mask, op, args...) 
/*
 * SCX_CALL_OP_RET - Invoke scx_ops.@op(@args) and evaluate to its return value
 *
 * When @mask is non-zero the call is bracketed by scx_kf_allow(@mask) and
 * scx_kf_disallow(@mask) so that @mask is set in current->scx.kf_mask only
 * while the operation runs. When @mask is zero the operation is called
 * directly and kf_mask is left untouched.
 *
 * The result type is derived from the operation itself with __typeof__, so
 * this works for any op that returns a value.
 */
#define SCX_CALL_OP_RET(mask, op, args...)					\
({										\
	__typeof__(scx_ops.op(args)) __ret;					\
	if (mask) {								\
		scx_kf_allow(mask);						\
		__ret = scx_ops.op(args);					\
		scx_kf_disallow(mask);						\
	} else {								\
		__ret = scx_ops.op(args);					\
	}									\
	__ret;									\
})
\ -+({ \ -+ __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ -+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task0; \ -+ current->scx.kf_tasks[1] = task1; \ -+ __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+ current->scx.kf_tasks[1] = NULL; \ -+ __ret; \ -+}) -+ -+/* @mask is constant, always inline to cull unnecessary branches */ -+static __always_inline bool scx_kf_allowed(u32 mask) -+{ -+ if (unlikely(!(current->scx.kf_mask & mask))) { -+ scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", -+ mask, current->scx.kf_mask); -+ return false; -+ } -+ -+ /* -+ * Enforce nesting boundaries. e.g. A kfunc which can be called from -+ * DISPATCH must not be called if we're running DEQUEUE which is nested -+ * inside ops.dispatch(). We don't need to check boundaries for any -+ * blocking kfuncs as the verifier ensures they're only called from -+ * sleepable progs. -+ */ -+ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && -+ (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { -+ scx_ops_error("cpu_release kfunc called from a nested operation"); -+ return false; -+ } -+ -+ if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && -+ (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { -+ scx_ops_error("dispatch kfunc called from a nested operation"); -+ return false; -+ } -+ -+ return true; -+} -+ -+/* see SCX_CALL_OP_TASK() */ -+static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, -+ struct task_struct *p) -+{ -+ if (!scx_kf_allowed(mask)) -+ return false; -+ -+ if (unlikely((p != current->scx.kf_tasks[0] && -+ p != current->scx.kf_tasks[1]))) { -+ scx_ops_error("called on a task not being operated on"); -+ return false; -+ } -+ -+ return true; -+} -+ -+static bool scx_kf_allowed_if_unlocked(void) -+{ -+ return !current->scx.kf_mask; -+} -+ -+/** -+ * nldsq_next_task - Iterate to the next task in a non-local DSQ -+ * @dsq: user dsq 
being interated -+ * @cur: current position, %NULL to start iteration -+ * @rev: walk backwards -+ * -+ * Returns %NULL when iteration is finished. -+ */ -+static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, -+ struct task_struct *cur, bool rev) -+{ -+ struct list_head *list_node; -+ struct scx_dsq_list_node *dsq_lnode; -+ -+ lockdep_assert_held(&dsq->lock); -+ -+ if (cur) -+ list_node = &cur->scx.dsq_list.node; -+ else -+ list_node = &dsq->list; -+ -+ /* find the next task, need to skip BPF iteration cursors */ -+ do { -+ if (rev) -+ list_node = list_node->prev; -+ else -+ list_node = list_node->next; -+ -+ if (list_node == &dsq->list) -+ return NULL; -+ -+ dsq_lnode = container_of(list_node, struct scx_dsq_list_node, -+ node); -+ } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); -+ -+ return container_of(dsq_lnode, struct task_struct, scx.dsq_list); -+} -+ -+#define nldsq_for_each_task(p, dsq) \ -+ for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ -+ (p) = nldsq_next_task((dsq), (p), false)) -+ -+ -+/* -+ * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] -+ * dispatch order. BPF-visible iterator is opaque and larger to allow future -+ * changes without breaking backward compatibility. Can be used with -+ * bpf_for_each(). See bpf_iter_scx_dsq_*(). -+ */ -+enum scx_dsq_iter_flags { -+ /* iterate in the reverse dispatch order */ -+ SCX_DSQ_ITER_REV = 1U << 16, -+ -+ __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, -+ __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, -+ -+ __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, -+ __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | -+ __SCX_DSQ_ITER_HAS_SLICE | -+ __SCX_DSQ_ITER_HAS_VTIME, -+}; -+ -+struct bpf_iter_scx_dsq_kern { -+ struct scx_dsq_list_node cursor; -+ struct scx_dispatch_q *dsq; -+ u64 slice; -+ u64 vtime; -+} __attribute__((aligned(8))); -+ -+struct bpf_iter_scx_dsq { -+ u64 __opaque[6]; -+} __attribute__((aligned(8))); -+ -+ -+/* -+ * SCX task iterator. 
-+ */ -+struct scx_task_iter { -+ struct sched_ext_entity cursor; -+ struct task_struct *locked; -+ struct rq *rq; -+ struct rq_flags rf; -+ u32 cnt; -+}; -+ -+/** -+ * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration -+ * @iter: iterator to init -+ * -+ * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter -+ * must eventually be stopped with scx_task_iter_stop(). -+ * -+ * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() -+ * between this and the first next() call or between any two next() calls. If -+ * the locks are released between two next() calls, the caller is responsible -+ * for ensuring that the task being iterated remains accessible either through -+ * RCU read lock or obtaining a reference count. -+ * -+ * All tasks which existed when the iteration started are guaranteed to be -+ * visited as long as they still exist. -+ */ -+static void scx_task_iter_start(struct scx_task_iter *iter) -+{ -+ BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & -+ ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); -+ -+ spin_lock_irq(&scx_tasks_lock); -+ -+ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; -+ list_add(&iter->cursor.tasks_node, &scx_tasks); -+ iter->locked = NULL; -+ iter->cnt = 0; -+} -+ -+static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) -+{ -+ if (iter->locked) { -+ task_rq_unlock(iter->rq, iter->locked, &iter->rf); -+ iter->locked = NULL; -+ } -+} -+ -+/** -+ * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator -+ * @iter: iterator to unlock -+ * -+ * If @iter is in the middle of a locked iteration, it may be locking the rq of -+ * the task currently being visited in addition to scx_tasks_lock. Unlock both. -+ * This function can be safely called anytime during an iteration. 
-+ */ -+static void scx_task_iter_unlock(struct scx_task_iter *iter) -+{ -+ __scx_task_iter_rq_unlock(iter); -+ spin_unlock_irq(&scx_tasks_lock); -+} -+ -+/** -+ * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() -+ * @iter: iterator to re-lock -+ * -+ * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it -+ * doesn't re-lock the rq lock. Must be called before other iterator operations. -+ */ -+static void scx_task_iter_relock(struct scx_task_iter *iter) -+{ -+ spin_lock_irq(&scx_tasks_lock); -+} -+ -+/** -+ * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock -+ * @iter: iterator to exit -+ * -+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held -+ * which is released on return. If the iterator holds a task's rq lock, that rq -+ * lock is also released. See scx_task_iter_start() for details. -+ */ -+static void scx_task_iter_stop(struct scx_task_iter *iter) -+{ -+ list_del_init(&iter->cursor.tasks_node); -+ scx_task_iter_unlock(iter); -+} -+ -+/** -+ * scx_task_iter_next - Next task -+ * @iter: iterator to walk -+ * -+ * Visit the next task. See scx_task_iter_start() for details. Locks are dropped -+ * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing -+ * stalls by holding scx_tasks_lock for too long. 
/**
 * scx_task_iter_next_locked - Next non-idle task with its rq locked
 * @iter: iterator to walk
 *
 * Release the rq lock held for the previously returned task, if any, then
 * advance to the next task which is not on idle_sched_class and return it
 * with its rq locked. The locked rq and task are recorded in @iter so that
 * the next call, scx_task_iter_unlock() or scx_task_iter_stop() can drop the
 * lock. See scx_task_iter_start() for details.
 *
 * Returns %NULL when there are no more tasks, in which case no rq lock is
 * held.
 */
static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
{
	struct task_struct *p;

	__scx_task_iter_rq_unlock(iter);

	while ((p = scx_task_iter_next(iter))) {
		/*
		 * scx_task_iter is used to prepare and move tasks into SCX
		 * while loading the BPF scheduler and vice-versa while
		 * unloading. The init_tasks ("swappers") should be excluded
		 * from the iteration because:
		 *
		 * - It's unsafe to use __setscheduler_prio() on an init_task to
		 *   determine the sched_class to use as it won't preserve its
		 *   idle_sched_class.
		 *
		 * - ops.init/exit_task() can easily be confused if called with
		 *   init_tasks as they, e.g., share PID 0.
		 *
		 * As init_tasks are never scheduled through SCX, they can be
		 * skipped safely. Note that is_idle_task() which tests %PF_IDLE
		 * doesn't work here:
		 *
		 * - %PF_IDLE may not be set for an init_task whose CPU hasn't
		 *   yet been onlined.
		 *
		 * - %PF_IDLE can be set on tasks that are not init_tasks. See
		 *   play_idle_precise() used by CONFIG_IDLE_INJECT.
		 *
		 * Test for idle_sched_class as only init_tasks are on it.
		 */
		if (p->sched_class != &idle_sched_class)
			break;
	}
	if (!p)
		return NULL;

	iter->rq = task_rq_lock(p, &iter->rf);
	iter->locked = p;

	return p;
}
-+ */ -+static bool ops_cpu_valid(s32 cpu, const char *where) -+{ -+ if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { -+ return true; -+ } else { -+ scx_ops_error("invalid CPU %d%s%s", cpu, -+ where ? " " : "", where ?: ""); -+ return false; -+ } -+} -+ -+/** -+ * ops_sanitize_err - Sanitize a -errno value -+ * @ops_name: operation to blame on failure -+ * @err: -errno value to sanitize -+ * -+ * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return -+ * -%EPROTO. This is necessary because returning a rogue -errno up the chain can -+ * cause misbehaviors. For an example, a large negative return from -+ * ops.init_task() triggers an oops when passed up the call chain because the -+ * value fails IS_ERR() test after being encoded with ERR_PTR() and then is -+ * handled as a pointer. -+ */ -+static int ops_sanitize_err(const char *ops_name, s32 err) -+{ -+ if (err < 0 && err >= -MAX_ERRNO) -+ return err; -+ -+ scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); -+ return -EPROTO; -+} -+ -+static void run_deferred(struct rq *rq) -+{ -+ process_ddsp_deferred_locals(rq); -+} -+ -+#ifdef CONFIG_SMP -+static void deferred_bal_cb_workfn(struct rq *rq) -+{ -+ run_deferred(rq); -+} -+#endif -+ -+static void deferred_irq_workfn(struct irq_work *irq_work) -+{ -+ struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work); -+ -+ raw_spin_rq_lock(rq); -+ run_deferred(rq); -+ raw_spin_rq_unlock(rq); -+} -+ -+/** -+ * schedule_deferred - Schedule execution of deferred actions on an rq -+ * @rq: target rq -+ * -+ * Schedule execution of deferred actions on @rq. Must be called with @rq -+ * locked. Deferred actions are executed with @rq locked but unpinned, and thus -+ * can unlock @rq to e.g. migrate tasks to other rqs. 
/**
 * schedule_deferred - Schedule execution of deferred actions on an rq
 * @rq: target rq
 *
 * Schedule execution of deferred actions on @rq. Must be called with @rq
 * locked. Deferred actions are executed with @rq locked but unpinned, and thus
 * can unlock @rq to e.g. migrate tasks to other rqs.
 *
 * On SMP one of two scheduler hooks is preferred: nothing is queued while a
 * wakeup is in progress, and a balance callback is queued while in balance.
 * In every other case, and always on !SMP, an irq_work is queued.
 */
static void schedule_deferred(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

#ifdef CONFIG_SMP
	/*
	 * If in the middle of waking up a task, task_woken_scx() will be called
	 * afterwards which will then run the deferred actions, no need to
	 * schedule anything.
	 */
	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
		return;

	/*
	 * If in balance, the balance callbacks will be called before rq lock is
	 * released. Schedule one.
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
		/* deferred_bal_cb_workfn() simply calls run_deferred(@rq) */
		queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
				       deferred_bal_cb_workfn);
		return;
	}
#endif
	/*
	 * No scheduler hooks available. Queue an irq work. They are executed on
	 * IRQ re-enable which may take a bit longer than the scheduler hooks.
	 * The above WAKEUP and BALANCE paths should cover most of the cases and
	 * the time to IRQ re-enable shouldn't be long.
	 *
	 * NOTE(review): presumably this irq_work is initialized with
	 * deferred_irq_workfn(), which locks @rq around run_deferred() -
	 * the initialization is not visible here, confirm.
	 */
	irq_work_queue(&rq->scx.deferred_irq_work);
}
/*
 * dispatch_enqueue - Insert a task into a dispatch queue
 * @dsq: target DSQ
 * @p: task to insert, must not currently be on any DSQ
 * @enq_flags: %SCX_ENQ_* flags controlling placement and ops_state handling
 *
 * A DSQ whose id is %SCX_DSQ_LOCAL is manipulated without taking @dsq->lock;
 * any other DSQ is locked for the duration of the insertion. If the target
 * turns out to be destroyed (%SCX_DSQ_INVALID), an ops error is raised and @p
 * is put on the global DSQ instead.
 *
 * With %SCX_ENQ_DSQ_PRIQ, @p is added to both the vtime-ordered rbtree and, at
 * the matching position, the list. Otherwise @p goes to the head of the list
 * for %SCX_ENQ_HEAD / %SCX_ENQ_PREEMPT and to the tail by default. Mixing the
 * two modes on one DSQ, or using PRIQ on a built-in DSQ, raises an ops error.
 *
 * For a local DSQ, the owning rq's current task is rescheduled if @p preempts
 * it or if it belongs to a sched_class below ext_sched_class.
 */
static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
			     u64 enq_flags)
{
	/* sampled once up front; the destroyed-DSQ fallback below is non-local */
	bool is_local = dsq->id == SCX_DSQ_LOCAL;

	WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
	WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
		     !RB_EMPTY_NODE(&p->scx.dsq_priq));

	if (!is_local) {
		raw_spin_lock(&dsq->lock);
		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
			scx_ops_error("attempting to dispatch to a destroyed dsq");
			/* fall back to the global dsq */
			raw_spin_unlock(&dsq->lock);
			dsq = find_global_dsq(p);
			raw_spin_lock(&dsq->lock);
		}
	}

	if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
		     (enq_flags & SCX_ENQ_DSQ_PRIQ))) {
		/*
		 * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from
		 * their FIFO queues. To avoid confusion and accidentally
		 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
		 * disallow any internal DSQ from doing vtime ordering of
		 * tasks.
		 */
		scx_ops_error("cannot use vtime ordering for built-in DSQs");
		enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
	}

	if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
		struct rb_node *rbp;

		/*
		 * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are
		 * linked to both the rbtree and list on PRIQs, this can only be
		 * tested easily when adding the first task.
		 */
		if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
			     nldsq_next_task(dsq, NULL, false)))
			scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
				      dsq->id);

		p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
		rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);

		/*
		 * Find the previous task and insert after it on the list so
		 * that @dsq->list is vtime ordered.
		 */
		rbp = rb_prev(&p->scx.dsq_priq);
		if (rbp) {
			struct task_struct *prev =
				container_of(rbp, struct task_struct,
					     scx.dsq_priq);
			list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
		} else {
			/* no predecessor in the rbtree, @p leads the list */
			list_add(&p->scx.dsq_list.node, &dsq->list);
		}
	} else {
		/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
		if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
			scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
				      dsq->id);

		if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
			list_add(&p->scx.dsq_list.node, &dsq->list);
		else
			list_add_tail(&p->scx.dsq_list.node, &dsq->list);
	}

	/* seq records the order tasks are queued, used by BPF DSQ iterator */
	dsq->seq++;
	p->scx.dsq_seq = dsq->seq;

	dsq_mod_nr(dsq, 1);
	p->scx.dsq = dsq;

	/*
	 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
	 * direct dispatch path, but we clear them here because the direct
	 * dispatch verdict may be overridden on the enqueue path during e.g.
	 * bypass.
	 */
	p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
	p->scx.ddsp_enq_flags = 0;

	/*
	 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
	 * match waiters' load_acquire.
	 */
	if (enq_flags & SCX_ENQ_CLEAR_OPSS)
		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);

	if (is_local) {
		/* a local DSQ is embedded in its rq, recover the owner */
		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
		bool preempt = false;

		/* PREEMPT ends the slice of a different, SCX-class current task */
		if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
		    rq->curr->sched_class == &ext_sched_class) {
			rq->curr->scx.slice = 0;
			preempt = true;
		}

		if (preempt || sched_class_above(&ext_sched_class,
						 rq->curr->sched_class))
			resched_curr(rq);
	} else {
		raw_spin_unlock(&dsq->lock);
	}
}
-+ */ -+ if (p->scx.holding_cpu < 0) { -+ /* @p must still be on @dsq, dequeue */ -+ task_unlink_from_dsq(p, dsq); -+ } else { -+ /* -+ * We're racing against dispatch_to_local_dsq() which already -+ * removed @p from @dsq and set @p->scx.holding_cpu. Clear the -+ * holding_cpu which tells dispatch_to_local_dsq() that it lost -+ * the race. -+ */ -+ WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node)); -+ p->scx.holding_cpu = -1; -+ } -+ p->scx.dsq = NULL; -+ -+ if (!is_local) -+ raw_spin_unlock(&dsq->lock); -+} -+ -+static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, -+ struct task_struct *p) -+{ -+ struct scx_dispatch_q *dsq; -+ -+ if (dsq_id == SCX_DSQ_LOCAL) -+ return &rq->scx.local_dsq; -+ -+ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) -+ return find_global_dsq(p); -+ -+ return &cpu_rq(cpu)->scx.local_dsq; -+ } -+ -+ if (dsq_id == SCX_DSQ_GLOBAL) -+ dsq = find_global_dsq(p); -+ else -+ dsq = find_user_dsq(dsq_id); -+ -+ if (unlikely(!dsq)) { -+ scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", -+ dsq_id, p->comm, p->pid); -+ return find_global_dsq(p); -+ } -+ -+ return dsq; -+} -+ -+static void mark_direct_dispatch(struct task_struct *ddsp_task, -+ struct task_struct *p, u64 dsq_id, -+ u64 enq_flags) -+{ -+ /* -+ * Mark that dispatch already happened from ops.select_cpu() or -+ * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value -+ * which can never match a valid task pointer. 
-+ */ -+ __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); -+ -+ /* @p must match the task on the enqueue path */ -+ if (unlikely(p != ddsp_task)) { -+ if (IS_ERR(ddsp_task)) -+ scx_ops_error("%s[%d] already direct-dispatched", -+ p->comm, p->pid); -+ else -+ scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", -+ ddsp_task->comm, ddsp_task->pid, -+ p->comm, p->pid); -+ return; -+ } -+ -+ WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); -+ WARN_ON_ONCE(p->scx.ddsp_enq_flags); -+ -+ p->scx.ddsp_dsq_id = dsq_id; -+ p->scx.ddsp_enq_flags = enq_flags; -+} -+ -+static void direct_dispatch(struct task_struct *p, u64 enq_flags) -+{ -+ struct rq *rq = task_rq(p); -+ struct scx_dispatch_q *dsq = -+ find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); -+ -+ touch_core_sched_dispatch(rq, p); -+ -+ p->scx.ddsp_enq_flags |= enq_flags; -+ -+ /* -+ * We are in the enqueue path with @rq locked and pinned, and thus can't -+ * double lock a remote rq and enqueue to its local DSQ. For -+ * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer -+ * the enqueue so that it's executed when @rq can be unlocked. -+ */ -+ if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { -+ unsigned long opss; -+ -+ opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; -+ -+ switch (opss & SCX_OPSS_STATE_MASK) { -+ case SCX_OPSS_NONE: -+ break; -+ case SCX_OPSS_QUEUEING: -+ /* -+ * As @p was never passed to the BPF side, _release is -+ * not strictly necessary. Still do it for consistency. 
-+ */ -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ break; -+ default: -+ WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()", -+ p->comm, p->pid, opss); -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ break; -+ } -+ -+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); -+ list_add_tail(&p->scx.dsq_list.node, -+ &rq->scx.ddsp_deferred_locals); -+ schedule_deferred(rq); -+ return; -+ } -+ -+ dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); -+} -+ -+static bool scx_rq_online(struct rq *rq) -+{ -+ /* -+ * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates -+ * the online state as seen from the BPF scheduler. cpu_active() test -+ * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will -+ * stay set until the current scheduling operation is complete even if -+ * we aren't locking @rq. -+ */ -+ return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); -+} -+ -+static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, -+ int sticky_cpu) -+{ -+ struct task_struct **ddsp_taskp; -+ unsigned long qseq; -+ -+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); -+ -+ /* rq migration */ -+ if (sticky_cpu == cpu_of(rq)) -+ goto local_norefill; -+ -+ /* -+ * If !scx_rq_online(), we already told the BPF scheduler that the CPU -+ * is offline and are just running the hotplug path. Don't bother the -+ * BPF scheduler. 
-+ */ -+ if (!scx_rq_online(rq)) -+ goto local; -+ -+ if (scx_rq_bypassing(rq)) { -+ if (enq_flags & SCX_ENQ_LAST) -+ goto local; -+ else -+ goto global; -+ } -+ -+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) -+ goto direct; -+ -+ /* see %SCX_OPS_ENQ_EXITING */ -+ if (!static_branch_unlikely(&scx_ops_enq_exiting) && -+ unlikely(p->flags & PF_EXITING)) -+ goto local; -+ -+ /* see %SCX_OPS_ENQ_LAST */ -+ if (!static_branch_unlikely(&scx_ops_enq_last) && -+ (enq_flags & SCX_ENQ_LAST)) -+ goto local; -+ -+ if (!SCX_HAS_OP(enqueue)) -+ goto global; -+ -+ /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ -+ qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; -+ -+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); -+ atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); -+ -+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); -+ WARN_ON_ONCE(*ddsp_taskp); -+ *ddsp_taskp = p; -+ -+ SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); -+ -+ *ddsp_taskp = NULL; -+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) -+ goto direct; -+ -+ /* -+ * If not directly dispatched, QUEUEING isn't clear yet and dispatch or -+ * dequeue may be waiting. The store_release matches their load_acquire. -+ */ -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); -+ return; -+ -+direct: -+ direct_dispatch(p, enq_flags); -+ return; -+ -+local: -+ /* -+ * For task-ordering, slice refill must be treated as implying the end -+ * of the current slice. Otherwise, the longer @p stays on the CPU, the -+ * higher priority it becomes from scx_prio_less()'s POV. 
-+ */ -+ touch_core_sched(rq, p); -+ p->scx.slice = SCX_SLICE_DFL; -+local_norefill: -+ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); -+ return; -+ -+global: -+ touch_core_sched(rq, p); /* see the comment in local: */ -+ p->scx.slice = SCX_SLICE_DFL; -+ dispatch_enqueue(find_global_dsq(p), p, enq_flags); -+} -+ -+static bool task_runnable(const struct task_struct *p) -+{ -+ return !list_empty(&p->scx.runnable_node); -+} -+ -+static void set_task_runnable(struct rq *rq, struct task_struct *p) -+{ -+ lockdep_assert_rq_held(rq); -+ -+ if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { -+ p->scx.runnable_at = jiffies; -+ p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; -+ } -+ -+ /* -+ * list_add_tail() must be used. scx_ops_bypass() depends on tasks being -+ * appened to the runnable_list. -+ */ -+ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); -+} -+ -+static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) -+{ -+ list_del_init(&p->scx.runnable_node); -+ if (reset_runnable_at) -+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; -+} -+ -+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) -+{ -+ int sticky_cpu = p->scx.sticky_cpu; -+ -+ if (enq_flags & ENQUEUE_WAKEUP) -+ rq->scx.flags |= SCX_RQ_IN_WAKEUP; -+ -+ enq_flags |= rq->scx.extra_enq_flags; -+ -+ if (sticky_cpu >= 0) -+ p->scx.sticky_cpu = -1; -+ -+ /* -+ * Restoring a running task will be immediately followed by -+ * set_next_task_scx() which expects the task to not be on the BPF -+ * scheduler as tasks can only start running through local DSQs. Force -+ * direct-dispatch into the local DSQ by setting the sticky_cpu. 
-+ */ -+ if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) -+ sticky_cpu = cpu_of(rq); -+ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ WARN_ON_ONCE(!task_runnable(p)); -+ goto out; -+ } -+ -+ set_task_runnable(rq, p); -+ p->scx.flags |= SCX_TASK_QUEUED; -+ rq->scx.nr_running++; -+ add_nr_running(rq, 1); -+ -+ if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); -+ -+ if (enq_flags & SCX_ENQ_WAKEUP) -+ touch_core_sched(rq, p); -+ -+ do_enqueue_task(rq, p, enq_flags, sticky_cpu); -+out: -+ rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; -+} -+ -+static void ops_dequeue(struct task_struct *p, u64 deq_flags) -+{ -+ unsigned long opss; -+ -+ /* dequeue is always temporary, don't reset runnable_at */ -+ clr_task_runnable(p, false); -+ -+ /* acquire ensures that we see the preceding updates on QUEUED */ -+ opss = atomic_long_read_acquire(&p->scx.ops_state); -+ -+ switch (opss & SCX_OPSS_STATE_MASK) { -+ case SCX_OPSS_NONE: -+ break; -+ case SCX_OPSS_QUEUEING: -+ /* -+ * QUEUEING is started and finished while holding @p's rq lock. -+ * As we're holding the rq lock now, we shouldn't see QUEUEING. -+ */ -+ BUG(); -+ case SCX_OPSS_QUEUED: -+ if (SCX_HAS_OP(dequeue)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); -+ -+ if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, -+ SCX_OPSS_NONE)) -+ break; -+ fallthrough; -+ case SCX_OPSS_DISPATCHING: -+ /* -+ * If @p is being dispatched from the BPF scheduler to a DSQ, -+ * wait for the transfer to complete so that @p doesn't get -+ * added to its DSQ after dequeueing is complete. -+ * -+ * As we're waiting on DISPATCHING with the rq locked, the -+ * dispatching side shouldn't try to lock the rq while -+ * DISPATCHING is set. See dispatch_to_local_dsq(). -+ * -+ * DISPATCHING shouldn't have qseq set and control can reach -+ * here with NONE @opss from the above QUEUED case block. -+ * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 
-+ */ -+ wait_ops_state(p, SCX_OPSS_DISPATCHING); -+ BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); -+ break; -+ } -+} -+ -+static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) -+{ -+ if (!(p->scx.flags & SCX_TASK_QUEUED)) { -+ WARN_ON_ONCE(task_runnable(p)); -+ return; -+ } -+ -+ ops_dequeue(p, deq_flags); -+ -+ /* -+ * A currently running task which is going off @rq first gets dequeued -+ * and then stops running. As we want running <-> stopping transitions -+ * to be contained within runnable <-> quiescent transitions, trigger -+ * ->stopping() early here instead of in put_prev_task_scx(). -+ * -+ * @p may go through multiple stopping <-> running transitions between -+ * here and put_prev_task_scx() if task attribute changes occur while -+ * balance_scx() leaves @rq unlocked. However, they don't contain any -+ * information meaningful to the BPF scheduler and can be suppressed by -+ * skipping the callbacks if the task is !QUEUED. -+ */ -+ if (SCX_HAS_OP(stopping) && task_current(rq, p)) { -+ update_curr_scx(rq); -+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); -+ } -+ -+ if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); -+ -+ if (deq_flags & SCX_DEQ_SLEEP) -+ p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; -+ else -+ p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; -+ -+ p->scx.flags &= ~SCX_TASK_QUEUED; -+ rq->scx.nr_running--; -+ sub_nr_running(rq, 1); -+ -+ dispatch_dequeue(rq, p); -+} -+ -+static void yield_task_scx(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (SCX_HAS_OP(yield)) -+ SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); -+ else -+ p->scx.slice = 0; -+} -+ -+static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) -+{ -+ struct task_struct *from = rq->curr; -+ -+ if (SCX_HAS_OP(yield)) -+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); -+ else -+ return false; -+} -+ -+static void 
move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, -+ struct scx_dispatch_q *src_dsq, -+ struct rq *dst_rq) -+{ -+ struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; -+ -+ /* @dsq is locked and @p is on @dst_rq */ -+ lockdep_assert_held(&src_dsq->lock); -+ lockdep_assert_rq_held(dst_rq); -+ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ -+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) -+ list_add(&p->scx.dsq_list.node, &dst_dsq->list); -+ else -+ list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); -+ -+ dsq_mod_nr(dst_dsq, 1); -+ p->scx.dsq = dst_dsq; -+} -+ -+#ifdef CONFIG_SMP -+/** -+ * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ -+ * @p: task to move -+ * @enq_flags: %SCX_ENQ_* -+ * @src_rq: rq to move the task from, locked on entry, released on return -+ * @dst_rq: rq to move the task into, locked on return -+ * -+ * Move @p which is currently on @src_rq to @dst_rq's local DSQ. -+ */ -+static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, -+ struct rq *src_rq, struct rq *dst_rq) -+{ -+ lockdep_assert_rq_held(src_rq); -+ -+ /* the following marks @p MIGRATING which excludes dequeue */ -+ deactivate_task(src_rq, p, 0); -+ set_task_cpu(p, cpu_of(dst_rq)); -+ p->scx.sticky_cpu = cpu_of(dst_rq); -+ -+ raw_spin_rq_unlock(src_rq); -+ raw_spin_rq_lock(dst_rq); -+ -+ /* -+ * We want to pass scx-specific enq_flags but activate_task() will -+ * truncate the upper 32 bit. As we own @rq, we can pass them through -+ * @rq->scx.extra_enq_flags instead. -+ */ -+ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); -+ WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); -+ dst_rq->scx.extra_enq_flags = enq_flags; -+ activate_task(dst_rq, p, 0); -+ dst_rq->scx.extra_enq_flags = 0; -+} -+ -+/* -+ * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two -+ * differences: -+ * -+ * - is_cpu_allowed() asks "Can this task run on this CPU?" 
while -+ * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to -+ * this CPU?". -+ * -+ * While migration is disabled, is_cpu_allowed() has to say "yes" as the task -+ * must be allowed to finish on the CPU that it's currently on regardless of -+ * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the -+ * BPF scheduler shouldn't attempt to migrate a task which has migration -+ * disabled. -+ * -+ * - The BPF scheduler is bypassed while the rq is offline and we can always say -+ * no to the BPF scheduler initiated migrations while offline. -+ */ -+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, -+ bool trigger_error) -+{ -+ int cpu = cpu_of(rq); -+ -+ /* -+ * We don't require the BPF scheduler to avoid dispatching to offline -+ * CPUs mostly for convenience but also because CPUs can go offline -+ * between scx_bpf_dispatch() calls and here. Trigger error iff the -+ * picked CPU is outside the allowed mask. -+ */ -+ if (!task_allowed_on_cpu(p, cpu)) { -+ if (trigger_error) -+ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", -+ cpu_of(rq), p->comm, p->pid); -+ return false; -+ } -+ -+ if (unlikely(is_migration_disabled(p))) -+ return false; -+ -+ if (!scx_rq_online(rq)) -+ return false; -+ -+ return true; -+} -+ -+/** -+ * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq -+ * @p: target task -+ * @dsq: locked DSQ @p is currently on -+ * @src_rq: rq @p is currently on, stable with @dsq locked -+ * -+ * Called with @dsq locked but no rq's locked. We want to move @p to a different -+ * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is -+ * required when transferring into a local DSQ. Even when transferring into a -+ * non-local DSQ, it's better to use the same mechanism to protect against -+ * dequeues and maintain the invariant that @p->scx.dsq can only change while -+ * @src_rq is locked, which e.g. 
scx_dump_task() depends on. -+ * -+ * We want to grab @src_rq but that can deadlock if we try while locking @dsq, -+ * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As -+ * this may race with dequeue, which can't drop the rq lock or fail, do a little -+ * dancing from our side. -+ * -+ * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets -+ * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu -+ * would be cleared to -1. While other cpus may have updated it to different -+ * values afterwards, as this operation can't be preempted or recurse, the -+ * holding_cpu can never become this CPU again before we're done. Thus, we can -+ * tell whether we lost to dequeue by testing whether the holding_cpu still -+ * points to this CPU. See dispatch_dequeue() for the counterpart. -+ * -+ * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is -+ * still valid. %false if lost to dequeue. -+ */ -+static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, -+ struct scx_dispatch_q *dsq, -+ struct rq *src_rq) -+{ -+ s32 cpu = raw_smp_processor_id(); -+ -+ lockdep_assert_held(&dsq->lock); -+ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ task_unlink_from_dsq(p, dsq); -+ p->scx.holding_cpu = cpu; -+ -+ raw_spin_unlock(&dsq->lock); -+ raw_spin_rq_lock(src_rq); -+ -+ /* task_rq couldn't have changed if we're still the holding cpu */ -+ return likely(p->scx.holding_cpu == cpu) && -+ !WARN_ON_ONCE(src_rq != task_rq(p)); -+} -+ -+static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, -+ struct scx_dispatch_q *dsq, struct rq *src_rq) -+{ -+ raw_spin_rq_unlock(this_rq); -+ -+ if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { -+ move_remote_task_to_local_dsq(p, 0, src_rq, this_rq); -+ return true; -+ } else { -+ raw_spin_rq_unlock(src_rq); -+ raw_spin_rq_lock(this_rq); -+ return false; -+ } -+} -+#else /* CONFIG_SMP */ -+static inline void move_remote_task_to_local_dsq(struct 
task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } -+static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } -+static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } -+#endif /* CONFIG_SMP */ -+ -+static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) -+{ -+ struct task_struct *p; -+retry: -+ /* -+ * The caller can't expect to successfully consume a task if the task's -+ * addition to @dsq isn't guaranteed to be visible somehow. Test -+ * @dsq->list without locking and skip if it seems empty. -+ */ -+ if (list_empty(&dsq->list)) -+ return false; -+ -+ raw_spin_lock(&dsq->lock); -+ -+ nldsq_for_each_task(p, dsq) { -+ struct rq *task_rq = task_rq(p); -+ -+ if (rq == task_rq) { -+ task_unlink_from_dsq(p, dsq); -+ move_local_task_to_local_dsq(p, 0, dsq, rq); -+ raw_spin_unlock(&dsq->lock); -+ return true; -+ } -+ -+ if (task_can_run_on_remote_rq(p, rq, false)) { -+ if (likely(consume_remote_task(rq, p, dsq, task_rq))) -+ return true; -+ goto retry; -+ } -+ } -+ -+ raw_spin_unlock(&dsq->lock); -+ return false; -+} -+ -+static bool consume_global_dsq(struct rq *rq) -+{ -+ int node = cpu_to_node(cpu_of(rq)); -+ -+ return consume_dispatch_q(rq, global_dsqs[node]); -+} -+ -+/** -+ * dispatch_to_local_dsq - Dispatch a task to a local dsq -+ * @rq: current rq which is locked -+ * @dst_dsq: destination DSQ -+ * @p: task to dispatch -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local -+ * DSQ. This function performs all the synchronization dancing needed because -+ * local DSQs are protected with rq locks. -+ * -+ * The caller must have exclusive ownership of @p (e.g. through -+ * %SCX_OPSS_DISPATCHING). 
-+ */ -+static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, -+ struct task_struct *p, u64 enq_flags) -+{ -+ struct rq *src_rq = task_rq(p); -+ struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); -+ -+ /* -+ * We're synchronized against dequeue through DISPATCHING. As @p can't -+ * be dequeued, its task_rq and cpus_allowed are stable too. -+ * -+ * If dispatching to @rq that @p is already on, no lock dancing needed. -+ */ -+ if (rq == src_rq && rq == dst_rq) { -+ dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); -+ return; -+ } -+ -+#ifdef CONFIG_SMP -+ if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { -+ dispatch_enqueue(find_global_dsq(p), p, -+ enq_flags | SCX_ENQ_CLEAR_OPSS); -+ return; -+ } -+ -+ /* -+ * @p is on a possibly remote @src_rq which we need to lock to move the -+ * task. If dequeue is in progress, it'd be locking @src_rq and waiting -+ * on DISPATCHING, so we can't grab @src_rq lock while holding -+ * DISPATCHING. -+ * -+ * As DISPATCHING guarantees that @p is wholly ours, we can pretend that -+ * we're moving from a DSQ and use the same mechanism - mark the task -+ * under transfer with holding_cpu, release DISPATCHING and then follow -+ * the same protocol. See unlink_dsq_and_lock_src_rq(). -+ */ -+ p->scx.holding_cpu = raw_smp_processor_id(); -+ -+ /* store_release ensures that dequeue sees the above */ -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ -+ /* switch to @src_rq lock */ -+ if (rq != src_rq) { -+ raw_spin_rq_unlock(rq); -+ raw_spin_rq_lock(src_rq); -+ } -+ -+ /* task_rq couldn't have changed if we're still the holding cpu */ -+ if (likely(p->scx.holding_cpu == raw_smp_processor_id()) && -+ !WARN_ON_ONCE(src_rq != task_rq(p))) { -+ /* -+ * If @p is staying on the same rq, there's no need to go -+ * through the full deactivate/activate cycle. Optimize by -+ * abbreviating move_remote_task_to_local_dsq(). 
-+ */ -+ if (src_rq == dst_rq) { -+ p->scx.holding_cpu = -1; -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags); -+ } else { -+ move_remote_task_to_local_dsq(p, enq_flags, -+ src_rq, dst_rq); -+ } -+ -+ /* if the destination CPU is idle, wake it up */ -+ if (sched_class_above(p->sched_class, dst_rq->curr->sched_class)) -+ resched_curr(dst_rq); -+ } -+ -+ /* switch back to @rq lock */ -+ if (rq != dst_rq) { -+ raw_spin_rq_unlock(dst_rq); -+ raw_spin_rq_lock(rq); -+ } -+#else /* CONFIG_SMP */ -+ BUG(); /* control can not reach here on UP */ -+#endif /* CONFIG_SMP */ -+} -+ -+/** -+ * finish_dispatch - Asynchronously finish dispatching a task -+ * @rq: current rq which is locked -+ * @p: task to finish dispatching -+ * @qseq_at_dispatch: qseq when @p started getting dispatched -+ * @dsq_id: destination DSQ ID -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * Dispatching to local DSQs may need to wait for queueing to complete or -+ * require rq lock dancing. As we don't wanna do either while inside -+ * ops.dispatch() to avoid locking order inversion, we split dispatching into -+ * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the -+ * task and its qseq. Once ops.dispatch() returns, this function is called to -+ * finish up. -+ * -+ * There is no guarantee that @p is still valid for dispatching or even that it -+ * was valid in the first place. Make sure that the task is still owned by the -+ * BPF scheduler and claim the ownership before dispatching. -+ */ -+static void finish_dispatch(struct rq *rq, struct task_struct *p, -+ unsigned long qseq_at_dispatch, -+ u64 dsq_id, u64 enq_flags) -+{ -+ struct scx_dispatch_q *dsq; -+ unsigned long opss; -+ -+ touch_core_sched_dispatch(rq, p); -+retry: -+ /* -+ * No need for _acquire here. @p is accessed only after a successful -+ * try_cmpxchg to DISPATCHING. 
-+ */ -+ opss = atomic_long_read(&p->scx.ops_state); -+ -+ switch (opss & SCX_OPSS_STATE_MASK) { -+ case SCX_OPSS_DISPATCHING: -+ case SCX_OPSS_NONE: -+ /* someone else already got to it */ -+ return; -+ case SCX_OPSS_QUEUED: -+ /* -+ * If qseq doesn't match, @p has gone through at least one -+ * dispatch/dequeue and re-enqueue cycle between -+ * scx_bpf_dispatch() and here and we have no claim on it. -+ */ -+ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) -+ return; -+ -+ /* -+ * While we know @p is accessible, we don't yet have a claim on -+ * it - the BPF scheduler is allowed to dispatch tasks -+ * spuriously and there can be a racing dequeue attempt. Let's -+ * claim @p by atomically transitioning it from QUEUED to -+ * DISPATCHING. -+ */ -+ if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, -+ SCX_OPSS_DISPATCHING))) -+ break; -+ goto retry; -+ case SCX_OPSS_QUEUEING: -+ /* -+ * do_enqueue_task() is in the process of transferring the task -+ * to the BPF scheduler while holding @p's rq lock. As we aren't -+ * holding any kernel or BPF resource that the enqueue path may -+ * depend upon, it's safe to wait. 
-+ */ -+ wait_ops_state(p, opss); -+ goto retry; -+ } -+ -+ BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); -+ -+ dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p); -+ -+ if (dsq->id == SCX_DSQ_LOCAL) -+ dispatch_to_local_dsq(rq, dsq, p, enq_flags); -+ else -+ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); -+} -+ -+static void flush_dispatch_buf(struct rq *rq) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ u32 u; -+ -+ for (u = 0; u < dspc->cursor; u++) { -+ struct scx_dsp_buf_ent *ent = &dspc->buf[u]; -+ -+ finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id, -+ ent->enq_flags); -+ } -+ -+ dspc->nr_tasks += dspc->cursor; -+ dspc->cursor = 0; -+} -+ -+static int balance_one(struct rq *rq, struct task_struct *prev, bool local) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ bool prev_on_scx = prev->sched_class == &ext_sched_class; -+ int nr_loops = SCX_DSP_MAX_LOOPS; -+ bool has_tasks = false; -+ -+ lockdep_assert_rq_held(rq); -+ rq->scx.flags |= SCX_RQ_IN_BALANCE; -+ -+ if (static_branch_unlikely(&scx_ops_cpu_preempt) && -+ unlikely(rq->scx.cpu_released)) { -+ /* -+ * If the previous sched_class for the current CPU was not SCX, -+ * notify the BPF scheduler that it again has control of the -+ * core. This callback complements ->cpu_release(), which is -+ * emitted in scx_next_task_picked(). -+ */ -+ if (SCX_HAS_OP(cpu_acquire)) -+ SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); -+ rq->scx.cpu_released = false; -+ } -+ -+ if (prev_on_scx) { -+ WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP)); -+ update_curr_scx(rq); -+ -+ /* -+ * If @prev is runnable & has slice left, it has priority and -+ * fetching more just increases latency for the fetched tasks. -+ * Tell put_prev_task_scx() to put @prev on local_dsq. If the -+ * BPF scheduler wants to handle this explicitly, it should -+ * implement ->cpu_released(). -+ * -+ * See scx_ops_disable_workfn() for the explanation on the -+ * bypassing test. 
-+ * -+ * When balancing a remote CPU for core-sched, there won't be a -+ * following put_prev_task_scx() call and we don't own -+ * %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the -+ * same conditions later and pick @rq->curr accordingly. -+ */ -+ if ((prev->scx.flags & SCX_TASK_QUEUED) && -+ prev->scx.slice && !scx_rq_bypassing(rq)) { -+ if (local) -+ prev->scx.flags |= SCX_TASK_BAL_KEEP; -+ goto has_tasks; -+ } -+ } -+ -+ /* if there already are tasks to run, nothing to do */ -+ if (rq->scx.local_dsq.nr) -+ goto has_tasks; -+ -+ if (consume_global_dsq(rq)) -+ goto has_tasks; -+ -+ if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) -+ goto out; -+ -+ dspc->rq = rq; -+ -+ /* -+ * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, -+ * the local DSQ might still end up empty after a successful -+ * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() -+ * produced some tasks, retry. The BPF scheduler may depend on this -+ * looping behavior to simplify its implementation. -+ */ -+ do { -+ dspc->nr_tasks = 0; -+ -+ SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), -+ prev_on_scx ? prev : NULL); -+ -+ flush_dispatch_buf(rq); -+ -+ if (rq->scx.local_dsq.nr) -+ goto has_tasks; -+ if (consume_global_dsq(rq)) -+ goto has_tasks; -+ -+ /* -+ * ops.dispatch() can trap us in this loop by repeatedly -+ * dispatching ineligible tasks. Break out once in a while to -+ * allow the watchdog to run. As IRQ can't be enabled in -+ * balance(), we want to complete this scheduling cycle and then -+ * start a new one. IOW, we want to call resched_curr() on the -+ * next, most likely idle, task, not the current one. Use -+ * scx_bpf_kick_cpu() for deferred kicking. 
-+ */ -+ if (unlikely(!--nr_loops)) { -+ scx_bpf_kick_cpu(cpu_of(rq), 0); -+ break; -+ } -+ } while (dspc->nr_tasks); -+ -+ goto out; -+ -+has_tasks: -+ has_tasks = true; -+out: -+ rq->scx.flags &= ~SCX_RQ_IN_BALANCE; -+ return has_tasks; -+} -+ -+static int balance_scx(struct rq *rq, struct task_struct *prev, -+ struct rq_flags *rf) -+{ -+ int ret; -+ -+ rq_unpin_lock(rq, rf); -+ -+ ret = balance_one(rq, prev, true); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When core-sched is enabled, this ops.balance() call will be followed -+ * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx() -+ * on the SMT siblings. Balance the siblings too. -+ */ -+ if (sched_core_enabled(rq)) { -+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); -+ int scpu; -+ -+ for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { -+ struct rq *srq = cpu_rq(scpu); -+ struct task_struct *sprev = srq->curr; -+ -+ WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); -+ update_rq_clock(srq); -+ balance_one(srq, sprev, false); -+ } -+ } -+#endif -+ rq_repin_lock(rq, rf); -+ -+ return ret; -+} -+ -+static void process_ddsp_deferred_locals(struct rq *rq) -+{ -+ struct task_struct *p; -+ -+ lockdep_assert_rq_held(rq); -+ -+ /* -+ * Now that @rq can be unlocked, execute the deferred enqueueing of -+ * tasks directly dispatched to the local DSQs of other CPUs. See -+ * direct_dispatch(). Keep popping from the head instead of using -+ * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq -+ * temporarily. 
-+ */ -+ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, -+ struct task_struct, scx.dsq_list.node))) { -+ struct scx_dispatch_q *dsq; -+ -+ list_del_init(&p->scx.dsq_list.node); -+ -+ dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); -+ if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) -+ dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags); -+ } -+} -+ -+static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) -+{ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ /* -+ * Core-sched might decide to execute @p before it is -+ * dispatched. Call ops_dequeue() to notify the BPF scheduler. -+ */ -+ ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); -+ dispatch_dequeue(rq, p); -+ } -+ -+ p->se.exec_start = rq_clock_task(rq); -+ -+ /* see dequeue_task_scx() on why we skip when !QUEUED */ -+ if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, running, p); -+ -+ clr_task_runnable(p, true); -+ -+ /* -+ * @p is getting newly scheduled or got kicked after someone updated its -+ * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). -+ */ -+ if ((p->scx.slice == SCX_SLICE_INF) != -+ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { -+ if (p->scx.slice == SCX_SLICE_INF) -+ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; -+ else -+ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; -+ -+ sched_update_tick_dependency(rq); -+ -+ /* -+ * For now, let's refresh the load_avgs just when transitioning -+ * in and out of nohz. In the future, we might want to add a -+ * mechanism which calls the following periodically on -+ * tick-stopped CPUs. 
-+ */ -+ update_other_load_avgs(rq); -+ } -+} -+ -+static enum scx_cpu_preempt_reason -+preempt_reason_from_class(const struct sched_class *class) -+{ -+#ifdef CONFIG_SMP -+ if (class == &stop_sched_class) -+ return SCX_CPU_PREEMPT_STOP; -+#endif -+ if (class == &dl_sched_class) -+ return SCX_CPU_PREEMPT_DL; -+ if (class == &rt_sched_class) -+ return SCX_CPU_PREEMPT_RT; -+ return SCX_CPU_PREEMPT_UNKNOWN; -+} -+ -+static void switch_class_scx(struct rq *rq, struct task_struct *next) -+{ -+ const struct sched_class *next_class = next->sched_class; -+ -+ if (!scx_enabled()) -+ return; -+#ifdef CONFIG_SMP -+ /* -+ * Pairs with the smp_load_acquire() issued by a CPU in -+ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a -+ * resched. -+ */ -+ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); -+#endif -+ if (!static_branch_unlikely(&scx_ops_cpu_preempt)) -+ return; -+ -+ /* -+ * The callback is conceptually meant to convey that the CPU is no -+ * longer under the control of SCX. Therefore, don't invoke the callback -+ * if the next class is below SCX (in which case the BPF scheduler has -+ * actively decided not to schedule any tasks on the CPU). -+ */ -+ if (sched_class_above(&ext_sched_class, next_class)) -+ return; -+ -+ /* -+ * At this point we know that SCX was preempted by a higher priority -+ * sched_class, so invoke the ->cpu_release() callback if we have not -+ * done so already. We only send the callback once between SCX being -+ * preempted, and it regaining control of the CPU. -+ * -+ * ->cpu_release() complements ->cpu_acquire(), which is emitted the -+ * next time that balance_scx() is invoked. 
-+ */ -+ if (!rq->scx.cpu_released) { -+ if (SCX_HAS_OP(cpu_release)) { -+ struct scx_cpu_release_args args = { -+ .reason = preempt_reason_from_class(next_class), -+ .task = next, -+ }; -+ -+ SCX_CALL_OP(SCX_KF_CPU_RELEASE, -+ cpu_release, cpu_of(rq), &args); -+ } -+ rq->scx.cpu_released = true; -+ } -+} -+ -+static void put_prev_task_scx(struct rq *rq, struct task_struct *p) -+{ -+ update_curr_scx(rq); -+ -+ /* see dequeue_task_scx() on why we skip when !QUEUED */ -+ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); -+ -+ /* -+ * If we're being called from put_prev_task_balance(), balance_scx() may -+ * have decided that @p should keep running. -+ */ -+ if (p->scx.flags & SCX_TASK_BAL_KEEP) { -+ p->scx.flags &= ~SCX_TASK_BAL_KEEP; -+ set_task_runnable(rq, p); -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ set_task_runnable(rq, p); -+ -+ /* -+ * If @p has slice left and balance_scx() didn't tag it for -+ * keeping, @p is getting preempted by a higher priority -+ * scheduler class or core-sched forcing a different task. Leave -+ * it at the head of the local DSQ. -+ */ -+ if (p->scx.slice && !scx_rq_bypassing(rq)) { -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ /* -+ * If we're in the pick_next_task path, balance_scx() should -+ * have already populated the local DSQ if there are any other -+ * available tasks. If empty, tell ops.enqueue() that @p is the -+ * only one available for this cpu. ops.enqueue() should put it -+ * on the local DSQ so that the subsequent pick_next_task_scx() -+ * can find the task unless it wants to trigger a separate -+ * follow-up scheduling event. 
-+ */ -+ if (list_empty(&rq->scx.local_dsq.list)) -+ do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); -+ else -+ do_enqueue_task(rq, p, 0, -1); -+ } -+} -+ -+static struct task_struct *first_local_task(struct rq *rq) -+{ -+ return list_first_entry_or_null(&rq->scx.local_dsq.list, -+ struct task_struct, scx.dsq_list.node); -+} -+ -+static struct task_struct *pick_next_task_scx(struct rq *rq) -+{ -+ struct task_struct *p; -+ -+ p = first_local_task(rq); -+ if (!p) -+ return NULL; -+ -+ set_next_task_scx(rq, p, true); -+ -+ if (unlikely(!p->scx.slice)) { -+ if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { -+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", -+ p->comm, p->pid); -+ scx_warned_zero_slice = true; -+ } -+ p->scx.slice = SCX_SLICE_DFL; -+ } -+ -+ return p; -+} -+ -+#ifdef CONFIG_SCHED_CORE -+/** -+ * scx_prio_less - Task ordering for core-sched -+ * @a: task A -+ * @b: task B -+ * -+ * Core-sched is implemented as an additional scheduling layer on top of the -+ * usual sched_class'es and needs to find out the expected task ordering. For -+ * SCX, core-sched calls this function to interrogate the task ordering. -+ * -+ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used -+ * to implement the default task ordering. The older the timestamp, the higher -+ * prority the task - the global FIFO ordering matching the default scheduling -+ * behavior. -+ * -+ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to -+ * implement FIFO ordering within each local DSQ. See pick_task_scx(). -+ */ -+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, -+ bool in_fi) -+{ -+ /* -+ * The const qualifiers are dropped from task_struct pointers when -+ * calling ops.core_sched_before(). Accesses are controlled by the -+ * verifier. 
-+ */ -+ if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a))) -+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, -+ (struct task_struct *)a, -+ (struct task_struct *)b); -+ else -+ return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); -+} -+ -+/** -+ * pick_task_scx - Pick a candidate task for core-sched -+ * @rq: rq to pick the candidate task from -+ * -+ * Core-sched calls this function on each SMT sibling to determine the next -+ * tasks to run on the SMT siblings. balance_one() has been called on all -+ * siblings and put_prev_task_scx() has been called only for the current CPU. -+ * -+ * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look -+ * at the first task in the local dsq. @rq->curr has to be considered explicitly -+ * to mimic %SCX_TASK_BAL_KEEP. -+ */ -+static struct task_struct *pick_task_scx(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ struct task_struct *first = first_local_task(rq); -+ -+ if (curr->scx.flags & SCX_TASK_QUEUED) { -+ /* is curr the only runnable task? */ -+ if (!first) -+ return curr; -+ -+ /* -+ * Does curr trump first? We can always go by core_sched_at for -+ * this comparison as it represents global FIFO ordering when -+ * the default core-sched ordering is used and local-DSQ FIFO -+ * ordering otherwise. -+ * -+ * We can have a task with an earlier timestamp on the DSQ. For -+ * example, when a current task is preempted by a sibling -+ * picking a different cookie, the task would be requeued at the -+ * head of the local DSQ with an earlier timestamp than the -+ * core-sched picked next task. Besides, the BPF scheduler may -+ * dispatch any tasks to the local DSQ anytime. 
-+ */ -+ if (curr->scx.slice && time_before64(curr->scx.core_sched_at, -+ first->scx.core_sched_at)) -+ return curr; -+ } -+ -+ return first; /* this may be %NULL */ -+} -+#endif /* CONFIG_SCHED_CORE */ -+ -+#ifdef CONFIG_SMP -+ -+static bool test_and_clear_cpu_idle(int cpu) -+{ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * SMT mask should be cleared whether we can claim @cpu or not. The SMT -+ * cluster is not wholly idle either way. This also prevents -+ * scx_pick_idle_cpu() from getting caught in an infinite loop. -+ */ -+ if (sched_smt_active()) { -+ const struct cpumask *smt = cpu_smt_mask(cpu); -+ -+ /* -+ * If offline, @cpu is not its own sibling and -+ * scx_pick_idle_cpu() can get caught in an infinite loop as -+ * @cpu is never cleared from idle_masks.smt. Ensure that @cpu -+ * is eventually cleared. -+ */ -+ if (cpumask_intersects(smt, idle_masks.smt)) -+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); -+ else if (cpumask_test_cpu(cpu, idle_masks.smt)) -+ __cpumask_clear_cpu(cpu, idle_masks.smt); -+ } -+#endif -+ return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); -+} -+ -+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -+{ -+ int cpu; -+ -+retry: -+ if (sched_smt_active()) { -+ cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); -+ if (cpu < nr_cpu_ids) -+ goto found; -+ -+ if (flags & SCX_PICK_IDLE_CORE) -+ return -EBUSY; -+ } -+ -+ cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); -+ if (cpu >= nr_cpu_ids) -+ return -EBUSY; -+ -+found: -+ if (test_and_clear_cpu_idle(cpu)) -+ return cpu; -+ else -+ goto retry; -+} -+ -+static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags, bool *found) -+{ -+ s32 cpu; -+ -+ *found = false; -+ -+ /* -+ * If WAKE_SYNC, the waker's local DSQ is empty, and the system is -+ * under utilized, wake up @p to the local DSQ of the waker. 
Checking -+ * only for an empty local DSQ is insufficient as it could give the -+ * wakee an unfair advantage when the system is oversaturated. -+ * Checking only for the presence of idle CPUs is also insufficient as -+ * the local DSQ of the waker could have tasks piled up on it even if -+ * there is an idle core elsewhere on the system. -+ */ -+ cpu = smp_processor_id(); -+ if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && -+ !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && -+ cpu_rq(cpu)->scx.local_dsq.nr == 0) { -+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) -+ goto cpu_found; -+ } -+ -+ if (p->nr_cpus_allowed == 1) { -+ if (test_and_clear_cpu_idle(prev_cpu)) { -+ cpu = prev_cpu; -+ goto cpu_found; -+ } else { -+ return prev_cpu; -+ } -+ } -+ -+ /* -+ * If CPU has SMT, any wholly idle CPU is likely a better pick than -+ * partially idle @prev_cpu. -+ */ -+ if (sched_smt_active()) { -+ if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && -+ test_and_clear_cpu_idle(prev_cpu)) { -+ cpu = prev_cpu; -+ goto cpu_found; -+ } -+ -+ cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) -+ goto cpu_found; -+ } -+ -+ if (test_and_clear_cpu_idle(prev_cpu)) { -+ cpu = prev_cpu; -+ goto cpu_found; -+ } -+ -+ cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ goto cpu_found; -+ -+ return prev_cpu; -+ -+cpu_found: -+ *found = true; -+ return cpu; -+} -+ -+static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) -+{ -+ /* -+ * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it -+ * can be a good migration opportunity with low cache and memory -+ * footprint. Returning a CPU different than @prev_cpu triggers -+ * immediate rq migration. However, for SCX, as the current rq -+ * association doesn't dictate where the task is going to run, this -+ * doesn't fit well. 
If necessary, we can later add a dedicated method -+ * which can decide to preempt self to force it through the regular -+ * scheduling path. -+ */ -+ if (unlikely(wake_flags & WF_EXEC)) -+ return prev_cpu; -+ -+ if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { -+ s32 cpu; -+ struct task_struct **ddsp_taskp; -+ -+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); -+ WARN_ON_ONCE(*ddsp_taskp); -+ *ddsp_taskp = p; -+ -+ cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, -+ select_cpu, p, prev_cpu, wake_flags); -+ *ddsp_taskp = NULL; -+ if (ops_cpu_valid(cpu, "from ops.select_cpu()")) -+ return cpu; -+ else -+ return prev_cpu; -+ } else { -+ bool found; -+ s32 cpu; -+ -+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); -+ if (found) { -+ p->scx.slice = SCX_SLICE_DFL; -+ p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; -+ } -+ return cpu; -+ } -+} -+ -+static void task_woken_scx(struct rq *rq, struct task_struct *p) -+{ -+ run_deferred(rq); -+} -+ -+static void set_cpus_allowed_scx(struct task_struct *p, -+ struct affinity_context *ac) -+{ -+ set_cpus_allowed_common(p, ac); -+ -+ /* -+ * The effective cpumask is stored in @p->cpus_ptr which may temporarily -+ * differ from the configured one in @p->cpus_mask. Always tell the bpf -+ * scheduler the effective one. -+ * -+ * Fine-grained memory write control is enforced by BPF making the const -+ * designation pointless. Cast it away when calling the operation. -+ */ -+ if (SCX_HAS_OP(set_cpumask)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, -+ (struct cpumask *)p->cpus_ptr); -+} -+ -+static void reset_idle_masks(void) -+{ -+ /* -+ * Consider all online cpus idle. Should converge to the actual state -+ * quickly. 
-+ */ -+ cpumask_copy(idle_masks.cpu, cpu_online_mask); -+ cpumask_copy(idle_masks.smt, cpu_online_mask); -+} -+ -+void __scx_update_idle(struct rq *rq, bool idle) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { -+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); -+ if (!static_branch_unlikely(&scx_builtin_idle_enabled)) -+ return; -+ } -+ -+ if (idle) -+ cpumask_set_cpu(cpu, idle_masks.cpu); -+ else -+ cpumask_clear_cpu(cpu, idle_masks.cpu); -+ -+#ifdef CONFIG_SCHED_SMT -+ if (sched_smt_active()) { -+ const struct cpumask *smt = cpu_smt_mask(cpu); -+ -+ if (idle) { -+ /* -+ * idle_masks.smt handling is racy but that's fine as -+ * it's only for optimization and self-correcting. -+ */ -+ for_each_cpu(cpu, smt) { -+ if (!cpumask_test_cpu(cpu, idle_masks.cpu)) -+ return; -+ } -+ cpumask_or(idle_masks.smt, idle_masks.smt, smt); -+ } else { -+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); -+ } -+ } -+#endif -+} -+ -+static void handle_hotplug(struct rq *rq, bool online) -+{ -+ int cpu = cpu_of(rq); -+ -+ atomic_long_inc(&scx_hotplug_seq); -+ -+ if (online && SCX_HAS_OP(cpu_online)) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); -+ else if (!online && SCX_HAS_OP(cpu_offline)) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu); -+ else -+ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, -+ "cpu %d going %s, exiting scheduler", cpu, -+ online ? 
"online" : "offline"); -+} -+ -+void scx_rq_activate(struct rq *rq) -+{ -+ handle_hotplug(rq, true); -+} -+ -+void scx_rq_deactivate(struct rq *rq) -+{ -+ handle_hotplug(rq, false); -+} -+ -+static void rq_online_scx(struct rq *rq) -+{ -+ rq->scx.flags |= SCX_RQ_ONLINE; -+} -+ -+static void rq_offline_scx(struct rq *rq) -+{ -+ rq->scx.flags &= ~SCX_RQ_ONLINE; -+} -+ -+#else /* CONFIG_SMP */ -+ -+static bool test_and_clear_cpu_idle(int cpu) { return false; } -+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -+static void reset_idle_masks(void) {} -+ -+#endif /* CONFIG_SMP */ -+ -+static bool check_rq_for_timeouts(struct rq *rq) -+{ -+ struct task_struct *p; -+ struct rq_flags rf; -+ bool timed_out = false; -+ -+ rq_lock_irqsave(rq, &rf); -+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { -+ unsigned long last_runnable = p->scx.runnable_at; -+ -+ if (unlikely(time_after(jiffies, -+ last_runnable + scx_watchdog_timeout))) { -+ u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); -+ -+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL, -+ "%s[%d] failed to run for %u.%03us", -+ p->comm, p->pid, -+ dur_ms / 1000, dur_ms % 1000); -+ timed_out = true; -+ break; -+ } -+ } -+ rq_unlock_irqrestore(rq, &rf); -+ -+ return timed_out; -+} -+ -+static void scx_watchdog_workfn(struct work_struct *work) -+{ -+ int cpu; -+ -+ WRITE_ONCE(scx_watchdog_timestamp, jiffies); -+ -+ for_each_online_cpu(cpu) { -+ if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) -+ break; -+ -+ cond_resched(); -+ } -+ queue_delayed_work(system_unbound_wq, to_delayed_work(work), -+ scx_watchdog_timeout / 2); -+} -+ -+void scx_tick(struct rq *rq) -+{ -+ unsigned long last_check; -+ -+ if (!scx_enabled()) -+ return; -+ -+ last_check = READ_ONCE(scx_watchdog_timestamp); -+ if (unlikely(time_after(jiffies, -+ last_check + READ_ONCE(scx_watchdog_timeout)))) { -+ u32 dur_ms = jiffies_to_msecs(jiffies - last_check); -+ -+ 
scx_ops_error_kind(SCX_EXIT_ERROR_STALL, -+ "watchdog failed to check in for %u.%03us", -+ dur_ms / 1000, dur_ms % 1000); -+ } -+ -+ update_other_load_avgs(rq); -+} -+ -+static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) -+{ -+ update_curr_scx(rq); -+ -+ /* -+ * While disabling, always resched and refresh core-sched timestamp as -+ * we can't trust the slice management or ops.core_sched_before(). -+ */ -+ if (scx_rq_bypassing(rq)) { -+ curr->scx.slice = 0; -+ touch_core_sched(rq, curr); -+ } else if (SCX_HAS_OP(tick)) { -+ SCX_CALL_OP(SCX_KF_REST, tick, curr); -+ } -+ -+ if (!curr->scx.slice) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+static struct cgroup *tg_cgrp(struct task_group *tg) -+{ -+ /* -+ * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, -+ * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the -+ * root cgroup. -+ */ -+ if (tg && tg->css.cgroup) -+ return tg->css.cgroup; -+ else -+ return &cgrp_dfl_root.cgrp; -+} -+ -+#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), -+ -+#else /* CONFIG_EXT_GROUP_SCHED */ -+ -+#define SCX_INIT_TASK_ARGS_CGROUP(tg) -+ -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+ -+static enum scx_task_state scx_get_task_state(const struct task_struct *p) -+{ -+ return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; -+} -+ -+static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) -+{ -+ enum scx_task_state prev_state = scx_get_task_state(p); -+ bool warn = false; -+ -+ BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); -+ -+ switch (state) { -+ case SCX_TASK_NONE: -+ break; -+ case SCX_TASK_INIT: -+ warn = prev_state != SCX_TASK_NONE; -+ break; -+ case SCX_TASK_READY: -+ warn = prev_state == SCX_TASK_NONE; -+ break; -+ case SCX_TASK_ENABLED: -+ warn = prev_state != SCX_TASK_READY; -+ break; -+ default: -+ warn = true; -+ return; -+ } -+ -+ WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> 
%d for %s[%d]", -+ prev_state, state, p->comm, p->pid); -+ -+ p->scx.flags &= ~SCX_TASK_STATE_MASK; -+ p->scx.flags |= state << SCX_TASK_STATE_SHIFT; -+} -+ -+static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) -+{ -+ int ret; -+ -+ p->scx.disallow = false; -+ -+ if (SCX_HAS_OP(init_task)) { -+ struct scx_init_task_args args = { -+ SCX_INIT_TASK_ARGS_CGROUP(tg) -+ .fork = fork, -+ }; -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args); -+ if (unlikely(ret)) { -+ ret = ops_sanitize_err("init_task", ret); -+ return ret; -+ } -+ } -+ -+ scx_set_task_state(p, SCX_TASK_INIT); -+ -+ if (p->scx.disallow) { -+ if (!fork) { -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ rq = task_rq_lock(p, &rf); -+ -+ /* -+ * We're in the load path and @p->policy will be applied -+ * right after. Reverting @p->policy here and rejecting -+ * %SCHED_EXT transitions from scx_check_setscheduler() -+ * guarantees that if ops.init_task() sets @p->disallow, -+ * @p can never be in SCX. -+ */ -+ if (p->policy == SCHED_EXT) { -+ p->policy = SCHED_NORMAL; -+ atomic_long_inc(&scx_nr_rejected); -+ } -+ -+ task_rq_unlock(rq, p, &rf); -+ } else if (p->policy == SCHED_EXT) { -+ scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork", -+ p->comm, p->pid); -+ } -+ } -+ -+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; -+ return 0; -+} -+ -+static void scx_ops_enable_task(struct task_struct *p) -+{ -+ u32 weight; -+ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ /* -+ * Set the weight before calling ops.enable() so that the scheduler -+ * doesn't see a stale value if they inspect the task struct. 
-+ */ -+ if (task_has_idle_policy(p)) -+ weight = WEIGHT_IDLEPRIO; -+ else -+ weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; -+ -+ p->scx.weight = sched_weight_to_cgroup(weight); -+ -+ if (SCX_HAS_OP(enable)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); -+ scx_set_task_state(p, SCX_TASK_ENABLED); -+ -+ if (SCX_HAS_OP(set_weight)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); -+} -+ -+static void scx_ops_disable_task(struct task_struct *p) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); -+ -+ if (SCX_HAS_OP(disable)) -+ SCX_CALL_OP(SCX_KF_REST, disable, p); -+ scx_set_task_state(p, SCX_TASK_READY); -+} -+ -+static void scx_ops_exit_task(struct task_struct *p) -+{ -+ struct scx_exit_task_args args = { -+ .cancelled = false, -+ }; -+ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ switch (scx_get_task_state(p)) { -+ case SCX_TASK_NONE: -+ return; -+ case SCX_TASK_INIT: -+ args.cancelled = true; -+ break; -+ case SCX_TASK_READY: -+ break; -+ case SCX_TASK_ENABLED: -+ scx_ops_disable_task(p); -+ break; -+ default: -+ WARN_ON_ONCE(true); -+ return; -+ } -+ -+ if (SCX_HAS_OP(exit_task)) -+ SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); -+ scx_set_task_state(p, SCX_TASK_NONE); -+} -+ -+void init_scx_entity(struct sched_ext_entity *scx) -+{ -+ /* -+ * init_idle() calls this function again after fork sequence is -+ * complete. Don't touch ->tasks_node as it's already linked. 
-+ */ -+ memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); -+ -+ INIT_LIST_HEAD(&scx->dsq_list.node); -+ RB_CLEAR_NODE(&scx->dsq_priq); -+ scx->sticky_cpu = -1; -+ scx->holding_cpu = -1; -+ INIT_LIST_HEAD(&scx->runnable_node); -+ scx->runnable_at = jiffies; -+ scx->ddsp_dsq_id = SCX_DSQ_INVALID; -+ scx->slice = SCX_SLICE_DFL; -+} -+ -+void scx_pre_fork(struct task_struct *p) -+{ -+ /* -+ * BPF scheduler enable/disable paths want to be able to iterate and -+ * update all tasks which can become complex when racing forks. As -+ * enable/disable are very cold paths, let's use a percpu_rwsem to -+ * exclude forks. -+ */ -+ percpu_down_read(&scx_fork_rwsem); -+} -+ -+int scx_fork(struct task_struct *p) -+{ -+ percpu_rwsem_assert_held(&scx_fork_rwsem); -+ -+ if (scx_ops_init_task_enabled) -+ return scx_ops_init_task(p, task_group(p), true); -+ else -+ return 0; -+} -+ -+void scx_post_fork(struct task_struct *p) -+{ -+ if (scx_ops_init_task_enabled) { -+ scx_set_task_state(p, SCX_TASK_READY); -+ -+ /* -+ * Enable the task immediately if it's running on sched_ext. -+ * Otherwise, it'll be enabled in switching_to_scx() if and -+ * when it's ever configured to run with a SCHED_EXT policy. 
-+ */ -+ if (p->sched_class == &ext_sched_class) { -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(p, &rf); -+ scx_ops_enable_task(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+ } -+ -+ spin_lock_irq(&scx_tasks_lock); -+ list_add_tail(&p->scx.tasks_node, &scx_tasks); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ percpu_up_read(&scx_fork_rwsem); -+} -+ -+void scx_cancel_fork(struct task_struct *p) -+{ -+ if (scx_enabled()) { -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ rq = task_rq_lock(p, &rf); -+ WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); -+ scx_ops_exit_task(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+ -+ percpu_up_read(&scx_fork_rwsem); -+} -+ -+void sched_ext_free(struct task_struct *p) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&scx_tasks_lock, flags); -+ list_del_init(&p->scx.tasks_node); -+ spin_unlock_irqrestore(&scx_tasks_lock, flags); -+ -+ /* -+ * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> -+ * ENABLED transitions can't race us. Disable ops for @p. -+ */ -+ if (scx_get_task_state(p) != SCX_TASK_NONE) { -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(p, &rf); -+ scx_ops_exit_task(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+} -+ -+static void reweight_task_scx(struct rq *rq, struct task_struct *p, -+ const struct load_weight *lw) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); -+ if (SCX_HAS_OP(set_weight)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); -+} -+ -+static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) -+{ -+} -+ -+static void switching_to_scx(struct rq *rq, struct task_struct *p) -+{ -+ scx_ops_enable_task(p); -+ -+ /* -+ * set_cpus_allowed_scx() is not called while @p is associated with a -+ * different scheduler class. Keep the BPF scheduler up-to-date. 
-+ */ -+ if (SCX_HAS_OP(set_cpumask)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, -+ (struct cpumask *)p->cpus_ptr); -+} -+ -+static void switched_from_scx(struct rq *rq, struct task_struct *p) -+{ -+ scx_ops_disable_task(p); -+} -+ -+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} -+static void switched_to_scx(struct rq *rq, struct task_struct *p) {} -+ -+int scx_check_setscheduler(struct task_struct *p, int policy) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ /* if disallow, reject transitioning into SCX */ -+ if (scx_enabled() && READ_ONCE(p->scx.disallow) && -+ p->policy != policy && policy == SCHED_EXT) -+ return -EACCES; -+ -+ return 0; -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+bool scx_can_stop_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (scx_rq_bypassing(rq)) -+ return false; -+ -+ if (p->sched_class != &ext_sched_class) -+ return true; -+ -+ /* -+ * @rq can dispatch from different DSQs, so we can't tell whether it -+ * needs the tick or not by looking at nr_running. Allow stopping ticks -+ * iff the BPF scheduler indicated so. See set_next_task_scx(). 
-+ */ -+ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; -+} -+#endif -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ -+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); -+static bool scx_cgroup_enabled; -+static bool cgroup_warned_missing_weight; -+static bool cgroup_warned_missing_idle; -+ -+static void scx_cgroup_warn_missing_weight(struct task_group *tg) -+{ -+ if (scx_ops_enable_state() == SCX_OPS_DISABLED || -+ cgroup_warned_missing_weight) -+ return; -+ -+ if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) -+ return; -+ -+ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n", -+ scx_ops.name); -+ cgroup_warned_missing_weight = true; -+} -+ -+static void scx_cgroup_warn_missing_idle(struct task_group *tg) -+{ -+ if (!scx_cgroup_enabled || cgroup_warned_missing_idle) -+ return; -+ -+ if (!tg->idle) -+ return; -+ -+ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n", -+ scx_ops.name); -+ cgroup_warned_missing_idle = true; -+} -+ -+int scx_tg_online(struct task_group *tg) -+{ -+ int ret = 0; -+ -+ WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); -+ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ scx_cgroup_warn_missing_weight(tg); -+ -+ if (scx_cgroup_enabled) { -+ if (SCX_HAS_OP(cgroup_init)) { -+ struct scx_cgroup_init_args args = -+ { .weight = tg->scx_weight }; -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, -+ tg->css.cgroup, &args); -+ if (ret) -+ ret = ops_sanitize_err("cgroup_init", ret); -+ } -+ if (ret == 0) -+ tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; -+ } else { -+ tg->scx_flags |= SCX_TG_ONLINE; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+ return ret; -+} -+ -+void scx_tg_offline(struct task_group *tg) -+{ -+ WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); -+ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); -+ tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); -+ -+ 
percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+int scx_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *p; -+ int ret; -+ -+ /* released in scx_finish/cancel_attach() */ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (!scx_cgroup_enabled) -+ return 0; -+ -+ cgroup_taskset_for_each(p, css, tset) { -+ struct cgroup *from = tg_cgrp(task_group(p)); -+ struct cgroup *to = tg_cgrp(css_tg(css)); -+ -+ WARN_ON_ONCE(p->scx.cgrp_moving_from); -+ -+ /* -+ * sched_move_task() omits identity migrations. Let's match the -+ * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() -+ * always match one-to-one. -+ */ -+ if (from == to) -+ continue; -+ -+ if (SCX_HAS_OP(cgroup_prep_move)) { -+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, -+ p, from, css->cgroup); -+ if (ret) -+ goto err; -+ } -+ -+ p->scx.cgrp_moving_from = from; -+ } -+ -+ return 0; -+ -+err: -+ cgroup_taskset_for_each(p, css, tset) { -+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, -+ p->scx.cgrp_moving_from, css->cgroup); -+ p->scx.cgrp_moving_from = NULL; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+ return ops_sanitize_err("cgroup_prep_move", ret); -+} -+ -+void scx_move_task(struct task_struct *p) -+{ -+ if (!scx_cgroup_enabled) -+ return; -+ -+ /* -+ * We're called from sched_move_task() which handles both cgroup and -+ * autogroup moves. Ignore the latter. -+ * -+ * Also ignore exiting tasks, because in the exit path tasks transition -+ * from the autogroup to the root group, so task_group_is_autogroup() -+ * alone isn't able to catch exiting autogroup tasks. This is safe for -+ * cgroup_move(), because cgroup migrations never happen for PF_EXITING -+ * tasks. -+ */ -+ if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) -+ return; -+ -+ /* -+ * @p must have ops.cgroup_prep_move() called on it and thus -+ * cgrp_moving_from set. 
-+ */ -+ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) -+ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, -+ p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); -+ p->scx.cgrp_moving_from = NULL; -+} -+ -+void scx_cgroup_finish_attach(void) -+{ -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *p; -+ -+ if (!scx_cgroup_enabled) -+ goto out_unlock; -+ -+ cgroup_taskset_for_each(p, css, tset) { -+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, -+ p->scx.cgrp_moving_from, css->cgroup); -+ p->scx.cgrp_moving_from = NULL; -+ } -+out_unlock: -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+void scx_group_set_weight(struct task_group *tg, unsigned long weight) -+{ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (scx_cgroup_enabled && tg->scx_weight != weight) { -+ if (SCX_HAS_OP(cgroup_set_weight)) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, -+ tg_cgrp(tg), weight); -+ tg->scx_weight = weight; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+void scx_group_set_idle(struct task_group *tg, bool idle) -+{ -+ percpu_down_read(&scx_cgroup_rwsem); -+ scx_cgroup_warn_missing_idle(tg); -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+static void scx_cgroup_lock(void) -+{ -+ percpu_down_write(&scx_cgroup_rwsem); -+} -+ -+static void scx_cgroup_unlock(void) -+{ -+ percpu_up_write(&scx_cgroup_rwsem); -+} -+ -+#else /* CONFIG_EXT_GROUP_SCHED */ -+ -+static inline void scx_cgroup_lock(void) {} -+static inline void scx_cgroup_unlock(void) {} -+ -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+ -+/* -+ * Omitted operations: -+ * -+ * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task -+ * isn't tied to the CPU at that point. Preemption is implemented by resetting -+ * the victim task's slice to 0 and triggering reschedule on the target CPU. 
-+ * -+ * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. -+ * -+ * - task_fork/dead: We need fork/dead notifications for all tasks regardless of -+ * their current sched_class. Call them directly from sched core instead. -+ */ -+DEFINE_SCHED_CLASS(ext) = { -+ .enqueue_task = enqueue_task_scx, -+ .dequeue_task = dequeue_task_scx, -+ .yield_task = yield_task_scx, -+ .yield_to_task = yield_to_task_scx, -+ -+ .wakeup_preempt = wakeup_preempt_scx, -+ -+ .balance = balance_scx, -+ .pick_next_task = pick_next_task_scx, -+ -+ .put_prev_task = put_prev_task_scx, -+ .set_next_task = set_next_task_scx, -+ -+ .switch_class = switch_class_scx, -+ -+#ifdef CONFIG_SMP -+ .select_task_rq = select_task_rq_scx, -+ .task_woken = task_woken_scx, -+ .set_cpus_allowed = set_cpus_allowed_scx, -+ -+ .rq_online = rq_online_scx, -+ .rq_offline = rq_offline_scx, -+#endif -+ -+#ifdef CONFIG_SCHED_CORE -+ .pick_task = pick_task_scx, -+#endif -+ -+ .task_tick = task_tick_scx, -+ -+ .switching_to = switching_to_scx, -+ .switched_from = switched_from_scx, -+ .switched_to = switched_to_scx, -+ .reweight_task = reweight_task_scx, -+ .prio_changed = prio_changed_scx, -+ -+ .update_curr = update_curr_scx, -+ -+#ifdef CONFIG_UCLAMP_TASK -+ .uclamp_enabled = 1, -+#endif -+}; -+ -+static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) -+{ -+ memset(dsq, 0, sizeof(*dsq)); -+ -+ raw_spin_lock_init(&dsq->lock); -+ INIT_LIST_HEAD(&dsq->list); -+ dsq->id = dsq_id; -+} -+ -+static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) -+{ -+ struct scx_dispatch_q *dsq; -+ int ret; -+ -+ if (dsq_id & SCX_DSQ_FLAG_BUILTIN) -+ return ERR_PTR(-EINVAL); -+ -+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); -+ if (!dsq) -+ return ERR_PTR(-ENOMEM); -+ -+ init_dsq(dsq, dsq_id); -+ -+ ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, -+ dsq_hash_params); -+ if (ret) { -+ kfree(dsq); -+ return ERR_PTR(ret); -+ } -+ return dsq; -+} -+ -+static void free_dsq_irq_workfn(struct 
irq_work *irq_work) -+{ -+ struct llist_node *to_free = llist_del_all(&dsqs_to_free); -+ struct scx_dispatch_q *dsq, *tmp_dsq; -+ -+ llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) -+ kfree_rcu(dsq, rcu); -+} -+ -+static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); -+ -+static void destroy_dsq(u64 dsq_id) -+{ -+ struct scx_dispatch_q *dsq; -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ dsq = find_user_dsq(dsq_id); -+ if (!dsq) -+ goto out_unlock_rcu; -+ -+ raw_spin_lock_irqsave(&dsq->lock, flags); -+ -+ if (dsq->nr) { -+ scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", -+ dsq->id, dsq->nr); -+ goto out_unlock_dsq; -+ } -+ -+ if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) -+ goto out_unlock_dsq; -+ -+ /* -+ * Mark dead by invalidating ->id to prevent dispatch_enqueue() from -+ * queueing more tasks. As this function can be called from anywhere, -+ * freeing is bounced through an irq work to avoid nesting RCU -+ * operations inside scheduler locks. -+ */ -+ dsq->id = SCX_DSQ_INVALID; -+ llist_add(&dsq->free_node, &dsqs_to_free); -+ irq_work_queue(&free_dsq_irq_work); -+ -+out_unlock_dsq: -+ raw_spin_unlock_irqrestore(&dsq->lock, flags); -+out_unlock_rcu: -+ rcu_read_unlock(); -+} -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+static void scx_cgroup_exit(void) -+{ -+ struct cgroup_subsys_state *css; -+ -+ percpu_rwsem_assert_held(&scx_cgroup_rwsem); -+ -+ WARN_ON_ONCE(!scx_cgroup_enabled); -+ scx_cgroup_enabled = false; -+ -+ /* -+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk -+ * cgroups and exit all the inited ones, all online cgroups are exited. 
-+ */ -+ rcu_read_lock(); -+ css_for_each_descendant_post(css, &root_task_group.css) { -+ struct task_group *tg = css_tg(css); -+ -+ if (!(tg->scx_flags & SCX_TG_INITED)) -+ continue; -+ tg->scx_flags &= ~SCX_TG_INITED; -+ -+ if (!scx_ops.cgroup_exit) -+ continue; -+ -+ if (WARN_ON_ONCE(!css_tryget(css))) -+ continue; -+ rcu_read_unlock(); -+ -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); -+ -+ rcu_read_lock(); -+ css_put(css); -+ } -+ rcu_read_unlock(); -+} -+ -+static int scx_cgroup_init(void) -+{ -+ struct cgroup_subsys_state *css; -+ int ret; -+ -+ percpu_rwsem_assert_held(&scx_cgroup_rwsem); -+ -+ cgroup_warned_missing_weight = false; -+ cgroup_warned_missing_idle = false; -+ -+ /* -+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk -+ * cgroups and init, all online cgroups are initialized. -+ */ -+ rcu_read_lock(); -+ css_for_each_descendant_pre(css, &root_task_group.css) { -+ struct task_group *tg = css_tg(css); -+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; -+ -+ scx_cgroup_warn_missing_weight(tg); -+ scx_cgroup_warn_missing_idle(tg); -+ -+ if ((tg->scx_flags & -+ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) -+ continue; -+ -+ if (!scx_ops.cgroup_init) { -+ tg->scx_flags |= SCX_TG_INITED; -+ continue; -+ } -+ -+ if (WARN_ON_ONCE(!css_tryget(css))) -+ continue; -+ rcu_read_unlock(); -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, -+ css->cgroup, &args); -+ if (ret) { -+ css_put(css); -+ return ret; -+ } -+ tg->scx_flags |= SCX_TG_INITED; -+ -+ rcu_read_lock(); -+ css_put(css); -+ } -+ rcu_read_unlock(); -+ -+ WARN_ON_ONCE(scx_cgroup_enabled); -+ scx_cgroup_enabled = true; -+ -+ return 0; -+} -+ -+#else -+static void scx_cgroup_exit(void) {} -+static int scx_cgroup_init(void) { return 0; } -+#endif -+ -+ -+/******************************************************************************** -+ * Sysfs interface and ops enable/disable. 
-+ */ -+ -+#define SCX_ATTR(_name) \ -+ static struct kobj_attribute scx_attr_##_name = { \ -+ .attr = { .name = __stringify(_name), .mode = 0444 }, \ -+ .show = scx_attr_##_name##_show, \ -+ } -+ -+static ssize_t scx_attr_state_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%s\n", -+ scx_ops_enable_state_str[scx_ops_enable_state()]); -+} -+SCX_ATTR(state); -+ -+static ssize_t scx_attr_switch_all_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); -+} -+SCX_ATTR(switch_all); -+ -+static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); -+} -+SCX_ATTR(nr_rejected); -+ -+static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); -+} -+SCX_ATTR(hotplug_seq); -+ -+static ssize_t scx_attr_enable_seq_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq)); -+} -+SCX_ATTR(enable_seq); -+ -+static struct attribute *scx_global_attrs[] = { -+ &scx_attr_state.attr, -+ &scx_attr_switch_all.attr, -+ &scx_attr_nr_rejected.attr, -+ &scx_attr_hotplug_seq.attr, -+ &scx_attr_enable_seq.attr, -+ NULL, -+}; -+ -+static const struct attribute_group scx_global_attr_group = { -+ .attrs = scx_global_attrs, -+}; -+ -+static void scx_kobj_release(struct kobject *kobj) -+{ -+ kfree(kobj); -+} -+ -+static ssize_t scx_attr_ops_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%s\n", scx_ops.name); -+} -+SCX_ATTR(ops); -+ -+static struct attribute *scx_sched_attrs[] = { -+ &scx_attr_ops.attr, -+ NULL, -+}; -+ATTRIBUTE_GROUPS(scx_sched); -+ -+static const struct kobj_type scx_ktype = { -+ 
.release = scx_kobj_release, -+ .sysfs_ops = &kobj_sysfs_ops, -+ .default_groups = scx_sched_groups, -+}; -+ -+static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) -+{ -+ return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); -+} -+ -+static const struct kset_uevent_ops scx_uevent_ops = { -+ .uevent = scx_uevent, -+}; -+ -+/* -+ * Used by sched_fork() and __setscheduler_prio() to pick the matching -+ * sched_class. dl/rt are already handled. -+ */ -+bool task_should_scx(struct task_struct *p) -+{ -+ if (!scx_enabled() || -+ unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) -+ return false; -+ if (READ_ONCE(scx_switching_all)) -+ return true; -+ return p->policy == SCHED_EXT; -+} -+ -+/** -+ * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress -+ * -+ * Bypassing guarantees that all runnable tasks make forward progress without -+ * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might -+ * be held by tasks that the BPF scheduler is forgetting to run, which -+ * unfortunately also excludes toggling the static branches. -+ * -+ * Let's work around by overriding a couple ops and modifying behaviors based on -+ * the DISABLING state and then cycling the queued tasks through dequeue/enqueue -+ * to force global FIFO scheduling. -+ * -+ * - ops.select_cpu() is ignored and the default select_cpu() is used. -+ * -+ * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. -+ * -+ * - ops.dispatch() is ignored. -+ * -+ * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice -+ * can't be trusted. Whenever a tick triggers, the running task is rotated to -+ * the tail of the queue with core_sched_at touched. -+ * -+ * - pick_next_task() suppresses zero slice warning. -+ * -+ * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM -+ * operations. -+ * -+ * - scx_prio_less() reverts to the default core_sched_at order. 
-+ */ -+static void scx_ops_bypass(bool bypass) -+{ -+ int depth, cpu; -+ -+ if (bypass) { -+ depth = atomic_inc_return(&scx_ops_bypass_depth); -+ WARN_ON_ONCE(depth <= 0); -+ if (depth != 1) -+ return; -+ } else { -+ depth = atomic_dec_return(&scx_ops_bypass_depth); -+ WARN_ON_ONCE(depth < 0); -+ if (depth != 0) -+ return; -+ } -+ -+ /* -+ * No task property is changing. We just need to make sure all currently -+ * queued tasks are re-queued according to the new scx_rq_bypassing() -+ * state. As an optimization, walk each rq's runnable_list instead of -+ * the scx_tasks list. -+ * -+ * This function can't trust the scheduler and thus can't use -+ * cpus_read_lock(). Walk all possible CPUs instead of online. -+ */ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ struct task_struct *p, *n; -+ -+ rq_lock_irqsave(rq, &rf); -+ -+ if (bypass) { -+ WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); -+ rq->scx.flags |= SCX_RQ_BYPASSING; -+ } else { -+ WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING)); -+ rq->scx.flags &= ~SCX_RQ_BYPASSING; -+ } -+ -+ /* -+ * We need to guarantee that no tasks are on the BPF scheduler -+ * while bypassing. Either we see enabled or the enable path -+ * sees scx_rq_bypassing() before moving tasks to SCX. -+ */ -+ if (!scx_enabled()) { -+ rq_unlock_irqrestore(rq, &rf); -+ continue; -+ } -+ -+ /* -+ * The use of list_for_each_entry_safe_reverse() is required -+ * because each task is going to be removed from and added back -+ * to the runnable_list during iteration. Because they're added -+ * to the tail of the list, safe reverse iteration can still -+ * visit all nodes. 
-+ */ -+ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, -+ scx.runnable_node) { -+ struct sched_enq_and_set_ctx ctx; -+ -+ /* cycling deq/enq is enough, see the function comment */ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); -+ sched_enq_and_set_task(&ctx); -+ } -+ -+ rq_unlock_irqrestore(rq, &rf); -+ -+ /* resched to restore ticks and idle state */ -+ resched_cpu(cpu); -+ } -+} -+ -+static void free_exit_info(struct scx_exit_info *ei) -+{ -+ kfree(ei->dump); -+ kfree(ei->msg); -+ kfree(ei->bt); -+ kfree(ei); -+} -+ -+static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) -+{ -+ struct scx_exit_info *ei; -+ -+ ei = kzalloc(sizeof(*ei), GFP_KERNEL); -+ if (!ei) -+ return NULL; -+ -+ ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); -+ ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); -+ ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); -+ -+ if (!ei->bt || !ei->msg || !ei->dump) { -+ free_exit_info(ei); -+ return NULL; -+ } -+ -+ return ei; -+} -+ -+static const char *scx_exit_reason(enum scx_exit_kind kind) -+{ -+ switch (kind) { -+ case SCX_EXIT_UNREG: -+ return "unregistered from user space"; -+ case SCX_EXIT_UNREG_BPF: -+ return "unregistered from BPF"; -+ case SCX_EXIT_UNREG_KERN: -+ return "unregistered from the main kernel"; -+ case SCX_EXIT_SYSRQ: -+ return "disabled by sysrq-S"; -+ case SCX_EXIT_ERROR: -+ return "runtime error"; -+ case SCX_EXIT_ERROR_BPF: -+ return "scx_bpf_error"; -+ case SCX_EXIT_ERROR_STALL: -+ return "runnable task stall"; -+ default: -+ return ""; -+ } -+} -+ -+static void scx_ops_disable_workfn(struct kthread_work *work) -+{ -+ struct scx_exit_info *ei = scx_exit_info; -+ struct scx_task_iter sti; -+ struct task_struct *p; -+ struct rhashtable_iter rht_iter; -+ struct scx_dispatch_q *dsq; -+ int i, kind; -+ -+ kind = atomic_read(&scx_exit_kind); -+ while (true) { -+ /* -+ * NONE indicates that a new scx_ops has been registered since -+ * disable was scheduled - don't kill 
the new ops. DONE -+ * indicates that the ops has already been disabled. -+ */ -+ if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) -+ return; -+ if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) -+ break; -+ } -+ ei->kind = kind; -+ ei->reason = scx_exit_reason(ei->kind); -+ -+ /* guarantee forward progress by bypassing scx_ops */ -+ scx_ops_bypass(true); -+ -+ switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { -+ case SCX_OPS_DISABLING: -+ WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); -+ break; -+ case SCX_OPS_DISABLED: -+ pr_warn("sched_ext: ops error detected without ops (%s)\n", -+ scx_exit_info->msg); -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != -+ SCX_OPS_DISABLING); -+ goto done; -+ default: -+ break; -+ } -+ -+ /* -+ * Here, every runnable task is guaranteed to make forward progress and -+ * we can safely use blocking synchronization constructs. Actually -+ * disable ops. -+ */ -+ mutex_lock(&scx_ops_enable_mutex); -+ -+ static_branch_disable(&__scx_switched_all); -+ WRITE_ONCE(scx_switching_all, false); -+ -+ /* -+ * Shut down cgroup support before tasks so that the cgroup attach path -+ * doesn't race against scx_ops_exit_task(). -+ */ -+ scx_cgroup_lock(); -+ scx_cgroup_exit(); -+ scx_cgroup_unlock(); -+ -+ /* -+ * The BPF scheduler is going away. All tasks including %TASK_DEAD ones -+ * must be switched out and exited synchronously. 
-+ */ -+ percpu_down_write(&scx_fork_rwsem); -+ -+ scx_ops_init_task_enabled = false; -+ -+ scx_task_iter_start(&sti); -+ while ((p = scx_task_iter_next_locked(&sti))) { -+ const struct sched_class *old_class = p->sched_class; -+ struct sched_enq_and_set_ctx ctx; -+ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); -+ -+ __setscheduler_prio(p, p->prio); -+ check_class_changing(task_rq(p), p, old_class); -+ -+ sched_enq_and_set_task(&ctx); -+ -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ scx_ops_exit_task(p); -+ } -+ scx_task_iter_stop(&sti); -+ percpu_up_write(&scx_fork_rwsem); -+ -+ /* no task is on scx, turn off all the switches and flush in-progress calls */ -+ static_branch_disable(&__scx_ops_enabled); -+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) -+ static_branch_disable(&scx_has_op[i]); -+ static_branch_disable(&scx_ops_enq_last); -+ static_branch_disable(&scx_ops_enq_exiting); -+ static_branch_disable(&scx_ops_cpu_preempt); -+ static_branch_disable(&scx_builtin_idle_enabled); -+ synchronize_rcu(); -+ -+ if (ei->kind >= SCX_EXIT_ERROR) { -+ pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", -+ scx_ops.name, ei->reason); -+ -+ if (ei->msg[0] != '\0') -+ pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg); -+#ifdef CONFIG_STACKTRACE -+ stack_trace_print(ei->bt, ei->bt_len, 2); -+#endif -+ } else { -+ pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", -+ scx_ops.name, ei->reason); -+ } -+ -+ if (scx_ops.exit) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); -+ -+ cancel_delayed_work_sync(&scx_watchdog_work); -+ -+ /* -+ * Delete the kobject from the hierarchy eagerly in addition to just -+ * dropping a reference. Otherwise, if the object is deleted -+ * asynchronously, sysfs could observe an object of the same name still -+ * in the hierarchy when another scheduler is loaded. 
-+ */ -+ kobject_del(scx_root_kobj); -+ kobject_put(scx_root_kobj); -+ scx_root_kobj = NULL; -+ -+ memset(&scx_ops, 0, sizeof(scx_ops)); -+ -+ rhashtable_walk_enter(&dsq_hash, &rht_iter); -+ do { -+ rhashtable_walk_start(&rht_iter); -+ -+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) -+ destroy_dsq(dsq->id); -+ -+ rhashtable_walk_stop(&rht_iter); -+ } while (dsq == ERR_PTR(-EAGAIN)); -+ rhashtable_walk_exit(&rht_iter); -+ -+ free_percpu(scx_dsp_ctx); -+ scx_dsp_ctx = NULL; -+ scx_dsp_max_batch = 0; -+ -+ free_exit_info(scx_exit_info); -+ scx_exit_info = NULL; -+ -+ mutex_unlock(&scx_ops_enable_mutex); -+ -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != -+ SCX_OPS_DISABLING); -+done: -+ scx_ops_bypass(false); -+} -+ -+static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); -+ -+static void schedule_scx_ops_disable_work(void) -+{ -+ struct kthread_worker *helper = READ_ONCE(scx_ops_helper); -+ -+ /* -+ * We may be called spuriously before the first bpf_sched_ext_reg(). If -+ * scx_ops_helper isn't set up yet, there's nothing to do. -+ */ -+ if (helper) -+ kthread_queue_work(helper, &scx_ops_disable_work); -+} -+ -+static void scx_ops_disable(enum scx_exit_kind kind) -+{ -+ int none = SCX_EXIT_NONE; -+ -+ if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) -+ kind = SCX_EXIT_ERROR; -+ -+ atomic_try_cmpxchg(&scx_exit_kind, &none, kind); -+ -+ schedule_scx_ops_disable_work(); -+} -+ -+static void dump_newline(struct seq_buf *s) -+{ -+ trace_sched_ext_dump(""); -+ -+ /* @s may be zero sized and seq_buf triggers WARN if so */ -+ if (s->size) -+ seq_buf_putc(s, '\n'); -+} -+ -+static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 
-+{ -+ va_list args; -+ -+#ifdef CONFIG_TRACEPOINTS -+ if (trace_sched_ext_dump_enabled()) { -+ /* protected by scx_dump_state()::dump_lock */ -+ static char line_buf[SCX_EXIT_MSG_LEN]; -+ -+ va_start(args, fmt); -+ vscnprintf(line_buf, sizeof(line_buf), fmt, args); -+ va_end(args); -+ -+ trace_sched_ext_dump(line_buf); -+ } -+#endif -+ /* @s may be zero sized and seq_buf triggers WARN if so */ -+ if (s->size) { -+ va_start(args, fmt); -+ seq_buf_vprintf(s, fmt, args); -+ va_end(args); -+ -+ seq_buf_putc(s, '\n'); -+ } -+} -+ -+static void dump_stack_trace(struct seq_buf *s, const char *prefix, -+ const unsigned long *bt, unsigned int len) -+{ -+ unsigned int i; -+ -+ for (i = 0; i < len; i++) -+ dump_line(s, "%s%pS", prefix, (void *)bt[i]); -+} -+ -+static void ops_dump_init(struct seq_buf *s, const char *prefix) -+{ -+ struct scx_dump_data *dd = &scx_dump_data; -+ -+ lockdep_assert_irqs_disabled(); -+ -+ dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ -+ dd->first = true; -+ dd->cursor = 0; -+ dd->s = s; -+ dd->prefix = prefix; -+} -+ -+static void ops_dump_flush(void) -+{ -+ struct scx_dump_data *dd = &scx_dump_data; -+ char *line = dd->buf.line; -+ -+ if (!dd->cursor) -+ return; -+ -+ /* -+ * There's something to flush and this is the first line. Insert a blank -+ * line to distinguish ops dump. -+ */ -+ if (dd->first) { -+ dump_newline(dd->s); -+ dd->first = false; -+ } -+ -+ /* -+ * There may be multiple lines in $line. Scan and emit each line -+ * separately. -+ */ -+ while (true) { -+ char *end = line; -+ char c; -+ -+ while (*end != '\n' && *end != '\0') -+ end++; -+ -+ /* -+ * If $line overflowed, it may not have newline at the end. -+ * Always emit with a newline. 
-+ */ -+ c = *end; -+ *end = '\0'; -+ dump_line(dd->s, "%s%s", dd->prefix, line); -+ if (c == '\0') -+ break; -+ -+ /* move to the next line */ -+ end++; -+ if (*end == '\0') -+ break; -+ line = end; -+ } -+ -+ dd->cursor = 0; -+} -+ -+static void ops_dump_exit(void) -+{ -+ ops_dump_flush(); -+ scx_dump_data.cpu = -1; -+} -+ -+static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, -+ struct task_struct *p, char marker) -+{ -+ static unsigned long bt[SCX_EXIT_BT_LEN]; -+ char dsq_id_buf[19] = "(n/a)"; -+ unsigned long ops_state = atomic_long_read(&p->scx.ops_state); -+ unsigned int bt_len = 0; -+ -+ if (p->scx.dsq) -+ scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", -+ (unsigned long long)p->scx.dsq->id); -+ -+ dump_newline(s); -+ dump_line(s, " %c%c %s[%d] %+ldms", -+ marker, task_state_to_char(p), p->comm, p->pid, -+ jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); -+ dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", -+ scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, -+ p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, -+ ops_state >> SCX_OPSS_QSEQ_SHIFT); -+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu", -+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, -+ p->scx.dsq_vtime); -+ dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); -+ -+ if (SCX_HAS_OP(dump_task)) { -+ ops_dump_init(s, " "); -+ SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); -+ ops_dump_exit(); -+ } -+ -+#ifdef CONFIG_STACKTRACE -+ bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); -+#endif -+ if (bt_len) { -+ dump_newline(s); -+ dump_stack_trace(s, " ", bt, bt_len); -+ } -+} -+ -+static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) -+{ -+ static DEFINE_SPINLOCK(dump_lock); -+ static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; -+ struct scx_dump_ctx dctx = { -+ .kind = ei->kind, -+ .exit_code = ei->exit_code, -+ .reason = ei->reason, -+ .at_ns = ktime_get_ns(), -+ 
.at_jiffies = jiffies, -+ }; -+ struct seq_buf s; -+ unsigned long flags; -+ char *buf; -+ int cpu; -+ -+ spin_lock_irqsave(&dump_lock, flags); -+ -+ seq_buf_init(&s, ei->dump, dump_len); -+ -+ if (ei->kind == SCX_EXIT_NONE) { -+ dump_line(&s, "Debug dump triggered by %s", ei->reason); -+ } else { -+ dump_line(&s, "%s[%d] triggered exit kind %d:", -+ current->comm, current->pid, ei->kind); -+ dump_line(&s, " %s (%s)", ei->reason, ei->msg); -+ dump_newline(&s); -+ dump_line(&s, "Backtrace:"); -+ dump_stack_trace(&s, " ", ei->bt, ei->bt_len); -+ } -+ -+ if (SCX_HAS_OP(dump)) { -+ ops_dump_init(&s, ""); -+ SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); -+ ops_dump_exit(); -+ } -+ -+ dump_newline(&s); -+ dump_line(&s, "CPU states"); -+ dump_line(&s, "----------"); -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ struct task_struct *p; -+ struct seq_buf ns; -+ size_t avail, used; -+ bool idle; -+ -+ rq_lock(rq, &rf); -+ -+ idle = list_empty(&rq->scx.runnable_list) && -+ rq->curr->sched_class == &idle_sched_class; -+ -+ if (idle && !SCX_HAS_OP(dump_cpu)) -+ goto next; -+ -+ /* -+ * We don't yet know whether ops.dump_cpu() will produce output -+ * and we may want to skip the default CPU dump if it doesn't. -+ * Use a nested seq_buf to generate the standard dump so that we -+ * can decide whether to commit later. 
-+ */ -+ avail = seq_buf_get_buf(&s, &buf); -+ seq_buf_init(&ns, buf, avail); -+ -+ dump_newline(&ns); -+ dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", -+ cpu, rq->scx.nr_running, rq->scx.flags, -+ rq->scx.cpu_released, rq->scx.ops_qseq, -+ rq->scx.pnt_seq); -+ dump_line(&ns, " curr=%s[%d] class=%ps", -+ rq->curr->comm, rq->curr->pid, -+ rq->curr->sched_class); -+ if (!cpumask_empty(rq->scx.cpus_to_kick)) -+ dump_line(&ns, " cpus_to_kick : %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_kick)); -+ if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) -+ dump_line(&ns, " idle_to_kick : %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); -+ if (!cpumask_empty(rq->scx.cpus_to_preempt)) -+ dump_line(&ns, " cpus_to_preempt: %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_preempt)); -+ if (!cpumask_empty(rq->scx.cpus_to_wait)) -+ dump_line(&ns, " cpus_to_wait : %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_wait)); -+ -+ used = seq_buf_used(&ns); -+ if (SCX_HAS_OP(dump_cpu)) { -+ ops_dump_init(&ns, " "); -+ SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); -+ ops_dump_exit(); -+ } -+ -+ /* -+ * If idle && nothing generated by ops.dump_cpu(), there's -+ * nothing interesting. Skip. -+ */ -+ if (idle && used == seq_buf_used(&ns)) -+ goto next; -+ -+ /* -+ * $s may already have overflowed when $ns was created. If so, -+ * calling commit on it will trigger BUG. 
-+ */ -+ if (avail) { -+ seq_buf_commit(&s, seq_buf_used(&ns)); -+ if (seq_buf_has_overflowed(&ns)) -+ seq_buf_set_overflow(&s); -+ } -+ -+ if (rq->curr->sched_class == &ext_sched_class) -+ scx_dump_task(&s, &dctx, rq->curr, '*'); -+ -+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) -+ scx_dump_task(&s, &dctx, p, ' '); -+ next: -+ rq_unlock(rq, &rf); -+ } -+ -+ if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) -+ memcpy(ei->dump + dump_len - sizeof(trunc_marker), -+ trunc_marker, sizeof(trunc_marker)); -+ -+ spin_unlock_irqrestore(&dump_lock, flags); -+} -+ -+static void scx_ops_error_irq_workfn(struct irq_work *irq_work) -+{ -+ struct scx_exit_info *ei = scx_exit_info; -+ -+ if (ei->kind >= SCX_EXIT_ERROR) -+ scx_dump_state(ei, scx_ops.exit_dump_len); -+ -+ schedule_scx_ops_disable_work(); -+} -+ -+static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); -+ -+static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, -+ s64 exit_code, -+ const char *fmt, ...) -+{ -+ struct scx_exit_info *ei = scx_exit_info; -+ int none = SCX_EXIT_NONE; -+ va_list args; -+ -+ if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) -+ return; -+ -+ ei->exit_code = exit_code; -+#ifdef CONFIG_STACKTRACE -+ if (kind >= SCX_EXIT_ERROR) -+ ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); -+#endif -+ va_start(args, fmt); -+ vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); -+ va_end(args); -+ -+ /* -+ * Set ei->kind and ->reason for scx_dump_state(). They'll be set again -+ * in scx_ops_disable_workfn(). 
-+ */ -+ ei->kind = kind; -+ ei->reason = scx_exit_reason(ei->kind); -+ -+ irq_work_queue(&scx_ops_error_irq_work); -+} -+ -+static struct kthread_worker *scx_create_rt_helper(const char *name) -+{ -+ struct kthread_worker *helper; -+ -+ helper = kthread_create_worker(0, name); -+ if (helper) -+ sched_set_fifo(helper->task); -+ return helper; -+} -+ -+static void check_hotplug_seq(const struct sched_ext_ops *ops) -+{ -+ unsigned long long global_hotplug_seq; -+ -+ /* -+ * If a hotplug event has occurred between when a scheduler was -+ * initialized, and when we were able to attach, exit and notify user -+ * space about it. -+ */ -+ if (ops->hotplug_seq) { -+ global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); -+ if (ops->hotplug_seq != global_hotplug_seq) { -+ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, -+ "expected hotplug seq %llu did not match actual %llu", -+ ops->hotplug_seq, global_hotplug_seq); -+ } -+ } -+} -+ -+static int validate_ops(const struct sched_ext_ops *ops) -+{ -+ /* -+ * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the -+ * ops.enqueue() callback isn't implemented. 
-+ */ -+ if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { -+ scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) -+{ -+ struct scx_task_iter sti; -+ struct task_struct *p; -+ unsigned long timeout; -+ int i, cpu, node, ret; -+ -+ if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), -+ cpu_possible_mask)) { -+ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation"); -+ return -EINVAL; -+ } -+ -+ mutex_lock(&scx_ops_enable_mutex); -+ -+ if (!scx_ops_helper) { -+ WRITE_ONCE(scx_ops_helper, -+ scx_create_rt_helper("sched_ext_ops_helper")); -+ if (!scx_ops_helper) { -+ ret = -ENOMEM; -+ goto err_unlock; -+ } -+ } -+ -+ if (!global_dsqs) { -+ struct scx_dispatch_q **dsqs; -+ -+ dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); -+ if (!dsqs) { -+ ret = -ENOMEM; -+ goto err_unlock; -+ } -+ -+ for_each_node_state(node, N_POSSIBLE) { -+ struct scx_dispatch_q *dsq; -+ -+ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); -+ if (!dsq) { -+ for_each_node_state(node, N_POSSIBLE) -+ kfree(dsqs[node]); -+ kfree(dsqs); -+ ret = -ENOMEM; -+ goto err_unlock; -+ } -+ -+ init_dsq(dsq, SCX_DSQ_GLOBAL); -+ dsqs[node] = dsq; -+ } -+ -+ global_dsqs = dsqs; -+ } -+ -+ if (scx_ops_enable_state() != SCX_OPS_DISABLED) { -+ ret = -EBUSY; -+ goto err_unlock; -+ } -+ -+ scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); -+ if (!scx_root_kobj) { -+ ret = -ENOMEM; -+ goto err_unlock; -+ } -+ -+ scx_root_kobj->kset = scx_kset; -+ ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); -+ if (ret < 0) -+ goto err; -+ -+ scx_exit_info = alloc_exit_info(ops->exit_dump_len); -+ if (!scx_exit_info) { -+ ret = -ENOMEM; -+ goto err_del; -+ } -+ -+ /* -+ * Set scx_ops, transition to ENABLING and clear exit info to arm the -+ * disable path. Failure triggers full disabling from here on. 
-+ */ -+ scx_ops = *ops; -+ -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != -+ SCX_OPS_DISABLED); -+ -+ atomic_set(&scx_exit_kind, SCX_EXIT_NONE); -+ scx_warned_zero_slice = false; -+ -+ atomic_long_set(&scx_nr_rejected, 0); -+ -+ for_each_possible_cpu(cpu) -+ cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; -+ -+ /* -+ * Keep CPUs stable during enable so that the BPF scheduler can track -+ * online CPUs by watching ->on/offline_cpu() after ->init(). -+ */ -+ cpus_read_lock(); -+ -+ if (scx_ops.init) { -+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); -+ if (ret) { -+ ret = ops_sanitize_err("init", ret); -+ cpus_read_unlock(); -+ goto err_disable; -+ } -+ } -+ -+ for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) -+ if (((void (**)(void))ops)[i]) -+ static_branch_enable_cpuslocked(&scx_has_op[i]); -+ -+ check_hotplug_seq(ops); -+ cpus_read_unlock(); -+ -+ ret = validate_ops(ops); -+ if (ret) -+ goto err_disable; -+ -+ WARN_ON_ONCE(scx_dsp_ctx); -+ scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; -+ scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, -+ scx_dsp_max_batch), -+ __alignof__(struct scx_dsp_ctx)); -+ if (!scx_dsp_ctx) { -+ ret = -ENOMEM; -+ goto err_disable; -+ } -+ -+ if (ops->timeout_ms) -+ timeout = msecs_to_jiffies(ops->timeout_ms); -+ else -+ timeout = SCX_WATCHDOG_MAX_TIMEOUT; -+ -+ WRITE_ONCE(scx_watchdog_timeout, timeout); -+ WRITE_ONCE(scx_watchdog_timestamp, jiffies); -+ queue_delayed_work(system_unbound_wq, &scx_watchdog_work, -+ scx_watchdog_timeout / 2); -+ -+ /* -+ * Once __scx_ops_enabled is set, %current can be switched to SCX -+ * anytime. This can lead to stalls as some BPF schedulers (e.g. -+ * userspace scheduling) may not function correctly before all tasks are -+ * switched. Init in bypass mode to guarantee forward progress. 
-+ */ -+ scx_ops_bypass(true); -+ -+ for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) -+ if (((void (**)(void))ops)[i]) -+ static_branch_enable(&scx_has_op[i]); -+ -+ if (ops->flags & SCX_OPS_ENQ_LAST) -+ static_branch_enable(&scx_ops_enq_last); -+ -+ if (ops->flags & SCX_OPS_ENQ_EXITING) -+ static_branch_enable(&scx_ops_enq_exiting); -+ if (scx_ops.cpu_acquire || scx_ops.cpu_release) -+ static_branch_enable(&scx_ops_cpu_preempt); -+ -+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { -+ reset_idle_masks(); -+ static_branch_enable(&scx_builtin_idle_enabled); -+ } else { -+ static_branch_disable(&scx_builtin_idle_enabled); -+ } -+ -+ /* -+ * Lock out forks, cgroup on/offlining and moves before opening the -+ * floodgate so that they don't wander into the operations prematurely. -+ */ -+ percpu_down_write(&scx_fork_rwsem); -+ -+ WARN_ON_ONCE(scx_ops_init_task_enabled); -+ scx_ops_init_task_enabled = true; -+ -+ /* -+ * Enable ops for every task. Fork is excluded by scx_fork_rwsem -+ * preventing new tasks from being added. No need to exclude tasks -+ * leaving as sched_ext_free() can handle both prepped and enabled -+ * tasks. Prep all tasks first and then enable them with preemption -+ * disabled. -+ * -+ * All cgroups should be initialized before scx_ops_init_task() so that -+ * the BPF scheduler can reliably track each task's cgroup membership -+ * from scx_ops_init_task(). Lock out cgroup on/offlining and task -+ * migrations while tasks are being initialized so that -+ * scx_cgroup_can_attach() never sees uninitialized tasks. -+ */ -+ scx_cgroup_lock(); -+ ret = scx_cgroup_init(); -+ if (ret) -+ goto err_disable_unlock_all; -+ -+ scx_task_iter_start(&sti); -+ while ((p = scx_task_iter_next_locked(&sti))) { -+ /* -+ * @p may already be dead, have lost all its usages counts and -+ * be waiting for RCU grace period before being freed. @p can't -+ * be initialized for SCX in such cases and should be ignored. 
-+ */ -+ if (!tryget_task_struct(p)) -+ continue; -+ -+ scx_task_iter_unlock(&sti); -+ -+ ret = scx_ops_init_task(p, task_group(p), false); -+ if (ret) { -+ put_task_struct(p); -+ scx_task_iter_relock(&sti); -+ scx_task_iter_stop(&sti); -+ pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", -+ ret, p->comm, p->pid); -+ goto err_disable_unlock_all; -+ } -+ -+ scx_set_task_state(p, SCX_TASK_READY); -+ -+ put_task_struct(p); -+ scx_task_iter_relock(&sti); -+ } -+ scx_task_iter_stop(&sti); -+ scx_cgroup_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+ -+ /* -+ * All tasks are READY. It's safe to turn on scx_enabled() and switch -+ * all eligible tasks. -+ */ -+ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); -+ static_branch_enable(&__scx_ops_enabled); -+ -+ /* -+ * We're fully committed and can't fail. The task READY -> ENABLED -+ * transitions here are synchronized against sched_ext_free() through -+ * scx_tasks_lock. -+ */ -+ percpu_down_write(&scx_fork_rwsem); -+ scx_task_iter_start(&sti); -+ while ((p = scx_task_iter_next_locked(&sti))) { -+ const struct sched_class *old_class = p->sched_class; -+ struct sched_enq_and_set_ctx ctx; -+ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); -+ -+ p->scx.slice = SCX_SLICE_DFL; -+ __setscheduler_prio(p, p->prio); -+ check_class_changing(task_rq(p), p, old_class); -+ -+ sched_enq_and_set_task(&ctx); -+ -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ } -+ scx_task_iter_stop(&sti); -+ percpu_up_write(&scx_fork_rwsem); -+ -+ scx_ops_bypass(false); -+ -+ /* -+ * Returning an error code here would lose the recorded error -+ * information. Exit indicating success so that the error is notified -+ * through ops.exit() with all the details. 
-+ */ -+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { -+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); -+ ret = 0; -+ goto err_disable; -+ } -+ -+ if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) -+ static_branch_enable(&__scx_switched_all); -+ -+ pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", -+ scx_ops.name, scx_switched_all() ? "" : " (partial)"); -+ kobject_uevent(scx_root_kobj, KOBJ_ADD); -+ mutex_unlock(&scx_ops_enable_mutex); -+ -+ atomic_long_inc(&scx_enable_seq); -+ -+ return 0; -+ -+err_del: -+ kobject_del(scx_root_kobj); -+err: -+ kobject_put(scx_root_kobj); -+ scx_root_kobj = NULL; -+ if (scx_exit_info) { -+ free_exit_info(scx_exit_info); -+ scx_exit_info = NULL; -+ } -+err_unlock: -+ mutex_unlock(&scx_ops_enable_mutex); -+ return ret; -+ -+err_disable_unlock_all: -+ scx_cgroup_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+ scx_ops_bypass(false); -+err_disable: -+ mutex_unlock(&scx_ops_enable_mutex); -+ /* must be fully disabled before returning */ -+ scx_ops_disable(SCX_EXIT_ERROR); -+ kthread_flush_work(&scx_ops_disable_work); -+ return ret; -+} -+ -+ -+/******************************************************************************** -+ * bpf_struct_ops plumbing. 
-+ */ -+#include -+#include -+#include -+ -+extern struct btf *btf_vmlinux; -+static const struct btf_type *task_struct_type; -+static u32 task_struct_type_id; -+ -+static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size, -+ enum bpf_access_type type, -+ const struct bpf_prog *prog, -+ struct bpf_insn_access_aux *info) -+{ -+ struct btf *btf = bpf_get_btf_vmlinux(); -+ const struct bpf_struct_ops_desc *st_ops_desc; -+ const struct btf_member *member; -+ const struct btf_type *t; -+ u32 btf_id, member_idx; -+ const char *mname; -+ -+ /* struct_ops op args are all sequential, 64-bit numbers */ -+ if (off != arg_n * sizeof(__u64)) -+ return false; -+ -+ /* btf_id should be the type id of struct sched_ext_ops */ -+ btf_id = prog->aux->attach_btf_id; -+ st_ops_desc = bpf_struct_ops_find(btf, btf_id); -+ if (!st_ops_desc) -+ return false; -+ -+ /* BTF type of struct sched_ext_ops */ -+ t = st_ops_desc->type; -+ -+ member_idx = prog->expected_attach_type; -+ if (member_idx >= btf_type_vlen(t)) -+ return false; -+ -+ /* -+ * Get the member name of this struct_ops program, which corresponds to -+ * a field in struct sched_ext_ops. For example, the member name of the -+ * dispatch struct_ops program (callback) is "dispatch". -+ */ -+ member = &btf_type_member(t)[member_idx]; -+ mname = btf_name_by_offset(btf_vmlinux, member->name_off); -+ -+ if (!strcmp(mname, op)) { -+ /* -+ * The value is a pointer to a type (struct task_struct) given -+ * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), -+ * however, can be a NULL (PTR_MAYBE_NULL). The BPF program -+ * should check the pointer to make sure it is not NULL before -+ * using it, or the verifier will reject the program. -+ * -+ * Longer term, this is something that should be addressed by -+ * BTF, and be fully contained within the verifier. 
-+ */ -+ info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; -+ info->btf = btf_vmlinux; -+ info->btf_id = task_struct_type_id; -+ -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool bpf_scx_is_valid_access(int off, int size, -+ enum bpf_access_type type, -+ const struct bpf_prog *prog, -+ struct bpf_insn_access_aux *info) -+{ -+ if (type != BPF_READ) -+ return false; -+ if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) || -+ set_arg_maybe_null("yield", 1, off, size, type, prog, info)) -+ return true; -+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) -+ return false; -+ if (off % size != 0) -+ return false; -+ -+ return btf_ctx_access(off, size, type, prog, info); -+} -+ -+static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, -+ const struct bpf_reg_state *reg, int off, -+ int size) -+{ -+ const struct btf_type *t; -+ -+ t = btf_type_by_id(reg->btf, reg->btf_id); -+ if (t == task_struct_type) { -+ if (off >= offsetof(struct task_struct, scx.slice) && -+ off + size <= offsetofend(struct task_struct, scx.slice)) -+ return SCALAR_VALUE; -+ if (off >= offsetof(struct task_struct, scx.dsq_vtime) && -+ off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) -+ return SCALAR_VALUE; -+ if (off >= offsetof(struct task_struct, scx.disallow) && -+ off + size <= offsetofend(struct task_struct, scx.disallow)) -+ return SCALAR_VALUE; -+ } -+ -+ return -EACCES; -+} -+ -+static const struct bpf_func_proto * -+bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -+{ -+ switch (func_id) { -+ case BPF_FUNC_task_storage_get: -+ return &bpf_task_storage_get_proto; -+ case BPF_FUNC_task_storage_delete: -+ return &bpf_task_storage_delete_proto; -+ default: -+ return bpf_base_func_proto(func_id, prog); -+ } -+} -+ -+static const struct bpf_verifier_ops bpf_scx_verifier_ops = { -+ .get_func_proto = bpf_scx_get_func_proto, -+ .is_valid_access = bpf_scx_is_valid_access, -+ .btf_struct_access = 
bpf_scx_btf_struct_access, -+}; -+ -+static int bpf_scx_init_member(const struct btf_type *t, -+ const struct btf_member *member, -+ void *kdata, const void *udata) -+{ -+ const struct sched_ext_ops *uops = udata; -+ struct sched_ext_ops *ops = kdata; -+ u32 moff = __btf_member_bit_offset(t, member) / 8; -+ int ret; -+ -+ switch (moff) { -+ case offsetof(struct sched_ext_ops, dispatch_max_batch): -+ if (*(u32 *)(udata + moff) > INT_MAX) -+ return -E2BIG; -+ ops->dispatch_max_batch = *(u32 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, flags): -+ if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) -+ return -EINVAL; -+ ops->flags = *(u64 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, name): -+ ret = bpf_obj_name_cpy(ops->name, uops->name, -+ sizeof(ops->name)); -+ if (ret < 0) -+ return ret; -+ if (ret == 0) -+ return -EINVAL; -+ return 1; -+ case offsetof(struct sched_ext_ops, timeout_ms): -+ if (msecs_to_jiffies(*(u32 *)(udata + moff)) > -+ SCX_WATCHDOG_MAX_TIMEOUT) -+ return -E2BIG; -+ ops->timeout_ms = *(u32 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, exit_dump_len): -+ ops->exit_dump_len = -+ *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; -+ return 1; -+ case offsetof(struct sched_ext_ops, hotplug_seq): -+ ops->hotplug_seq = *(u64 *)(udata + moff); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+static int bpf_scx_check_member(const struct btf_type *t, -+ const struct btf_member *member, -+ const struct bpf_prog *prog) -+{ -+ u32 moff = __btf_member_bit_offset(t, member) / 8; -+ -+ switch (moff) { -+ case offsetof(struct sched_ext_ops, init_task): -+#ifdef CONFIG_EXT_GROUP_SCHED -+ case offsetof(struct sched_ext_ops, cgroup_init): -+ case offsetof(struct sched_ext_ops, cgroup_exit): -+ case offsetof(struct sched_ext_ops, cgroup_prep_move): -+#endif -+ case offsetof(struct sched_ext_ops, cpu_online): -+ case offsetof(struct sched_ext_ops, cpu_offline): -+ case offsetof(struct sched_ext_ops, init): 
-+ case offsetof(struct sched_ext_ops, exit): -+ break; -+ default: -+ if (prog->sleepable) -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+static int bpf_scx_reg(void *kdata, struct bpf_link *link) -+{ -+ return scx_ops_enable(kdata, link); -+} -+ -+static void bpf_scx_unreg(void *kdata, struct bpf_link *link) -+{ -+ scx_ops_disable(SCX_EXIT_UNREG); -+ kthread_flush_work(&scx_ops_disable_work); -+} -+ -+static int bpf_scx_init(struct btf *btf) -+{ -+ s32 type_id; -+ -+ type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); -+ if (type_id < 0) -+ return -EINVAL; -+ task_struct_type = btf_type_by_id(btf, type_id); -+ task_struct_type_id = type_id; -+ -+ return 0; -+} -+ -+static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) -+{ -+ /* -+ * sched_ext does not support updating the actively-loaded BPF -+ * scheduler, as registering a BPF scheduler can always fail if the -+ * scheduler returns an error code for e.g. ops.init(), ops.init_task(), -+ * etc. Similarly, we can always race with unregistration happening -+ * elsewhere, such as with sysrq. 
-+ */ -+ return -EOPNOTSUPP; -+} -+ -+static int bpf_scx_validate(void *kdata) -+{ -+ return 0; -+} -+ -+static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } -+static void enqueue_stub(struct task_struct *p, u64 enq_flags) {} -+static void dequeue_stub(struct task_struct *p, u64 enq_flags) {} -+static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {} -+static void tick_stub(struct task_struct *p) {} -+static void runnable_stub(struct task_struct *p, u64 enq_flags) {} -+static void running_stub(struct task_struct *p) {} -+static void stopping_stub(struct task_struct *p, bool runnable) {} -+static void quiescent_stub(struct task_struct *p, u64 deq_flags) {} -+static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; } -+static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; } -+static void set_weight_stub(struct task_struct *p, u32 weight) {} -+static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} -+static void update_idle_stub(s32 cpu, bool idle) {} -+static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {} -+static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {} -+static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } -+static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} -+static void enable_stub(struct task_struct *p) {} -+static void disable_stub(struct task_struct *p) {} -+#ifdef CONFIG_EXT_GROUP_SCHED -+static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } -+static void cgroup_exit_stub(struct cgroup *cgrp) {} -+static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } -+static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -+static void 
cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -+static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {} -+#endif -+static void cpu_online_stub(s32 cpu) {} -+static void cpu_offline_stub(s32 cpu) {} -+static s32 init_stub(void) { return -EINVAL; } -+static void exit_stub(struct scx_exit_info *info) {} -+static void dump_stub(struct scx_dump_ctx *ctx) {} -+static void dump_cpu_stub(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} -+static void dump_task_stub(struct scx_dump_ctx *ctx, struct task_struct *p) {} -+ -+static struct sched_ext_ops __bpf_ops_sched_ext_ops = { -+ .select_cpu = select_cpu_stub, -+ .enqueue = enqueue_stub, -+ .dequeue = dequeue_stub, -+ .dispatch = dispatch_stub, -+ .tick = tick_stub, -+ .runnable = runnable_stub, -+ .running = running_stub, -+ .stopping = stopping_stub, -+ .quiescent = quiescent_stub, -+ .yield = yield_stub, -+ .core_sched_before = core_sched_before_stub, -+ .set_weight = set_weight_stub, -+ .set_cpumask = set_cpumask_stub, -+ .update_idle = update_idle_stub, -+ .cpu_acquire = cpu_acquire_stub, -+ .cpu_release = cpu_release_stub, -+ .init_task = init_task_stub, -+ .exit_task = exit_task_stub, -+ .enable = enable_stub, -+ .disable = disable_stub, -+#ifdef CONFIG_EXT_GROUP_SCHED -+ .cgroup_init = cgroup_init_stub, -+ .cgroup_exit = cgroup_exit_stub, -+ .cgroup_prep_move = cgroup_prep_move_stub, -+ .cgroup_move = cgroup_move_stub, -+ .cgroup_cancel_move = cgroup_cancel_move_stub, -+ .cgroup_set_weight = cgroup_set_weight_stub, -+#endif -+ .cpu_online = cpu_online_stub, -+ .cpu_offline = cpu_offline_stub, -+ .init = init_stub, -+ .exit = exit_stub, -+ .dump = dump_stub, -+ .dump_cpu = dump_cpu_stub, -+ .dump_task = dump_task_stub, -+}; -+ -+static struct bpf_struct_ops bpf_sched_ext_ops = { -+ .verifier_ops = &bpf_scx_verifier_ops, -+ .reg = bpf_scx_reg, -+ .unreg = bpf_scx_unreg, -+ .check_member = bpf_scx_check_member, -+ .init_member = bpf_scx_init_member, -+ .init = 
bpf_scx_init, -+ .update = bpf_scx_update, -+ .validate = bpf_scx_validate, -+ .name = "sched_ext_ops", -+ .owner = THIS_MODULE, -+ .cfi_stubs = &__bpf_ops_sched_ext_ops -+}; -+ -+ -+/******************************************************************************** -+ * System integration and init. -+ */ -+ -+static void sysrq_handle_sched_ext_reset(u8 key) -+{ -+ if (scx_ops_helper) -+ scx_ops_disable(SCX_EXIT_SYSRQ); -+ else -+ pr_info("sched_ext: BPF scheduler not yet used\n"); -+} -+ -+static const struct sysrq_key_op sysrq_sched_ext_reset_op = { -+ .handler = sysrq_handle_sched_ext_reset, -+ .help_msg = "reset-sched-ext(S)", -+ .action_msg = "Disable sched_ext and revert all tasks to CFS", -+ .enable_mask = SYSRQ_ENABLE_RTNICE, -+}; -+ -+static void sysrq_handle_sched_ext_dump(u8 key) -+{ -+ struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; -+ -+ if (scx_enabled()) -+ scx_dump_state(&ei, 0); -+} -+ -+static const struct sysrq_key_op sysrq_sched_ext_dump_op = { -+ .handler = sysrq_handle_sched_ext_dump, -+ .help_msg = "dump-sched-ext(D)", -+ .action_msg = "Trigger sched_ext debug dump", -+ .enable_mask = SYSRQ_ENABLE_RTNICE, -+}; -+ -+static bool can_skip_idle_kick(struct rq *rq) -+{ -+ lockdep_assert_rq_held(rq); -+ -+ /* -+ * We can skip idle kicking if @rq is going to go through at least one -+ * full SCX scheduling cycle before going idle. Just checking whether -+ * curr is not idle is insufficient because we could be racing -+ * balance_one() trying to pull the next task from a remote rq, which -+ * may fail, and @rq may become idle afterwards. -+ * -+ * The race window is small and we don't and can't guarantee that @rq is -+ * only kicked while idle anyway. Skip only when sure. 
-+ */ -+ return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); -+} -+ -+static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct scx_rq *this_scx = &this_rq->scx; -+ bool should_wait = false; -+ unsigned long flags; -+ -+ raw_spin_rq_lock_irqsave(rq, flags); -+ -+ /* -+ * During CPU hotplug, a CPU may depend on kicking itself to make -+ * forward progress. Allow kicking self regardless of online state. -+ */ -+ if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { -+ if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { -+ if (rq->curr->sched_class == &ext_sched_class) -+ rq->curr->scx.slice = 0; -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); -+ } -+ -+ if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { -+ pseqs[cpu] = rq->scx.pnt_seq; -+ should_wait = true; -+ } -+ -+ resched_curr(rq); -+ } else { -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); -+ } -+ -+ raw_spin_rq_unlock_irqrestore(rq, flags); -+ -+ return should_wait; -+} -+ -+static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_rq_lock_irqsave(rq, flags); -+ -+ if (!can_skip_idle_kick(rq) && -+ (cpu_online(cpu) || cpu == cpu_of(this_rq))) -+ resched_curr(rq); -+ -+ raw_spin_rq_unlock_irqrestore(rq, flags); -+} -+ -+static void kick_cpus_irq_workfn(struct irq_work *irq_work) -+{ -+ struct rq *this_rq = this_rq(); -+ struct scx_rq *this_scx = &this_rq->scx; -+ unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); -+ bool should_wait = false; -+ s32 cpu; -+ -+ for_each_cpu(cpu, this_scx->cpus_to_kick) { -+ should_wait |= kick_one_cpu(cpu, this_rq, pseqs); -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); -+ } -+ -+ for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { -+ kick_one_cpu_if_idle(cpu, this_rq); -+ 
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); -+ } -+ -+ if (!should_wait) -+ return; -+ -+ for_each_cpu(cpu, this_scx->cpus_to_wait) { -+ unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; -+ -+ if (cpu != cpu_of(this_rq)) { -+ /* -+ * Pairs with smp_store_release() issued by this CPU in -+ * scx_next_task_picked() on the resched path. -+ * -+ * We busy-wait here to guarantee that no other task can -+ * be scheduled on our core before the target CPU has -+ * entered the resched path. -+ */ -+ while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) -+ cpu_relax(); -+ } -+ -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); -+ } -+} -+ -+/** -+ * print_scx_info - print out sched_ext scheduler state -+ * @log_lvl: the log level to use when printing -+ * @p: target task -+ * -+ * If a sched_ext scheduler is enabled, print the name and state of the -+ * scheduler. If @p is on sched_ext, print further information about the task. -+ * -+ * This function can be safely called on any task as long as the task_struct -+ * itself is accessible. While safe, this function isn't synchronized and may -+ * print out mixups or garbages of limited length. -+ */ -+void print_scx_info(const char *log_lvl, struct task_struct *p) -+{ -+ enum scx_ops_enable_state state = scx_ops_enable_state(); -+ const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; -+ char runnable_at_buf[22] = "?"; -+ struct sched_class *class; -+ unsigned long runnable_at; -+ -+ if (state == SCX_OPS_DISABLED) -+ return; -+ -+ /* -+ * Carefully check if the task was running on sched_ext, and then -+ * carefully copy the time it's been runnable, and its state. 
-+ */ -+ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || -+ class != &ext_sched_class) { -+ printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, -+ scx_ops_enable_state_str[state], all); -+ return; -+ } -+ -+ if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, -+ sizeof(runnable_at))) -+ scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", -+ jiffies_delta_msecs(runnable_at, jiffies)); -+ -+ /* print everything onto one line to conserve console space */ -+ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", -+ log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, -+ runnable_at_buf); -+} -+ -+static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) -+{ -+ /* -+ * SCX schedulers often have userspace components which are sometimes -+ * involved in critical scheduling paths. PM operations involve freezing -+ * userspace which can lead to scheduling misbehaviors including stalls. -+ * Let's bypass while PM operations are in progress. -+ */ -+ switch (event) { -+ case PM_HIBERNATION_PREPARE: -+ case PM_SUSPEND_PREPARE: -+ case PM_RESTORE_PREPARE: -+ scx_ops_bypass(true); -+ break; -+ case PM_POST_HIBERNATION: -+ case PM_POST_SUSPEND: -+ case PM_POST_RESTORE: -+ scx_ops_bypass(false); -+ break; -+ } -+ -+ return NOTIFY_OK; -+} -+ -+static struct notifier_block scx_pm_notifier = { -+ .notifier_call = scx_pm_handler, -+}; -+ -+void __init init_sched_ext_class(void) -+{ -+ s32 cpu, v; -+ -+ /* -+ * The following is to prevent the compiler from optimizing out the enum -+ * definitions so that BPF scheduler implementations can use them -+ * through the generated vmlinux.h. 
-+ */ -+ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | -+ SCX_TG_ONLINE); -+ -+ BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -+#ifdef CONFIG_SMP -+ BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); -+ BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); -+#endif -+ scx_kick_cpus_pnt_seqs = -+ __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, -+ __alignof__(scx_kick_cpus_pnt_seqs[0])); -+ BUG_ON(!scx_kick_cpus_pnt_seqs); -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); -+ INIT_LIST_HEAD(&rq->scx.runnable_list); -+ INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); -+ -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); -+ init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); -+ init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); -+ -+ if (cpu_online(cpu)) -+ cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; -+ } -+ -+ register_sysrq_key('S', &sysrq_sched_ext_reset_op); -+ register_sysrq_key('D', &sysrq_sched_ext_dump_op); -+ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); -+} -+ -+ -+/******************************************************************************** -+ * Helpers that can be called from the BPF scheduler. -+ */ -+#include -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() -+ * @p: task_struct to select a CPU for -+ * @prev_cpu: CPU @p was on previously -+ * @wake_flags: %SCX_WAKE_* flags -+ * @is_idle: out parameter indicating whether the returned CPU is idle -+ * -+ * Can only be called from ops.select_cpu() if the built-in CPU selection is -+ * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. 
-+ * @p, @prev_cpu and @wake_flags match ops.select_cpu(). -+ * -+ * Returns the picked CPU with *@is_idle indicating whether the picked CPU is -+ * currently idle and thus a good candidate for direct dispatching. -+ */ -+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags, bool *is_idle) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ goto prev_cpu; -+ } -+ -+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) -+ goto prev_cpu; -+ -+#ifdef CONFIG_SMP -+ return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -+#endif -+ -+prev_cpu: -+ *is_idle = false; -+ return prev_cpu; -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) -+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -+BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_select_cpu, -+}; -+ -+static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) -+{ -+ if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) -+ return false; -+ -+ lockdep_assert_irqs_disabled(); -+ -+ if (unlikely(!p)) { -+ scx_ops_error("called with NULL task"); -+ return false; -+ } -+ -+ if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { -+ scx_ops_error("invalid enq_flags 0x%llx", enq_flags); -+ return false; -+ } -+ -+ return true; -+} -+ -+static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ struct task_struct *ddsp_task; -+ -+ ddsp_task = __this_cpu_read(direct_dispatch_task); -+ if (ddsp_task) { -+ mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); -+ return; -+ } -+ -+ if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { -+ scx_ops_error("dispatch buffer overflow"); -+ return; -+ } -+ -+ dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ -+ .task = p, -+ .qseq = 
atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, -+ .dsq_id = dsq_id, -+ .enq_flags = enq_flags, -+ }; -+} -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ -+ * @p: task_struct to dispatch -+ * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs, 0 to keep the current value -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe -+ * to call this function spuriously. Can be called from ops.enqueue(), -+ * ops.select_cpu(), and ops.dispatch(). -+ * -+ * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch -+ * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be -+ * used to target the local DSQ of a CPU other than the enqueueing one. Use -+ * ops.select_cpu() to be on the target CPU in the first place. -+ * -+ * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p -+ * will be directly dispatched to the corresponding dispatch queue after -+ * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be -+ * dispatched to the local DSQ of the CPU returned by ops.select_cpu(). -+ * @enq_flags are OR'd with the enqueue flags on the enqueue path before the -+ * task is dispatched. -+ * -+ * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id -+ * and this function can be called up to ops.dispatch_max_batch times to dispatch -+ * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the -+ * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. -+ * -+ * This function doesn't have any locking restrictions and may be called under -+ * BPF locks (in the future when BPF introduces more flexible locking). -+ * -+ * @p is allowed to run for @slice. The scheduling path is triggered on slice -+ * exhaustion. If zero, the current residual slice is maintained. 
If -+ * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with -+ * scx_bpf_kick_cpu() to trigger scheduling. -+ */ -+__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, -+ u64 enq_flags) -+{ -+ if (!scx_dispatch_preamble(p, enq_flags)) -+ return; -+ -+ if (slice) -+ p->scx.slice = slice; -+ else -+ p->scx.slice = p->scx.slice ?: 1; -+ -+ scx_dispatch_commit(p, dsq_id, enq_flags); -+} -+ -+/** -+ * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ -+ * @p: task_struct to dispatch -+ * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs, 0 to keep the current value -+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. -+ * Tasks queued into the priority queue are ordered by @vtime and always -+ * consumed after the tasks in the FIFO queue. All other aspects are identical -+ * to scx_bpf_dispatch(). -+ * -+ * @vtime ordering is according to time_before64() which considers wrapping. A -+ * numerically larger vtime may indicate an earlier position in the ordering and -+ * vice-versa. 
-+ */ -+__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, -+ u64 slice, u64 vtime, u64 enq_flags) -+{ -+ if (!scx_dispatch_preamble(p, enq_flags)) -+ return; -+ -+ if (slice) -+ p->scx.slice = slice; -+ else -+ p->scx.slice = p->scx.slice ?: 1; -+ -+ p->scx.dsq_vtime = vtime; -+ -+ scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) -+BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) -+BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_enqueue_dispatch, -+}; -+ -+static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, -+ struct task_struct *p, u64 dsq_id, -+ u64 enq_flags) -+{ -+ struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; -+ struct rq *this_rq, *src_rq, *dst_rq, *locked_rq; -+ bool dispatched = false; -+ bool in_balance; -+ unsigned long flags; -+ -+ if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) -+ return false; -+ -+ /* -+ * Can be called from either ops.dispatch() locking this_rq() or any -+ * context where no rq lock is held. If latter, lock @p's task_rq which -+ * we'll likely need anyway. -+ */ -+ src_rq = task_rq(p); -+ -+ local_irq_save(flags); -+ this_rq = this_rq(); -+ in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; -+ -+ if (in_balance) { -+ if (this_rq != src_rq) { -+ raw_spin_rq_unlock(this_rq); -+ raw_spin_rq_lock(src_rq); -+ } -+ } else { -+ raw_spin_rq_lock(src_rq); -+ } -+ -+ locked_rq = src_rq; -+ raw_spin_lock(&src_dsq->lock); -+ -+ /* -+ * Did someone else get to it? @p could have already left $src_dsq, got -+ * re-enqueued, or be in the process of being consumed by someone else. 
-+ */ -+ if (unlikely(p->scx.dsq != src_dsq || -+ u32_before(kit->cursor.priv, p->scx.dsq_seq) || -+ p->scx.holding_cpu >= 0) || -+ WARN_ON_ONCE(src_rq != task_rq(p))) { -+ raw_spin_unlock(&src_dsq->lock); -+ goto out; -+ } -+ -+ /* @p is still on $src_dsq and stable, determine the destination */ -+ dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); -+ -+ if (dst_dsq->id == SCX_DSQ_LOCAL) { -+ dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); -+ if (!task_can_run_on_remote_rq(p, dst_rq, true)) { -+ dst_dsq = find_global_dsq(p); -+ dst_rq = src_rq; -+ } -+ } else { -+ /* no need to migrate if destination is a non-local DSQ */ -+ dst_rq = src_rq; -+ } -+ -+ /* -+ * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different -+ * CPU, @p will be migrated. -+ */ -+ if (dst_dsq->id == SCX_DSQ_LOCAL) { -+ /* @p is going from a non-local DSQ to a local DSQ */ -+ if (src_rq == dst_rq) { -+ task_unlink_from_dsq(p, src_dsq); -+ move_local_task_to_local_dsq(p, enq_flags, -+ src_dsq, dst_rq); -+ raw_spin_unlock(&src_dsq->lock); -+ } else { -+ raw_spin_unlock(&src_dsq->lock); -+ move_remote_task_to_local_dsq(p, enq_flags, -+ src_rq, dst_rq); -+ locked_rq = dst_rq; -+ } -+ } else { -+ /* -+ * @p is going from a non-local DSQ to a non-local DSQ. As -+ * $src_dsq is already locked, do an abbreviated dequeue. 
-+ */ -+ task_unlink_from_dsq(p, src_dsq); -+ p->scx.dsq = NULL; -+ raw_spin_unlock(&src_dsq->lock); -+ -+ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) -+ p->scx.dsq_vtime = kit->vtime; -+ dispatch_enqueue(dst_dsq, p, enq_flags); -+ } -+ -+ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) -+ p->scx.slice = kit->slice; -+ -+ dispatched = true; -+out: -+ if (in_balance) { -+ if (this_rq != locked_rq) { -+ raw_spin_rq_unlock(locked_rq); -+ raw_spin_rq_lock(this_rq); -+ } -+ } else { -+ raw_spin_rq_unlock_irqrestore(locked_rq, flags); -+ } -+ -+ kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | -+ __SCX_DSQ_ITER_HAS_VTIME); -+ return dispatched; -+} -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots -+ * -+ * Can only be called from ops.dispatch(). -+ */ -+__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) -+{ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return 0; -+ -+ return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); -+} -+ -+/** -+ * scx_bpf_dispatch_cancel - Cancel the latest dispatch -+ * -+ * Cancel the latest dispatch. Can be called multiple times to cancel further -+ * dispatches. Can only be called from ops.dispatch(). -+ */ -+__bpf_kfunc void scx_bpf_dispatch_cancel(void) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return; -+ -+ if (dspc->cursor > 0) -+ dspc->cursor--; -+ else -+ scx_ops_error("dispatch buffer underflow"); -+} -+ -+/** -+ * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ -+ * @dsq_id: DSQ to consume -+ * -+ * Consume a task from the non-local DSQ identified by @dsq_id and transfer it -+ * to the current CPU's local DSQ for execution. Can only be called from -+ * ops.dispatch(). -+ * -+ * This function flushes the in-flight dispatches from scx_bpf_dispatch() before -+ * trying to consume the specified DSQ. 
It may also grab rq locks and thus can't -+ * be called under any BPF locks. -+ * -+ * Returns %true if a task has been consumed, %false if there isn't any task to -+ * consume. -+ */ -+__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ struct scx_dispatch_q *dsq; -+ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return false; -+ -+ flush_dispatch_buf(dspc->rq); -+ -+ dsq = find_user_dsq(dsq_id); -+ if (unlikely(!dsq)) { -+ scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); -+ return false; -+ } -+ -+ if (consume_dispatch_q(dspc->rq, dsq)) { -+ /* -+ * A successfully consumed task can be dequeued before it starts -+ * running while the CPU is trying to migrate other dispatched -+ * tasks. Bump nr_tasks to tell balance_scx() to retry on empty -+ * local DSQ. -+ */ -+ dspc->nr_tasks++; -+ return true; -+ } else { -+ return false; -+ } -+} -+ -+/** -+ * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ -+ * @it__iter: DSQ iterator in progress -+ * @slice: duration the dispatched task can run for in nsecs -+ * -+ * Override the slice of the next task that will be dispatched from @it__iter -+ * using scx_bpf_dispatch[_vtime]_from_dsq(). If this function is not called, -+ * the previous slice duration is kept. -+ */ -+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( -+ struct bpf_iter_scx_dsq *it__iter, u64 slice) -+{ -+ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; -+ -+ kit->slice = slice; -+ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; -+} -+ -+/** -+ * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ -+ * @it__iter: DSQ iterator in progress -+ * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ -+ * -+ * Override the vtime of the next task that will be dispatched from @it__iter -+ * using scx_bpf_dispatch_vtime_from_dsq(). If this function is not called, the -+ * previous vtime is kept. 
If scx_bpf_dispatch_from_dsq() is used to -+ * dispatch the next task, the override is ignored and cleared. -+ */ -+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( -+ struct bpf_iter_scx_dsq *it__iter, u64 vtime) -+{ -+ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; -+ -+ kit->vtime = vtime; -+ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; -+} -+ -+/** -+ * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ -+ * @it__iter: DSQ iterator in progress -+ * @p: task to transfer -+ * @dsq_id: DSQ to move @p to -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ -+ * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can -+ * be the destination. -+ * -+ * For the transfer to be successful, @p must still be on the DSQ and have been -+ * queued before the DSQ iteration started. This function doesn't care whether -+ * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have -+ * been queued before the iteration started. -+ * -+ * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to -+ * update. -+ * -+ * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq -+ * lock (e.g. BPF timers or SYSCALL programs). -+ * -+ * Returns %true if @p has been consumed, %false if @p had already been consumed -+ * or dequeued. 
-+ */ -+__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, -+ struct task_struct *p, u64 dsq_id, -+ u64 enq_flags) -+{ -+ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, -+ p, dsq_id, enq_flags); -+} -+ -+/** -+ * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ -+ * @it__iter: DSQ iterator in progress -+ * @p: task to transfer -+ * @dsq_id: DSQ to move @p to -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Transfer @p which is on the DSQ currently iterated by @it__iter to the -+ * priority queue of the DSQ specified by @dsq_id. The destination must be a -+ * user DSQ as only user DSQs support priority queue. -+ * -+ * @p's slice and vtime are kept by default. Use -+ * scx_bpf_dispatch_from_dsq_set_slice() and -+ * scx_bpf_dispatch_from_dsq_set_vtime() to update. -+ * -+ * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See -+ * scx_bpf_dispatch_vtime() for more information on @vtime. -+ */ -+__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, -+ struct task_struct *p, u64 dsq_id, -+ u64 enq_flags) -+{ -+ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, -+ p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_dispatch) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) -+BTF_ID_FLAGS(func, scx_bpf_consume) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) -+BTF_KFUNCS_END(scx_kfunc_ids_dispatch) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_dispatch, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ -+ * -+ * Iterate over 
all of the tasks currently enqueued on the local DSQ of the -+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of -+ * processed tasks. Can only be called from ops.cpu_release(). -+ */ -+__bpf_kfunc u32 scx_bpf_reenqueue_local(void) -+{ -+ LIST_HEAD(tasks); -+ u32 nr_enqueued = 0; -+ struct rq *rq; -+ struct task_struct *p, *n; -+ -+ if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) -+ return 0; -+ -+ rq = cpu_rq(smp_processor_id()); -+ lockdep_assert_rq_held(rq); -+ -+ /* -+ * The BPF scheduler may choose to dispatch tasks back to -+ * @rq->scx.local_dsq. Move all candidate tasks off to a private list -+ * first to avoid processing the same tasks repeatedly. -+ */ -+ list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, -+ scx.dsq_list.node) { -+ /* -+ * If @p is being migrated, @p's current CPU may not agree with -+ * its allowed CPUs and the migration_cpu_stop is about to -+ * deactivate and re-activate @p anyway. Skip re-enqueueing. -+ * -+ * While racing sched property changes may also dequeue and -+ * re-enqueue a migrating task while its current CPU and allowed -+ * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to -+ * the current local DSQ for running tasks and thus are not -+ * visible to the BPF scheduler. 
-+ */ -+ if (p->migration_pending) -+ continue; -+ -+ dispatch_dequeue(rq, p); -+ list_add_tail(&p->scx.dsq_list.node, &tasks); -+ } -+ -+ list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { -+ list_del_init(&p->scx.dsq_list.node); -+ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); -+ nr_enqueued++; -+ } -+ -+ return nr_enqueued; -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) -+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) -+BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_cpu_release, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_create_dsq - Create a custom DSQ -+ * @dsq_id: DSQ to create -+ * @node: NUMA node to allocate from -+ * -+ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable -+ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. -+ */ -+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) -+{ -+ if (unlikely(node >= (int)nr_node_ids || -+ (node < 0 && node != NUMA_NO_NODE))) -+ return -EINVAL; -+ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_unlocked) -+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) -+BTF_KFUNCS_END(scx_kfunc_ids_unlocked) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_unlocked, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU -+ * @cpu: cpu to kick -+ * @flags: %SCX_KICK_* flags -+ * -+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or -+ * trigger rescheduling on a busy CPU. This can be called from any online -+ * scx_ops operation and the actual kicking is performed asynchronously through -+ * an irq work. 
-+ */ -+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) -+{ -+ struct rq *this_rq; -+ unsigned long irq_flags; -+ -+ if (!ops_cpu_valid(cpu, NULL)) -+ return; -+ -+ local_irq_save(irq_flags); -+ -+ this_rq = this_rq(); -+ -+ /* -+ * While bypassing for PM ops, IRQ handling may not be online which can -+ * lead to irq_work_queue() malfunction such as infinite busy wait for -+ * IRQ status update. Suppress kicking. -+ */ -+ if (scx_rq_bypassing(this_rq)) -+ goto out; -+ -+ /* -+ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting -+ * rq locks. We can probably be smarter and avoid bouncing if called -+ * from ops which don't hold a rq lock. -+ */ -+ if (flags & SCX_KICK_IDLE) { -+ struct rq *target_rq = cpu_rq(cpu); -+ -+ if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) -+ scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); -+ -+ if (raw_spin_rq_trylock(target_rq)) { -+ if (can_skip_idle_kick(target_rq)) { -+ raw_spin_rq_unlock(target_rq); -+ goto out; -+ } -+ raw_spin_rq_unlock(target_rq); -+ } -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); -+ } else { -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); -+ -+ if (flags & SCX_KICK_PREEMPT) -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); -+ if (flags & SCX_KICK_WAIT) -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); -+ } -+ -+ irq_work_queue(&this_rq->scx.kick_cpus_irq_work); -+out: -+ local_irq_restore(irq_flags); -+} -+ -+/** -+ * scx_bpf_dsq_nr_queued - Return the number of queued tasks -+ * @dsq_id: id of the DSQ -+ * -+ * Return the number of tasks in the DSQ matching @dsq_id. If not found, -+ * -%ENOENT is returned. 
-+ */ -+__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) -+{ -+ struct scx_dispatch_q *dsq; -+ s32 ret; -+ -+ preempt_disable(); -+ -+ if (dsq_id == SCX_DSQ_LOCAL) { -+ ret = READ_ONCE(this_rq()->scx.local_dsq.nr); -+ goto out; -+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (ops_cpu_valid(cpu, NULL)) { -+ ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); -+ goto out; -+ } -+ } else { -+ dsq = find_user_dsq(dsq_id); -+ if (dsq) { -+ ret = READ_ONCE(dsq->nr); -+ goto out; -+ } -+ } -+ ret = -ENOENT; -+out: -+ preempt_enable(); -+ return ret; -+} -+ -+/** -+ * scx_bpf_destroy_dsq - Destroy a custom DSQ -+ * @dsq_id: DSQ to destroy -+ * -+ * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with -+ * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is -+ * empty and no further tasks are dispatched to it. Ignored if called on a DSQ -+ * which doesn't exist. Can be called from any online scx_ops operations. -+ */ -+__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) -+{ -+ destroy_dsq(dsq_id); -+} -+ -+/** -+ * bpf_iter_scx_dsq_new - Create a DSQ iterator -+ * @it: iterator to initialize -+ * @dsq_id: DSQ to iterate -+ * @flags: %SCX_DSQ_ITER_* -+ * -+ * Initialize BPF iterator @it which can be used with bpf_for_each() to walk -+ * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes -+ * tasks which are already queued when this function is invoked. 
-+ */
-+__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
-+				     u64 flags)
-+{
-+	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
-+
-+	BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
-+		     sizeof(struct bpf_iter_scx_dsq));
-+	BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
-+		     __alignof__(struct bpf_iter_scx_dsq));
-+
-+	/*
-+	 * @it is caller-provided storage that nothing has initialized yet, and
-+	 * bpf_iter_scx_dsq_next() / bpf_iter_scx_dsq_destroy() key off
-+	 * @kit->dsq. bpf_for_each() still runs next() and destroy() on @it when
-+	 * this function fails, so always clear @kit->dsq before any early
-+	 * return; otherwise they would dereference a garbage pointer.
-+	 */
-+	kit->dsq = NULL;
-+
-+	if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
-+		return -EINVAL;
-+
-+	kit->dsq = find_user_dsq(dsq_id);
-+	if (!kit->dsq)
-+		return -ENOENT;
-+
-+	INIT_LIST_HEAD(&kit->cursor.node);
-+	/*
-+	 * Assign, don't OR: cursor.flags still holds whatever was in @it, so
-+	 * "|=" could leave stale bits (e.g. %SCX_DSQ_ITER_REV) set.
-+	 */
-+	kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags;
-+	/* snapshot the DSQ sequence; next() skips tasks queued after this */
-+	kit->cursor.priv = READ_ONCE(kit->dsq->seq);
-+
-+	return 0;
-+}
-+
-+/**
-+ * bpf_iter_scx_dsq_next - Progress a DSQ iterator
-+ * @it: iterator to progress
-+ *
-+ * Return the next task, or %NULL when the iteration is over or @it was never
-+ * successfully initialized. See bpf_iter_scx_dsq_new().
-+ */
-+__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
-+{
-+	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
-+	struct task_struct *p;
-+	unsigned long flags;
-+	bool rev;
-+
-+	if (!kit->dsq)
-+		return NULL;
-+
-+	/* only look at the cursor once we know new() initialized it */
-+	rev = kit->cursor.flags & SCX_DSQ_ITER_REV;
-+
-+	raw_spin_lock_irqsave(&kit->dsq->lock, flags);
-+
-+	/*
-+	 * An unlinked cursor means the walk hasn't started (or has finished);
-+	 * otherwise resume from the cursor's position in the DSQ list.
-+	 */
-+	if (list_empty(&kit->cursor.node))
-+		p = NULL;
-+	else
-+		p = container_of(&kit->cursor, struct task_struct, scx.dsq_list);
-+
-+	/*
-+	 * Only tasks which were queued before the iteration started are
-+	 * visible. This bounds BPF iterations and guarantees that vtime never
-+	 * jumps in the other direction while iterating.
-+	 */
-+	do {
-+		p = nldsq_next_task(kit->dsq, p, rev);
-+	} while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq)));
-+
-+	/* park the cursor next to @p, or unlink it when the walk is done */
-+	if (p) {
-+		if (rev)
-+			list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node);
-+		else
-+			list_move(&kit->cursor.node, &p->scx.dsq_list.node);
-+	} else {
-+		list_del_init(&kit->cursor.node);
-+	}
-+
-+	raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
-+
-+	return p;
-+}
-+
-+/**
-+ * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
-+ * @it: iterator to destroy
-+ *
-+ * Undo bpf_iter_scx_dsq_new(). A no-op if @it has no DSQ attached.
-+ */
-+__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
-+{
-+	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
-+
-+	if (!kit->dsq)
-+		return;
-+
-+	/* the cursor is only linked into the DSQ while a walk is in progress */
-+	if (!list_empty(&kit->cursor.node)) {
-+		unsigned long flags;
-+
-+		raw_spin_lock_irqsave(&kit->dsq->lock, flags);
-+		list_del_init(&kit->cursor.node);
-+		raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
-+	}
-+	kit->dsq = NULL;
-+}
-+
-+__bpf_kfunc_end_defs();
-+
-+/*
-+ * Format @fmt with the packed arguments in @data (@data__sz bytes, 8 bytes per
-+ * argument) into @line_buf of @line_size bytes, using @data_buf as scratch
-+ * space for a safe copy of @data.
-+ *
-+ * Returns the value returned by bstr_printf() on success. On failure, a
-+ * negative errno is returned and the problem is reported through
-+ * scx_ops_error().
-+ */
-+static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
-+			 char *fmt, unsigned long long *data, u32 data__sz)
-+{
-+	struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
-+	s32 ret;
-+
-+	/* @data must be whole u64 slots, bounded, and non-NULL if non-empty */
-+	if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
-+	    (data__sz && !data)) {
-+		scx_ops_error("invalid data=%p and data__sz=%u",
-+			      (void *)data, data__sz);
-+		return -EINVAL;
-+	}
-+
-+	ret = copy_from_kernel_nofault(data_buf, data, data__sz);
-+	if (ret < 0) {
-+		scx_ops_error("failed to read data fields (%d)", ret);
-+		return ret;
-+	}
-+
-+	ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
-+				  &bprintf_data);
-+	if (ret < 0) {
-+		scx_ops_error("format preparation failed (%d)", ret);
-+		return ret;
-+	}
-+
-+	ret = bstr_printf(line_buf, line_size, fmt,
-+			  bprintf_data.bin_args);
-+	/* release what bpf_bprintf_prepare() set up before checking @ret */
-+	bpf_bprintf_cleanup(&bprintf_data);
-+	if (ret < 0) {
-+		scx_ops_error("(\"%s\", %p, %u) failed to format",
-+			      fmt, data, data__sz);
-+		return ret;
-+	}
-+
-+	return ret;
-+}
-+
-+static s32 bstr_format(struct scx_bstr_buf *buf,
-+		       char *fmt, unsigned long long *data, u32 data__sz)
-+{
-+	return __bstr_format(buf->data, buf->line, sizeof(buf->line),
-+			     fmt, data, data__sz);	/* @buf supplies both scratch (data) and output (line) */
-+}
-+
-+__bpf_kfunc_start_defs();
-+
-+/**
-+ * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
-+ * @exit_code: Exit value to pass to user space via struct scx_exit_info.
-+ * @fmt: error message format string
-+ * @data: format string parameters packaged using ___bpf_fill() macro
-+ * @data__sz: @data len, must end in '__sz' for the verifier
-+ *
-+ * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
-+ * disabling.
-+ *
-+ * The message is formatted into the shared scx_exit_bstr_buf while holding
-+ * scx_exit_bstr_buf_lock. If bstr_format() returns a negative value,
-+ * scx_ops_exit_kind() is not called from here.
-+ */
-+__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
-+				   unsigned long long *data, u32 data__sz)
-+{
-+	unsigned long flags;
-+
-+	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
-+	if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
-+		scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
-+				  scx_exit_bstr_buf.line);
-+	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
-+}
-+
-+/**
-+ * scx_bpf_error_bstr - Indicate fatal error
-+ * @fmt: error message format string
-+ * @data: format string parameters packaged using ___bpf_fill() macro
-+ * @data__sz: @data len, must end in '__sz' for the verifier
-+ *
-+ * Indicate that the BPF scheduler encountered a fatal error and initiate ops
-+ * disabling.
-+ */
-+__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
-+				    u32 data__sz)
-+{
-+	unsigned long flags;
-+
-+	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
-+	if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
-+		scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
-+				  scx_exit_bstr_buf.line);	/* exit code is always 0 on this path */
-+	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
-+}
-+
-+/**
-+ * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
-+ * @fmt: format string
-+ * @data: format string parameters packaged using ___bpf_fill() macro
-+ * @data__sz: @data len, must end in '__sz' for the verifier
-+ *
-+ * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
-+ * dump_task() to generate extra debug dump specific to the BPF scheduler.
-+ *
-+ * The extra dump may be multiple lines. A single line may be split over
-+ * multiple calls. The last line is automatically terminated.
-+ *
-+ * Must run on the CPU recorded in scx_dump_data.cpu; otherwise
-+ * scx_ops_error() is raised and nothing is dumped.
-+ */
-+__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
-+				   u32 data__sz)
-+{
-+	struct scx_dump_data *dd = &scx_dump_data;
-+	struct scx_bstr_buf *buf = &dd->buf;
-+	s32 ret;
-+
-+	if (raw_smp_processor_id() != dd->cpu) {
-+		scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
-+		return;
-+	}
-+
-+	/* append the formatted string to the line buf */
-+	ret = __bstr_format(buf->data, buf->line + dd->cursor,
-+			    sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
-+	if (ret < 0) {
-+		dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
-+			  dd->prefix, fmt, data, data__sz, ret);
-+		return;
-+	}
-+
-+	dd->cursor += ret;
-+	dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));	/* clamp: cursor never exceeds the line buf size */
-+
-+	if (!dd->cursor)
-+		return;
-+
-+	/*
-+	 * If the line buf overflowed or ends in a newline, flush it into the
-+	 * dump. This is to allow the caller to generate a single line over
-+	 * multiple calls. As ops_dump_flush() can also handle multiple lines in
-+	 * the line buf, the only case which can lead to an unexpected
-+	 * truncation is when the caller keeps generating newlines in the middle
-+	 * instead of the end consecutively. Don't do that.
-+	 */
-+	if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
-+		ops_dump_flush();
-+}
-+
-+/**
-+ * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
-+ * @cpu: CPU of interest
-+ *
-+ * Return the maximum relative capacity of @cpu in relation to the most
-+ * performant CPU in the system. The return value is in the range [1,
-+ * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur().
-+ *
-+ * %SCX_CPUPERF_ONE is returned if ops_cpu_valid() rejects @cpu.
-+ */
-+__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
-+{
-+	if (ops_cpu_valid(cpu, NULL))
-+		return arch_scale_cpu_capacity(cpu);
-+	else
-+		return SCX_CPUPERF_ONE;
-+}
-+
-+/**
-+ * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
-+ * @cpu: CPU of interest
-+ *
-+ * Return the current relative performance of @cpu in relation to its maximum.
-+ * The return value is in the range [1, %SCX_CPUPERF_ONE].
-+ *
-+ * The current performance level of a CPU in relation to the maximum performance
-+ * available in the system can be calculated as follows:
-+ *
-+ *   scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE
-+ *
-+ * The result is in the range [1, %SCX_CPUPERF_ONE].
-+ *
-+ * %SCX_CPUPERF_ONE is returned if ops_cpu_valid() rejects @cpu.
-+ */
-+__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
-+{
-+	if (ops_cpu_valid(cpu, NULL))
-+		return arch_scale_freq_capacity(cpu);
-+	else
-+		return SCX_CPUPERF_ONE;
-+}
-+
-+/**
-+ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
-+ * @cpu: CPU of interest
-+ * @perf: target performance level [0, %SCX_CPUPERF_ONE]
-+ *
-+ * Set the target performance level of @cpu to @perf. @perf is in linear
-+ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
-+ * schedutil cpufreq governor chooses the target frequency.
-+ *
-+ * The actual performance level chosen, CPU grouping, and the overhead and
-+ * latency of the operations are dependent on the hardware and cpufreq driver in
-+ * use. Consult hardware and cpufreq documentation for more information. The
-+ * current performance level can be monitored using scx_bpf_cpuperf_cur().
-+ *
-+ * A @perf above %SCX_CPUPERF_ONE is reported through scx_ops_error() and
-+ * ignored. Nothing is updated when ops_cpu_valid() rejects @cpu.
-+ */
-+__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
-+{
-+	if (unlikely(perf > SCX_CPUPERF_ONE)) {
-+		scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
-+		return;
-+	}
-+
-+	if (ops_cpu_valid(cpu, NULL)) {
-+		struct rq *rq = cpu_rq(cpu);
-+
-+		rq->scx.cpuperf_target = perf;
-+
-+		rcu_read_lock_sched_notrace();
-+		cpufreq_update_util(cpu_rq(cpu), 0);	/* same rq as @rq above */
-+		rcu_read_unlock_sched_notrace();
-+	}
-+}
-+
-+/**
-+ * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
-+ *
-+ * All valid CPU IDs in the system are smaller than the returned value.
-+ */
-+__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
-+{
-+	return nr_cpu_ids;
-+}
-+
-+/**
-+ * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
-+ */
-+__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
-+{
-+	return cpu_possible_mask;
-+}
-+
-+/**
-+ * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
-+ */
-+__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
-+{
-+	return cpu_online_mask;
-+}
-+
-+/**
-+ * scx_bpf_put_cpumask - Release a possible/online cpumask
-+ * @cpumask: cpumask to release
-+ */
-+__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
-+{
-+	/*
-+	 * Empty function body because we aren't actually acquiring or releasing
-+	 * a reference to a global cpumask, which is read-only in the caller and
-+	 * is never released. The acquire / release semantics here are just used
-+	 * to make the cpumask a trusted pointer in the caller.
-+	 */
-+}
-+
-+/**
-+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
-+ * per-CPU cpumask.
-+ *
-+ * Returns an empty cpumask (cpu_none_mask) if idle tracking is not enabled, or
-+ * running on a UP kernel. scx_ops_error() is also raised in the former case.
-+ */
-+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
-+{
-+	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
-+		scx_ops_error("built-in idle tracking is disabled");
-+		return cpu_none_mask;
-+	}
-+
-+#ifdef CONFIG_SMP
-+	return idle_masks.cpu;
-+#else
-+	return cpu_none_mask;
-+#endif
-+}
-+
-+/**
-+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
-+ * per-physical-core cpumask. Can be used to determine if an entire physical
-+ * core is free.
-+ *
-+ * Returns an empty cpumask (cpu_none_mask) if idle tracking is not enabled, or
-+ * running on a UP kernel. scx_ops_error() is also raised in the former case.
-+ * When sched_smt_active() is false, the per-CPU idle mask is returned instead.
-+ */
-+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
-+{
-+	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
-+		scx_ops_error("built-in idle tracking is disabled");
-+		return cpu_none_mask;
-+	}
-+
-+#ifdef CONFIG_SMP
-+	if (sched_smt_active())
-+		return idle_masks.smt;
-+	else
-+		return idle_masks.cpu;
-+#else
-+	return cpu_none_mask;
-+#endif
-+}
-+
-+/**
-+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
-+ * either the percpu, or SMT idle-tracking cpumask.
-+ */
-+__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
-+{
-+	/*
-+	 * Empty function body because we aren't actually acquiring or releasing
-+	 * a reference to a global idle cpumask, which is read-only in the
-+	 * caller and is never released. The acquire / release semantics here
-+	 * are just used to make the cpumask a trusted pointer in the caller.
-+	 */
-+}
-+
-+/**
-+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
-+ * @cpu: cpu to test and clear idle for
-+ *
-+ * Returns %true if @cpu was idle and its idle state was successfully cleared.
-+ * %false otherwise.
-+ *
-+ * Unavailable if ops.update_idle() is implemented and
-+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
-+ */
-+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
-+{
-+	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
-+		scx_ops_error("built-in idle tracking is disabled");
-+		return false;
-+	}
-+
-+	if (ops_cpu_valid(cpu, NULL))
-+		return test_and_clear_cpu_idle(cpu);
-+	else
-+		return false;
-+}
-+
-+/**
-+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
-+ * @cpus_allowed: Allowed cpumask
-+ * @flags: %SCX_PICK_IDLE_CPU_* flags
-+ *
-+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
-+ * number on success. -%EBUSY if no matching cpu was found.
-+ *
-+ * Idle CPU tracking may race against CPU scheduling state transitions. For
-+ * example, this function may return -%EBUSY as CPUs are transitioning into the
-+ * idle state. If the caller then assumes that there will be dispatch events on
-+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
-+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
-+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
-+ * event in the near future.
-+ *
-+ * Unavailable if ops.update_idle() is implemented and
-+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. In that case scx_ops_error() is
-+ * raised and -%EBUSY is returned.
-+ */
-+__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
-+				      u64 flags)
-+{
-+	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
-+		scx_ops_error("built-in idle tracking is disabled");
-+		return -EBUSY;
-+	}
-+
-+	return scx_pick_idle_cpu(cpus_allowed, flags);
-+}
-+
-+/**
-+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
-+ * @cpus_allowed: Allowed cpumask
-+ * @flags: %SCX_PICK_IDLE_CPU_* flags
-+ *
-+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
-+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked cpu
-+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
-+ * empty.
-+ *
-+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
-+ * set, this function can't tell which CPUs are idle and will always pick any
-+ * CPU.
-+ */
-+__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
-+				     u64 flags)
-+{
-+	s32 cpu;
-+
-+	if (static_branch_likely(&scx_builtin_idle_enabled)) {
-+		cpu = scx_pick_idle_cpu(cpus_allowed, flags);
-+		if (cpu >= 0)
-+			return cpu;
-+	}
-+
-+	cpu = cpumask_any_distribute(cpus_allowed);	/* fallback: no idle CPU claimed above */
-+	if (cpu < nr_cpu_ids)
-+		return cpu;
-+	else
-+		return -EBUSY;
-+}
-+
-+/**
-+ * scx_bpf_task_running - Is task currently running?
-+ * @p: task of interest
-+ *
-+ * Returns %true if @p is the ->curr task of its rq.
-+ */
-+__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
-+{
-+	return task_rq(p)->curr == p;
-+}
-+
-+/**
-+ * scx_bpf_task_cpu - CPU a task is currently associated with
-+ * @p: task of interest
-+ */
-+__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
-+{
-+	return task_cpu(p);
-+}
-+
-+/**
-+ * scx_bpf_cpu_rq - Fetch the rq of a CPU
-+ * @cpu: CPU of the rq
-+ *
-+ * Returns %NULL if ops_cpu_valid() rejects @cpu.
-+ */
-+__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
-+{
-+	if (!ops_cpu_valid(cpu, NULL))
-+		return NULL;
-+
-+	return cpu_rq(cpu);
-+}
-+
-+/**
-+ * scx_bpf_task_cgroup - Return the sched cgroup of a task
-+ * @p: task of interest
-+ *
-+ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
-+ * from the scheduler's POV. SCX operations should use this function to
-+ * determine @p's current cgroup as, unlike following @p->cgroups,
-+ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
-+ * rq-locked operations. Can be called on the parameter tasks of rq-locked
-+ * operations. The restriction guarantees that @p's rq is locked by the caller.
-+ */
-+#ifdef CONFIG_CGROUP_SCHED
-+__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
-+{
-+	struct task_group *tg = p->sched_task_group;
-+	struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
-+
-+	if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
-+		goto out;
-+
-+	/*
-+	 * A task_group may either be a cgroup or an autogroup. In the latter
-+	 * case, @tg->css.cgroup is %NULL. A task_group can't become the other
-+	 * kind once created.
-+	 *
-+	 * Whenever the kfunc isn't allowed for @p or no cgroup can be found,
-+	 * the default hierarchy root (&cgrp_dfl_root.cgrp) is returned. A
-+	 * reference is taken with cgroup_get() on whatever is returned.
-+	 */
-+	if (tg && tg->css.cgroup)
-+		cgrp = tg->css.cgroup;
-+	else
-+		cgrp = &cgrp_dfl_root.cgrp;
-+out:
-+	cgroup_get(cgrp);
-+	return cgrp;
-+}
-+#endif
-+
-+__bpf_kfunc_end_defs();
-+
-+BTF_KFUNCS_START(scx_kfunc_ids_any)
-+BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
-+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
-+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
-+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
-+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
-+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
-+BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
-+BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
-+BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
-+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
-+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
-+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
-+BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
-+BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
-+BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
-+BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
-+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
-+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
-+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
-+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
-+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
-+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
-+BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
-+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
-+BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
-+#ifdef CONFIG_CGROUP_SCHED
-+BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
-+#endif
-+BTF_KFUNCS_END(scx_kfunc_ids_any)
-+
-+static const struct btf_kfunc_id_set scx_kfunc_set_any = {
-+	.owner			= THIS_MODULE,
-+	.set			= &scx_kfunc_ids_any,
-+};
-+
-+static int __init scx_init(void)
-+{
-+	int ret;
-+
-+	/*
-+	 * kfunc registration can't be done from init_sched_ext_class() as
-+	 * register_btf_kfunc_id_set() needs most of the system to be up.
-+	 *
-+	 * Some kfuncs are context-sensitive and can only be called from
-+	 * specific SCX ops. They are grouped into BTF sets accordingly.
-+	 * Unfortunately, BPF currently doesn't have a way of enforcing such
-+	 * restrictions. Eventually, the verifier should be able to enforce
-+	 * them. For now, register them the same and make each kfunc explicitly
-+	 * check using scx_kf_allowed().
-+	 *
-+	 * The chained "||" below stops at the first registration that fails
-+	 * and leaves its error code in @ret.
-+	 *
-+	 * NOTE(review): none of the failure returns in this function undo the
-+	 * steps that already succeeded (kfunc sets, struct_ops, PM notifier,
-+	 * kset). Confirm that this is acceptable for this initcall.
-+	 */
-+	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
-+					     &scx_kfunc_set_select_cpu)) ||
-+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
-+					     &scx_kfunc_set_enqueue_dispatch)) ||
-+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
-+					     &scx_kfunc_set_dispatch)) ||
-+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
-+					     &scx_kfunc_set_cpu_release)) ||
-+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
-+					     &scx_kfunc_set_unlocked)) ||
-+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
-+					     &scx_kfunc_set_unlocked)) ||
-+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
-+					     &scx_kfunc_set_any)) ||
-+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
-+					     &scx_kfunc_set_any)) ||
-+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
-+					     &scx_kfunc_set_any))) {
-+		pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret);
-+		return ret;
-+	}
-+
-+	ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
-+	if (ret) {
-+		pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
-+		return ret;
-+	}
-+
-+	ret = register_pm_notifier(&scx_pm_notifier);
-+	if (ret) {
-+		pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret);
-+		return ret;
-+	}
-+
-+	scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj);
-+	if (!scx_kset) {
-+		pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n");
-+		return -ENOMEM;
-+	}
-+
-+	ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);
-+	if (ret < 0) {
-+		pr_err("sched_ext: Failed to add global attributes\n");
-+		return ret;
-+	}
-+
-+	return 0;
-+}
-+__initcall(scx_init);
-diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
-new file mode 100644
-index 000000000000..246019519231
---- /dev/null
-+++ b/kernel/sched/ext.h
-@@ -0,0 +1,91 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
-+ *
-+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
-+ * Copyright (c) 2022 Tejun Heo
-+ * Copyright (c) 2022 David Vernet
-+ */
-+#ifdef CONFIG_SCHED_CLASS_EXT
-+
-+void scx_tick(struct rq *rq);
-+void init_scx_entity(struct sched_ext_entity *scx);
-+void scx_pre_fork(struct task_struct *p);
-+int scx_fork(struct task_struct *p);
-+void scx_post_fork(struct task_struct *p);
-+void scx_cancel_fork(struct task_struct *p);
-+bool scx_can_stop_tick(struct rq *rq);
-+void scx_rq_activate(struct rq *rq);
-+void scx_rq_deactivate(struct rq *rq);
-+int scx_check_setscheduler(struct task_struct *p, int policy);
-+bool task_should_scx(struct task_struct *p);
-+void init_sched_ext_class(void);
-+
-+static inline u32 scx_cpuperf_target(s32 cpu)
-+{
-+	if (scx_enabled())
-+		return cpu_rq(cpu)->scx.cpuperf_target;
-+	else
-+		return 0;
-+}
-+
-+static inline bool task_on_scx(const struct task_struct *p)
-+{
-+	return scx_enabled() && p->sched_class == &ext_sched_class;
-+}
-+
-+#ifdef CONFIG_SCHED_CORE
-+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
-+		   bool in_fi);
-+#endif
-+
-+#else	/* CONFIG_SCHED_CLASS_EXT */
-+ -+static inline void scx_tick(struct rq *rq) {} -+static inline void scx_pre_fork(struct task_struct *p) {} -+static inline int scx_fork(struct task_struct *p) { return 0; } -+static inline void scx_post_fork(struct task_struct *p) {} -+static inline void scx_cancel_fork(struct task_struct *p) {} -+static inline u32 scx_cpuperf_target(s32 cpu) { return 0; } -+static inline bool scx_can_stop_tick(struct rq *rq) { return true; } -+static inline void scx_rq_activate(struct rq *rq) {} -+static inline void scx_rq_deactivate(struct rq *rq) {} -+static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } -+static inline bool task_on_scx(const struct task_struct *p) { return false; } -+static inline void init_sched_ext_class(void) {} -+ -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+ -+#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) -+void __scx_update_idle(struct rq *rq, bool idle); -+ -+static inline void scx_update_idle(struct rq *rq, bool idle) -+{ -+ if (scx_enabled()) -+ __scx_update_idle(rq, idle); -+} -+#else -+static inline void scx_update_idle(struct rq *rq, bool idle) {} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+#ifdef CONFIG_EXT_GROUP_SCHED -+int scx_tg_online(struct task_group *tg); -+void scx_tg_offline(struct task_group *tg); -+int scx_cgroup_can_attach(struct cgroup_taskset *tset); -+void scx_move_task(struct task_struct *p); -+void scx_cgroup_finish_attach(void); -+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); -+void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); -+void scx_group_set_idle(struct task_group *tg, bool idle); -+#else /* CONFIG_EXT_GROUP_SCHED */ -+static inline int scx_tg_online(struct task_group *tg) { return 0; } -+static inline void scx_tg_offline(struct task_group *tg) {} -+static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -+static inline void scx_move_task(struct task_struct *p) {} -+static inline void 
scx_cgroup_finish_attach(void) {} -+static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} -+static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} -+static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+#endif /* CONFIG_CGROUP_SCHED */ -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 91b242e47db7..a36e37a674e8 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -3857,7 +3857,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - } - } - --void reweight_task(struct task_struct *p, const struct load_weight *lw) -+static void reweight_task_fair(struct rq *rq, struct task_struct *p, -+ const struct load_weight *lw) - { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); -@@ -9365,29 +9366,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) { - - static bool __update_blocked_others(struct rq *rq, bool *done) - { -- const struct sched_class *curr_class; -- u64 now = rq_clock_pelt(rq); -- unsigned long hw_pressure; -- bool decayed; -+ bool updated; - - /* - * update_load_avg() can call cpufreq_update_util(). Make sure that RT, - * DL and IRQ signals have been updated before updating CFS. 
- */ -- curr_class = rq->curr->sched_class; -- -- hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); -- -- /* hw_pressure doesn't care about invariance */ -- decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | -- update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | -- update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) | -- update_irq_load_avg(rq, 0); -+ updated = update_other_load_avgs(rq); - - if (others_have_blocked(rq)) - *done = false; - -- return decayed; -+ return updated; - } - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -13233,6 +13223,7 @@ DEFINE_SCHED_CLASS(fair) = { - .task_tick = task_tick_fair, - .task_fork = task_fork_fair, - -+ .reweight_task = reweight_task_fair, - .prio_changed = prio_changed_fair, - .switched_from = switched_from_fair, - .switched_to = switched_to_fair, -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index 6e78d071beb5..c7a218123b7a 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -452,11 +452,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) - - static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) - { -+ scx_update_idle(rq, false); - } - - static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) - { - update_idle_core(rq); -+ scx_update_idle(rq, true); - schedstat_inc(rq->sched_goidle); - } - -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 432b43aa091c..48d893de632b 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -192,9 +192,18 @@ static inline int idle_policy(int policy) - return policy == SCHED_IDLE; - } - -+static inline int normal_policy(int policy) -+{ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ if (policy == SCHED_EXT) -+ return true; -+#endif -+ return policy == SCHED_NORMAL; -+} -+ - static inline int fair_policy(int policy) - { -- return policy == SCHED_NORMAL || policy == SCHED_BATCH; -+ return normal_policy(policy) || policy == SCHED_BATCH; - } - - static 
inline int rt_policy(int policy) -@@ -244,6 +253,24 @@ static inline void update_avg(u64 *avg, u64 sample) - #define shr_bound(val, shift) \ - (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) - -+/* -+ * cgroup weight knobs should use the common MIN, DFL and MAX values which are -+ * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it -+ * maps pretty well onto the shares value used by scheduler and the round-trip -+ * conversions preserve the original value over the entire range. -+ */ -+static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) -+{ -+ return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); -+} -+ -+static inline unsigned long sched_weight_to_cgroup(unsigned long weight) -+{ -+ return clamp_t(unsigned long, -+ DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), -+ CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); -+} -+ - /* - * !! For sched_setattr_nocheck() (kernel) only !! - * -@@ -397,16 +424,17 @@ struct cfs_bandwidth { - struct task_group { - struct cgroup_subsys_state css; - -+#ifdef CONFIG_GROUP_SCHED_WEIGHT -+ /* A positive value indicates that this is a SCHED_IDLE group. */ -+ int idle; -+#endif -+ - #ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each CPU */ - struct sched_entity **se; - /* runqueue "owned" by this group on each CPU */ - struct cfs_rq **cfs_rq; - unsigned long shares; -- -- /* A positive value indicates that this is a SCHED_IDLE group. 
*/ -- int idle; -- - #ifdef CONFIG_SMP - /* - * load_avg can be heavily contended at clock tick time, so put -@@ -424,6 +452,11 @@ struct task_group { - struct rt_bandwidth rt_bandwidth; - #endif - -+#ifdef CONFIG_EXT_GROUP_SCHED -+ u32 scx_flags; /* SCX_TG_* */ -+ u32 scx_weight; -+#endif -+ - struct rcu_head rcu; - struct list_head list; - -@@ -448,7 +481,7 @@ struct task_group { - - }; - --#ifdef CONFIG_FAIR_GROUP_SCHED -+#ifdef CONFIG_GROUP_SCHED_WEIGHT - #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD - - /* -@@ -479,6 +512,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) - return walk_tg_tree_from(&root_task_group, down, up, data); - } - -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? container_of(css, struct task_group, css) : NULL; -+} -+ - extern int tg_nop(struct task_group *tg, void *data); - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -535,6 +573,9 @@ extern void set_task_rq_fair(struct sched_entity *se, - static inline void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next) { } - #endif /* CONFIG_SMP */ -+#else /* !CONFIG_FAIR_GROUP_SCHED */ -+static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } -+static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } - #endif /* CONFIG_FAIR_GROUP_SCHED */ - - #else /* CONFIG_CGROUP_SCHED */ -@@ -588,6 +629,11 @@ do { \ - # define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) - # define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) - -+struct balance_callback { -+ struct balance_callback *next; -+ void (*func)(struct rq *rq); -+}; -+ - /* CFS-related fields in a runqueue */ - struct cfs_rq { - struct load_weight load; -@@ -696,6 +742,43 @@ struct cfs_rq { - #endif /* CONFIG_FAIR_GROUP_SCHED */ - }; - -+#ifdef CONFIG_SCHED_CLASS_EXT -+/* scx_rq->flags, protected by the rq lock */ -+enum scx_rq_flags { -+ /* -+ * A 
hotplugged CPU starts scheduling before rq_online_scx(). Track -+ * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called -+ * only while the BPF scheduler considers the CPU to be online. -+ */ -+ SCX_RQ_ONLINE = 1 << 0, -+ SCX_RQ_CAN_STOP_TICK = 1 << 1, -+ SCX_RQ_BYPASSING = 1 << 3, -+ -+ SCX_RQ_IN_WAKEUP = 1 << 16, -+ SCX_RQ_IN_BALANCE = 1 << 17, -+}; -+ -+struct scx_rq { -+ struct scx_dispatch_q local_dsq; -+ struct list_head runnable_list; /* runnable tasks on this rq */ -+ struct list_head ddsp_deferred_locals; /* deferred ddsps from enq */ -+ unsigned long ops_qseq; -+ u64 extra_enq_flags; /* see move_task_to_local_dsq() */ -+ u32 nr_running; -+ u32 flags; -+ u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ -+ bool cpu_released; -+ cpumask_var_t cpus_to_kick; -+ cpumask_var_t cpus_to_kick_if_idle; -+ cpumask_var_t cpus_to_preempt; -+ cpumask_var_t cpus_to_wait; -+ unsigned long pnt_seq; -+ struct balance_callback deferred_bal_cb; -+ struct irq_work deferred_irq_work; -+ struct irq_work kick_cpus_irq_work; -+}; -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+ - static inline int rt_bandwidth_enabled(void) - { - return sysctl_sched_rt_runtime >= 0; -@@ -996,11 +1079,6 @@ struct uclamp_rq { - DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); - #endif /* CONFIG_UCLAMP_TASK */ - --struct balance_callback { -- struct balance_callback *next; -- void (*func)(struct rq *rq); --}; -- - /* - * This is the main, per-CPU runqueue data structure. 
- * -@@ -1043,6 +1121,9 @@ struct rq { - struct cfs_rq cfs; - struct rt_rq rt; - struct dl_rq dl; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ struct scx_rq scx; -+#endif - - #ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this CPU: */ -@@ -2291,13 +2372,15 @@ struct sched_class { - - void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); - -+ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - struct task_struct *(*pick_next_task)(struct rq *rq); - - void (*put_prev_task)(struct rq *rq, struct task_struct *p); - void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); - -+ void (*switch_class)(struct rq *rq, struct task_struct *next); -+ - #ifdef CONFIG_SMP -- int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); - - struct task_struct * (*pick_task)(struct rq *rq); -@@ -2323,8 +2406,11 @@ struct sched_class { - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. 
- */ -+ void (*switching_to) (struct rq *this_rq, struct task_struct *task); - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); -+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task, -+ const struct load_weight *lw); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); - -@@ -2373,19 +2459,54 @@ const struct sched_class name##_sched_class \ - extern struct sched_class __sched_class_highest[]; - extern struct sched_class __sched_class_lowest[]; - -+extern const struct sched_class stop_sched_class; -+extern const struct sched_class dl_sched_class; -+extern const struct sched_class rt_sched_class; -+extern const struct sched_class fair_sched_class; -+extern const struct sched_class idle_sched_class; -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+extern const struct sched_class ext_sched_class; -+ -+DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */ -+DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */ -+ -+#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) -+#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) -+#else /* !CONFIG_SCHED_CLASS_EXT */ -+#define scx_enabled() false -+#define scx_switched_all() false -+#endif /* !CONFIG_SCHED_CLASS_EXT */ -+ -+/* -+ * Iterate only active classes. SCX can take over all fair tasks or be -+ * completely disabled. If the former, skip fair. If the latter, skip SCX. 
-+ */ -+static inline const struct sched_class *next_active_class(const struct sched_class *class) -+{ -+ class++; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ if (scx_switched_all() && class == &fair_sched_class) -+ class++; -+ if (!scx_enabled() && class == &ext_sched_class) -+ class++; -+#endif -+ return class; -+} -+ - #define for_class_range(class, _from, _to) \ - for (class = (_from); class < (_to); class++) - - #define for_each_class(class) \ - for_class_range(class, __sched_class_highest, __sched_class_lowest) - --#define sched_class_above(_a, _b) ((_a) < (_b)) -+#define for_active_class_range(class, _from, _to) \ -+ for (class = (_from); class != (_to); class = next_active_class(class)) - --extern const struct sched_class stop_sched_class; --extern const struct sched_class dl_sched_class; --extern const struct sched_class rt_sched_class; --extern const struct sched_class fair_sched_class; --extern const struct sched_class idle_sched_class; -+#define for_each_active_class(class) \ -+ for_active_class_range(class, __sched_class_highest, __sched_class_lowest) -+ -+#define sched_class_above(_a, _b) ((_a) < (_b)) - - static inline bool sched_stop_runnable(struct rq *rq) - { -@@ -2424,6 +2545,19 @@ extern void sched_balance_trigger(struct rq *rq); - extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx); - extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); - -+static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) -+{ -+ /* When not in the task's cpumask, no point in looking further. */ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ /* Can @cpu run a user thread? 
*/ -+ if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p)) -+ return false; -+ -+ return true; -+} -+ - static inline cpumask_t *alloc_user_cpus_ptr(int node) - { - /* -@@ -2457,6 +2591,11 @@ extern int push_cpu_stop(void *arg); - - #else /* !CONFIG_SMP: */ - -+static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) -+{ -+ return true; -+} -+ - static inline int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx) - { -@@ -2510,8 +2649,6 @@ extern void init_sched_dl_class(void); - extern void init_sched_rt_class(void); - extern void init_sched_fair_class(void); - --extern void reweight_task(struct task_struct *p, const struct load_weight *lw); -- - extern void resched_curr(struct rq *rq); - extern void resched_cpu(int cpu); - -@@ -3056,6 +3193,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } - - #ifdef CONFIG_SMP - -+bool update_other_load_avgs(struct rq *rq); -+ - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long *min, - unsigned long *max); -@@ -3099,6 +3238,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq) - return READ_ONCE(rq->avg_rt.util_avg); - } - -+#else /* !CONFIG_SMP */ -+static inline bool update_other_load_avgs(struct rq *rq) { return false; } - #endif /* CONFIG_SMP */ - - #ifdef CONFIG_UCLAMP_TASK -@@ -3609,6 +3750,8 @@ extern void set_load_weight(struct task_struct *p, bool update_load); - extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); - extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags); - -+extern void check_class_changing(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class); - extern void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio); -@@ -3629,4 +3772,24 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea - - #endif - -+#ifdef CONFIG_SCHED_CLASS_EXT 
-+/* -+ * Used by SCX in the enable/disable paths to move tasks between sched_classes -+ * and establish invariants. -+ */ -+struct sched_enq_and_set_ctx { -+ struct task_struct *p; -+ int queue_flags; -+ bool queued; -+ bool running; -+}; -+ -+void sched_deq_and_put_task(struct task_struct *p, int queue_flags, -+ struct sched_enq_and_set_ctx *ctx); -+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); -+ -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+ -+#include "ext.h" -+ - #endif /* _KERNEL_SCHED_SCHED_H */ -diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c -index ae1b42775ef9..4fa59c9f69ac 100644 ---- a/kernel/sched/syscalls.c -+++ b/kernel/sched/syscalls.c -@@ -259,6 +259,25 @@ int sched_core_idle_cpu(int cpu) - #endif - - #ifdef CONFIG_SMP -+/* -+ * Load avg and utiliztion metrics need to be updated periodically and before -+ * consumption. This function updates the metrics for all subsystems except for -+ * the fair class. @rq must be locked and have its clock updated. -+ */ -+bool update_other_load_avgs(struct rq *rq) -+{ -+ u64 now = rq_clock_pelt(rq); -+ const struct sched_class *curr_class = rq->curr->sched_class; -+ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); -+ -+ lockdep_assert_rq_held(rq); -+ -+ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | -+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | -+ update_hw_load_avg(now, rq, hw_pressure) | -+ update_irq_load_avg(rq, 0); -+} -+ - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -695,6 +714,10 @@ int __sched_setscheduler(struct task_struct *p, - goto unlock; - } - -+ retval = scx_check_setscheduler(p, policy); -+ if (retval) -+ goto unlock; -+ - /* - * If not changing anything there's no need to proceed further, - * but store a possible modification of reset_on_fork. 
-@@ -797,6 +820,7 @@ int __sched_setscheduler(struct task_struct *p, - __setscheduler_prio(p, newprio); - } - __setscheduler_uclamp(p, attr); -+ check_class_changing(rq, p, prev_class); - - if (queued) { - /* -@@ -1602,6 +1626,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - break; - } -@@ -1629,6 +1654,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - } - return ret; -diff --git a/lib/dump_stack.c b/lib/dump_stack.c -index 1a996fbbf50a..388da1aea14a 100644 ---- a/lib/dump_stack.c -+++ b/lib/dump_stack.c -@@ -73,6 +73,7 @@ void dump_stack_print_info(const char *log_lvl) - - print_worker_info(log_lvl, current); - print_stop_info(log_lvl, current); -+ print_scx_info(log_lvl, current); - } - - /** -diff --git a/tools/Makefile b/tools/Makefile -index 276f5d0d53a4..278d24723b74 100644 ---- a/tools/Makefile -+++ b/tools/Makefile -@@ -28,6 +28,7 @@ help: - @echo ' pci - PCI tools' - @echo ' perf - Linux performance measurement and analysis tool' - @echo ' selftests - various kernel selftests' -+ @echo ' sched_ext - sched_ext example schedulers' - @echo ' bootconfig - boot config tool' - @echo ' spi - spi tools' - @echo ' tmon - thermal monitoring and tuning tool' -@@ -91,6 +92,9 @@ perf: FORCE - $(Q)mkdir -p $(PERF_O) . - $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= - -+sched_ext: FORCE -+ $(call descend,sched_ext) -+ - selftests: FORCE - $(call descend,testing/$@) - -@@ -184,6 +188,9 @@ perf_clean: - $(Q)mkdir -p $(PERF_O) . 
- $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean - -+sched_ext_clean: -+ $(call descend,sched_ext,clean) -+ - selftests_clean: - $(call descend,testing/$(@:_clean=),clean) - -@@ -213,6 +220,7 @@ clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ - mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ - freefall_clean build_clean libbpf_clean libsubcmd_clean \ - gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ -- intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean -+ intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ -+ sched_ext_clean - - .PHONY: FORCE -diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore -new file mode 100644 -index 000000000000..d6264fe1c8cd ---- /dev/null -+++ b/tools/sched_ext/.gitignore -@@ -0,0 +1,2 @@ -+tools/ -+build/ -diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile -new file mode 100644 -index 000000000000..ca3815e572d8 ---- /dev/null -+++ b/tools/sched_ext/Makefile -@@ -0,0 +1,246 @@ -+# SPDX-License-Identifier: GPL-2.0 -+# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+include ../build/Build.include -+include ../scripts/Makefile.arch -+include ../scripts/Makefile.include -+ -+all: all_targets -+ -+ifneq ($(LLVM),) -+ifneq ($(filter %/,$(LLVM)),) -+LLVM_PREFIX := $(LLVM) -+else ifneq ($(filter -%,$(LLVM)),) -+LLVM_SUFFIX := $(LLVM) -+endif -+ -+CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi -+CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu -+CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl -+CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu -+CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu -+CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu -+CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu -+CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu -+CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu -+CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) -+ -+ifeq ($(CROSS_COMPILE),) -+ifeq ($(CLANG_TARGET_FLAGS),) -+$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) -+else -+CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) -+endif # CLANG_TARGET_FLAGS -+else -+CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) -+endif # CROSS_COMPILE -+ -+CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as -+else -+CC := $(CROSS_COMPILE)gcc -+endif # LLVM -+ -+CURDIR := $(abspath .) -+TOOLSDIR := $(abspath ..) 
-+LIBDIR := $(TOOLSDIR)/lib -+BPFDIR := $(LIBDIR)/bpf -+TOOLSINCDIR := $(TOOLSDIR)/include -+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool -+APIDIR := $(TOOLSINCDIR)/uapi -+GENDIR := $(abspath ../../include/generated) -+GENHDR := $(GENDIR)/autoconf.h -+ -+ifeq ($(O),) -+OUTPUT_DIR := $(CURDIR)/build -+else -+OUTPUT_DIR := $(O)/build -+endif # O -+OBJ_DIR := $(OUTPUT_DIR)/obj -+INCLUDE_DIR := $(OUTPUT_DIR)/include -+BPFOBJ_DIR := $(OBJ_DIR)/libbpf -+SCXOBJ_DIR := $(OBJ_DIR)/sched_ext -+BINDIR := $(OUTPUT_DIR)/bin -+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a -+ifneq ($(CROSS_COMPILE),) -+HOST_BUILD_DIR := $(OBJ_DIR)/host -+HOST_OUTPUT_DIR := host-tools -+HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include -+else -+HOST_BUILD_DIR := $(OBJ_DIR) -+HOST_OUTPUT_DIR := $(OUTPUT_DIR) -+HOST_INCLUDE_DIR := $(INCLUDE_DIR) -+endif -+HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a -+RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids -+DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool -+ -+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ -+ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ -+ ../../vmlinux \ -+ /sys/kernel/btf/vmlinux \ -+ /boot/vmlinux-$(shell uname -r) -+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) -+ifeq ($(VMLINUX_BTF),) -+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") -+endif -+ -+BPFTOOL ?= $(DEFAULT_BPFTOOL) -+ -+ifneq ($(wildcard $(GENHDR)),) -+ GENFLAGS := -DHAVE_GENHDR -+endif -+ -+CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -+ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -+ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -+ -+# Silence some warnings when compiled with clang -+ifneq ($(LLVM),) -+CFLAGS += -Wno-unused-command-line-argument -+endif -+ -+LDFLAGS = -lelf -lz -lpthread -+ -+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ -+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -+$(shell $(1) -dM -E - $@ -+else -+ $(call msg,CP,,$@) 
-+ $(Q)cp "$(VMLINUX_H)" $@ -+endif -+ -+$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ -+ | $(BPFOBJ) $(SCXOBJ_DIR) -+ $(call msg,CLNG-BPF,,$(notdir $@)) -+ $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ -+ -+$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) -+ $(eval sched=$(notdir $@)) -+ $(call msg,GEN-SKEL,,$(sched)) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) -+ $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) -+ $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ -+ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) -+ -+SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -+ -+c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg -+ -+$(addprefix $(BINDIR)/,$(c-sched-targets)): \ -+ $(BINDIR)/%: \ -+ $(filter-out %.bpf.c,%.c) \ -+ $(INCLUDE_DIR)/%.bpf.skel.h \ -+ $(SCX_COMMON_DEPS) -+ $(eval sched=$(notdir $@)) -+ $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o -+ $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) -+ -+$(c-sched-targets): %: $(BINDIR)/% -+ -+install: all -+ $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ -+ $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ -+ -+clean: -+ rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) -+ rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h -+ rm -f $(c-sched-targets) -+ -+help: -+ @echo 'Building targets' -+ @echo '================' -+ @echo '' -+ @echo ' all - Compile all schedulers' -+ @echo '' -+ @echo 'Alternatively, you may compile individual schedulers:' -+ @echo '' -+ @printf ' %s\n' $(c-sched-targets) -+ @echo '' -+ @echo 'For any scheduler build target, you may specify an alternative' -+ @echo 'build output path with the O= environment variable. 
For example:' -+ @echo '' -+ @echo ' O=/tmp/sched_ext make all' -+ @echo '' -+ @echo 'will compile all schedulers, and emit the build artifacts to' -+ @echo '/tmp/sched_ext/build.' -+ @echo '' -+ @echo '' -+ @echo 'Installing targets' -+ @echo '==================' -+ @echo '' -+ @echo ' install - Compile and install all schedulers to /usr/bin.' -+ @echo ' You may specify the DESTDIR= environment variable' -+ @echo ' to indicate a prefix for /usr/bin. For example:' -+ @echo '' -+ @echo ' DESTDIR=/tmp/sched_ext make install' -+ @echo '' -+ @echo ' will build the schedulers in CWD/build, and' -+ @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' -+ @echo '' -+ @echo '' -+ @echo 'Cleaning targets' -+ @echo '================' -+ @echo '' -+ @echo ' clean - Remove all generated files' -+ -+all_targets: $(c-sched-targets) -+ -+.PHONY: all all_targets $(c-sched-targets) clean help -+ -+# delete failed targets -+.DELETE_ON_ERROR: -+ -+# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets -+.SECONDARY: -diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md -new file mode 100644 -index 000000000000..16a42e4060f6 ---- /dev/null -+++ b/tools/sched_ext/README.md -@@ -0,0 +1,270 @@ -+SCHED_EXT EXAMPLE SCHEDULERS -+============================ -+ -+# Introduction -+ -+This directory contains a number of example sched_ext schedulers. These -+schedulers are meant to provide examples of different types of schedulers -+that can be built using sched_ext, and illustrate how various features of -+sched_ext can be used. -+ -+Some of the examples are performant, production-ready schedulers. That is, for -+the correct workload and with the correct tuning, they may be deployed in a -+production environment with acceptable or possibly even improved performance. -+Others are just examples that in practice, would not provide acceptable -+performance (though they could be improved to get there). 
-+ -+This README will describe these example schedulers, including describing the -+types of workloads or scenarios they're designed to accommodate, and whether or -+not they're production ready. For more details on any of these schedulers, -+please see the header comment in their .bpf.c file. -+ -+ -+# Compiling the examples -+ -+There are a few toolchain dependencies for compiling the example schedulers. -+ -+## Toolchain dependencies -+ -+1. clang >= 16.0.0 -+ -+The schedulers are BPF programs, and therefore must be compiled with clang. gcc -+is actively working on adding a BPF backend compiler as well, but are still -+missing some features such as BTF type tags which are necessary for using -+kptrs. -+ -+2. pahole >= 1.25 -+ -+You may need pahole in order to generate BTF from DWARF. -+ -+3. rust >= 1.70.0 -+ -+Rust schedulers uses features present in the rust toolchain >= 1.70.0. You -+should be able to use the stable build from rustup, but if that doesn't -+work, try using the rustup nightly build. -+ -+There are other requirements as well, such as make, but these are the main / -+non-trivial ones. -+ -+## Compiling the kernel -+ -+In order to run a sched_ext scheduler, you'll have to run a kernel compiled -+with the patches in this repository, and with a minimum set of necessary -+Kconfig options: -+ -+``` -+CONFIG_BPF=y -+CONFIG_SCHED_CLASS_EXT=y -+CONFIG_BPF_SYSCALL=y -+CONFIG_BPF_JIT=y -+CONFIG_DEBUG_INFO_BTF=y -+``` -+ -+It's also recommended that you also include the following Kconfig options: -+ -+``` -+CONFIG_BPF_JIT_ALWAYS_ON=y -+CONFIG_BPF_JIT_DEFAULT_ON=y -+CONFIG_PAHOLE_HAS_SPLIT_BTF=y -+CONFIG_PAHOLE_HAS_BTF_TAG=y -+``` -+ -+There is a `Kconfig` file in this directory whose contents you can append to -+your local `.config` file, as long as there are no conflicts with any existing -+options in the file. -+ -+## Getting a vmlinux.h file -+ -+You may notice that most of the example schedulers include a "vmlinux.h" file. 
-+This is a large, auto-generated header file that contains all of the types -+defined in some vmlinux binary that was compiled with -+[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig -+options specified above). -+ -+The header file is created using `bpftool`, by passing it a vmlinux binary -+compiled with BTF as follows: -+ -+```bash -+$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h -+``` -+ -+`bpftool` analyzes all of the BTF encodings in the binary, and produces a -+header file that can be included by BPF programs to access those types. For -+example, using vmlinux.h allows a scheduler to access fields defined directly -+in vmlinux as follows: -+ -+```c -+#include "vmlinux.h" -+// vmlinux.h is also implicitly included by scx_common.bpf.h. -+#include "scx_common.bpf.h" -+ -+/* -+ * vmlinux.h provides definitions for struct task_struct and -+ * struct scx_enable_args. -+ */ -+void BPF_STRUCT_OPS(example_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ bpf_printk("Task %s enabled in example scheduler", p->comm); -+} -+ -+// vmlinux.h provides the definition for struct sched_ext_ops. -+SEC(".struct_ops.link") -+struct sched_ext_ops example_ops { -+ .enable = (void *)example_enable, -+ .name = "example", -+} -+``` -+ -+The scheduler build system will generate this vmlinux.h file as part of the -+scheduler build pipeline. It looks for a vmlinux file in the following -+dependency order: -+ -+1. If the O= environment variable is defined, at `$O/vmlinux` -+2. If the KBUILD_OUTPUT= environment variable is defined, at -+ `$KBUILD_OUTPUT/vmlinux` -+3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're -+ compiling the schedulers) -+3. `/sys/kernel/btf/vmlinux` -+4. `/boot/vmlinux-$(uname -r)` -+ -+In other words, if you have compiled a kernel in your local repo, its vmlinux -+file will be used to generate vmlinux.h. 
Otherwise, it will be the vmlinux of -+the kernel you're currently running on. This means that if you're running on a -+kernel with sched_ext support, you may not need to compile a local kernel at -+all. -+ -+### Aside on CO-RE -+ -+One of the cooler features of BPF is that it supports -+[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run -+Everywhere). This feature allows you to reference fields inside of structs with -+types defined internal to the kernel, and not have to recompile if you load the -+BPF program on a different kernel with the field at a different offset. In our -+example above, we print out a task name with `p->comm`. CO-RE would perform -+relocations for that access when the program is loaded to ensure that it's -+referencing the correct offset for the currently running kernel. -+ -+## Compiling the schedulers -+ -+Once you have your toolchain setup, and a vmlinux that can be used to generate -+a full vmlinux.h file, you can compile the schedulers using `make`: -+ -+```bash -+$ make -j($nproc) -+``` -+ -+# Example schedulers -+ -+This directory contains the following example schedulers. These schedulers are -+for testing and demonstrating different aspects of sched_ext. While some may be -+useful in limited scenarios, they are not intended to be practical. -+ -+For more scheduler implementations, tools and documentation, visit -+https://github.com/sched-ext/scx. -+ -+## scx_simple -+ -+A simple scheduler that provides an example of a minimal sched_ext scheduler. -+scx_simple can be run in either global weighted vtime mode, or FIFO mode. -+ -+Though very simple, in limited scenarios, this scheduler can perform reasonably -+well on single-socket systems with a unified L3 cache. -+ -+## scx_qmap -+ -+Another simple, yet slightly more complex scheduler that provides an example of -+a basic weighted FIFO queuing policy. 
It also provides examples of some common -+useful BPF features, such as sleepable per-task storage allocation in the -+`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to -+enqueue tasks. It also illustrates how core-sched support could be implemented. -+ -+## scx_central -+ -+A "central" scheduler where scheduling decisions are made from a single CPU. -+This scheduler illustrates how scheduling decisions can be dispatched from a -+single CPU, allowing other cores to run with infinite slices, without timer -+ticks, and without having to incur the overhead of making scheduling decisions. -+ -+The approach demonstrated by this scheduler may be useful for any workload that -+benefits from minimizing scheduling overhead and timer ticks. An example of -+where this could be particularly useful is running VMs, where running with -+infinite slices and no timer ticks allows the VM to avoid unnecessary expensive -+vmexits. -+ -+## scx_flatcg -+ -+A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical -+weight-based cgroup CPU control by flattening the cgroup hierarchy into a single -+layer, by compounding the active weight share at each level. The effect of this -+is a much more performant CPU controller, which does not need to descend down -+cgroup trees in order to properly compute a cgroup's share. -+ -+Similar to scx_simple, in limited scenarios, this scheduler can perform -+reasonably well on single socket-socket systems with a unified L3 cache and show -+significantly lowered hierarchical scheduling overhead. -+ -+ -+# Troubleshooting -+ -+There are a number of common issues that you may run into when building the -+schedulers. We'll go over some of the common ones here. 
-+ -+## Build Failures -+ -+### Old version of clang -+ -+``` -+error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole -+ _Static_assert(SCX_DSQ_FLAG_BUILTIN, -+ ^~~~~~~~~~~~~~~~~~~~ -+1 error generated. -+``` -+ -+This means you built the kernel or the schedulers with an older version of -+clang than what's supported (i.e. older than 16.0.0). To remediate this: -+ -+1. `which clang` to make sure you're using a sufficiently new version of clang. -+ -+2. `make fullclean` in the root path of the repository, and rebuild the kernel -+ and schedulers. -+ -+3. Rebuild the kernel, and then your example schedulers. -+ -+The schedulers are also cleaned if you invoke `make mrproper` in the root -+directory of the tree. -+ -+### Stale kernel build / incomplete vmlinux.h file -+ -+As described above, you'll need a `vmlinux.h` file that was generated from a -+vmlinux built with BTF, and with sched_ext support enabled. If you don't, -+you'll see errors such as the following which indicate that a type being -+referenced in a scheduler is unknown: -+ -+``` -+/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info' -+ -+const struct scx_exit_info *ei) -+ -+^ -+``` -+ -+In order to resolve this, please follow the steps above in -+[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your -+schedulers are using a vmlinux.h file that includes the requisite types. -+ -+## Misc -+ -+### llvm: [OFF] -+ -+You may see the following output when building the schedulers: -+ -+``` -+Auto-detecting system features: -+... clang-bpf-co-re: [ on ] -+... llvm: [ OFF ] -+... libcap: [ on ] -+... libbfd: [ on ] -+``` -+ -+Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore. 
-diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h -new file mode 100644 -index 000000000000..ad7d139ce907 ---- /dev/null -+++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h -@@ -0,0 +1,11 @@ -+/* -+ * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when -+ * compiling BPF files although its content doesn't play any role. The file in -+ * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is -+ * defined. When compiling a BPF source, __x86_64__ isn't set and thus -+ * stubs-32.h is selected. However, the file is not there if the system doesn't -+ * have 32bit glibc devel package installed leading to a build failure. -+ * -+ * The problem is worked around by making this file available in the include -+ * search paths before the system one when building BPF. -+ */ -diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h -new file mode 100644 -index 000000000000..225f61f9bfca ---- /dev/null -+++ b/tools/sched_ext/include/scx/common.bpf.h -@@ -0,0 +1,427 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef __SCX_COMMON_BPF_H -+#define __SCX_COMMON_BPF_H -+ -+#ifdef LSP -+#define __bpf__ -+#include "../vmlinux/vmlinux.h" -+#else -+#include "vmlinux.h" -+#endif -+ -+#include -+#include -+#include -+#include "user_exit_info.h" -+ -+#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ -+#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ -+#define PF_EXITING 0x00000004 -+#define CLOCK_MONOTONIC 1 -+ -+/* -+ * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can -+ * lead to really confusing misbehaviors. Let's trigger a build failure. 
-+ */ -+static inline void ___vmlinux_h_sanity_check___(void) -+{ -+ _Static_assert(SCX_DSQ_FLAG_BUILTIN, -+ "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); -+} -+ -+s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; -+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; -+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; -+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; -+u32 scx_bpf_dispatch_nr_slots(void) __ksym; -+void scx_bpf_dispatch_cancel(void) __ksym; -+bool scx_bpf_consume(u64 dsq_id) __ksym; -+void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; -+void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; -+bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -+bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -+u32 scx_bpf_reenqueue_local(void) __ksym; -+void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; -+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; -+void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; -+int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; -+struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; -+void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; -+void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; -+void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; -+void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; -+u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; -+u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym 
__weak; -+void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; -+u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; -+const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; -+const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; -+void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; -+const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; -+const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; -+void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; -+bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; -+s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; -+s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; -+bool scx_bpf_task_running(const struct task_struct *p) __ksym; -+s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; -+struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; -+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; -+ -+/* -+ * Use the following as @it__iter when calling -+ * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops. -+ */ -+#define BPF_FOR_EACH_ITER (&___it) -+ -+static inline __attribute__((format(printf, 1, 2))) -+void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} -+ -+/* -+ * Helper macro for initializing the fmt and variadic argument inputs to both -+ * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to -+ * refer to the initialized list of inputs to the bstr kfunc. -+ */ -+#define scx_bpf_bstr_preamble(fmt, args...) \ -+ static char ___fmt[] = fmt; \ -+ /* \ -+ * Note that __param[] must have at least one \ -+ * element to keep the verifier happy. 
\ -+ */ \ -+ unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ -+ \ -+ _Pragma("GCC diagnostic push") \ -+ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ -+ ___bpf_fill(___param, args); \ -+ _Pragma("GCC diagnostic pop") \ -+ -+/* -+ * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments -+ * instead of an array of u64. Using this macro will cause the scheduler to -+ * exit cleanly with the specified exit code being passed to user space. -+ */ -+#define scx_bpf_exit(code, fmt, args...) \ -+({ \ -+ scx_bpf_bstr_preamble(fmt, args) \ -+ scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \ -+ ___scx_bpf_bstr_format_checker(fmt, ##args); \ -+}) -+ -+/* -+ * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments -+ * instead of an array of u64. Invoking this macro will cause the scheduler to -+ * exit in an erroneous state, with diagnostic information being passed to the -+ * user. -+ */ -+#define scx_bpf_error(fmt, args...) \ -+({ \ -+ scx_bpf_bstr_preamble(fmt, args) \ -+ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ -+ ___scx_bpf_bstr_format_checker(fmt, ##args); \ -+}) -+ -+/* -+ * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments -+ * instead of an array of u64. To be used from ops.dump() and friends. -+ */ -+#define scx_bpf_dump(fmt, args...) \ -+({ \ -+ scx_bpf_bstr_preamble(fmt, args) \ -+ scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \ -+ ___scx_bpf_bstr_format_checker(fmt, ##args); \ -+}) -+ -+#define BPF_STRUCT_OPS(name, args...) \ -+SEC("struct_ops/"#name) \ -+BPF_PROG(name, ##args) -+ -+#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ -+SEC("struct_ops.s/"#name) \ -+BPF_PROG(name, ##args) -+ -+/** -+ * RESIZABLE_ARRAY - Generates annotations for an array that may be resized -+ * @elfsec: the data section of the BPF program in which to place the array -+ * @arr: the name of the array -+ * -+ * libbpf has an API for setting map value sizes. 
Since data sections (i.e. -+ * bss, data, rodata) themselves are maps, a data section can be resized. If -+ * a data section has an array as its last element, the BTF info for that -+ * array will be adjusted so that length of the array is extended to meet the -+ * new length of the data section. This macro annotates an array to have an -+ * element count of one with the assumption that this array can be resized -+ * within the userspace program. It also annotates the section specifier so -+ * this array exists in a custom sub data section which can be resized -+ * independently. -+ * -+ * See RESIZE_ARRAY() for the userspace convenience macro for resizing an -+ * array declared with RESIZABLE_ARRAY(). -+ */ -+#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) -+ -+/** -+ * MEMBER_VPTR - Obtain the verified pointer to a struct or array member -+ * @base: struct or array to index -+ * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) -+ * -+ * The verifier often gets confused by the instruction sequence the compiler -+ * generates for indexing struct fields or arrays. This macro forces the -+ * compiler to generate a code sequence which first calculates the byte offset, -+ * checks it against the struct or array size and add that byte offset to -+ * generate the pointer to the member to help the verifier. -+ * -+ * Ideally, we want to abort if the calculated offset is out-of-bounds. However, -+ * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller -+ * must check for %NULL and take appropriate action to appease the verifier. To -+ * avoid confusing the verifier, it's best to check for %NULL and dereference -+ * immediately. -+ * -+ * vptr = MEMBER_VPTR(my_array, [i][j]); -+ * if (!vptr) -+ * return error; -+ * *vptr = new_value; -+ * -+ * sizeof(@base) should encompass the memory area to be accessed and thus can't -+ * be a pointer to the area. 
Use `MEMBER_VPTR(*ptr, .member)` instead of -+ * `MEMBER_VPTR(ptr, ->member)`. -+ */ -+#define MEMBER_VPTR(base, member) (typeof((base) member) *) \ -+({ \ -+ u64 __base = (u64)&(base); \ -+ u64 __addr = (u64)&((base) member) - __base; \ -+ _Static_assert(sizeof(base) >= sizeof((base) member), \ -+ "@base is smaller than @member, is @base a pointer?"); \ -+ asm volatile ( \ -+ "if %0 <= %[max] goto +2\n" \ -+ "%0 = 0\n" \ -+ "goto +1\n" \ -+ "%0 += %1\n" \ -+ : "+r"(__addr) \ -+ : "r"(__base), \ -+ [max]"i"(sizeof(base) - sizeof((base) member))); \ -+ __addr; \ -+}) -+ -+/** -+ * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element -+ * @arr: array to index into -+ * @i: array index -+ * @n: number of elements in array -+ * -+ * Similar to MEMBER_VPTR() but is intended for use with arrays where the -+ * element count needs to be explicit. -+ * It can be used in cases where a global array is defined with an initial -+ * size but is intended to be be resized before loading the BPF program. -+ * Without this version of the macro, MEMBER_VPTR() will use the compile time -+ * size of the array to compute the max, which will result in rejection by -+ * the verifier. -+ */ -+#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ -+({ \ -+ u64 __base = (u64)arr; \ -+ u64 __addr = (u64)&(arr[i]) - __base; \ -+ asm volatile ( \ -+ "if %0 <= %[max] goto +2\n" \ -+ "%0 = 0\n" \ -+ "goto +1\n" \ -+ "%0 += %1\n" \ -+ : "+r"(__addr) \ -+ : "r"(__base), \ -+ [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ -+ __addr; \ -+}) -+ -+ -+/* -+ * BPF declarations and helpers -+ */ -+ -+/* list and rbtree */ -+#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) -+#define private(name) SEC(".data." 
#name) __hidden __attribute__((aligned(8))) -+ -+void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; -+void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; -+ -+#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) -+#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) -+ -+void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -+void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -+struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; -+struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; -+struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, -+ struct bpf_rb_node *node) __ksym; -+int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, -+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), -+ void *meta, __u64 off) __ksym; -+#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) -+ -+struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; -+ -+void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; -+#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) -+ -+/* task */ -+struct task_struct *bpf_task_from_pid(s32 pid) __ksym; -+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; -+void bpf_task_release(struct task_struct *p) __ksym; -+ -+/* cgroup */ -+struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; -+void bpf_cgroup_release(struct cgroup *cgrp) __ksym; -+struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; -+ -+/* css iteration */ -+struct bpf_iter_css; -+struct cgroup_subsys_state; -+extern int bpf_iter_css_new(struct bpf_iter_css *it, -+ struct cgroup_subsys_state *start, -+ unsigned int flags) __weak __ksym; -+extern struct cgroup_subsys_state * -+bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; -+extern void 
bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; -+ -+/* cpumask */ -+struct bpf_cpumask *bpf_cpumask_create(void) __ksym; -+struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; -+u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; -+u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; -+void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; -+bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; -+void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; -+u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; -+u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; -+ -+/* 
-+ * Access a cpumask in read-only mode (typically to check bits). -+ */ -+const struct cpumask *cast_mask(struct bpf_cpumask *mask) -+{ -+ return (const struct cpumask *)mask; -+} -+ -+/* rcu */ -+void bpf_rcu_read_lock(void) __ksym; -+void bpf_rcu_read_unlock(void) __ksym; -+ -+ -+/* -+ * Other helpers -+ */ -+ -+/* useful compiler attributes */ -+#define likely(x) __builtin_expect(!!(x), 1) -+#define unlikely(x) __builtin_expect(!!(x), 0) -+#define __maybe_unused __attribute__((__unused__)) -+ -+/* -+ * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They -+ * prevent compiler from caching, redoing or reordering reads or writes. -+ */ -+typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; -+typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; -+typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; -+typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; -+ -+static __always_inline void __read_once_size(const volatile void *p, void *res, int size) -+{ -+ switch (size) { -+ case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; -+ case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; -+ case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; -+ case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; -+ default: -+ barrier(); -+ __builtin_memcpy((void *)res, (const void *)p, size); -+ barrier(); -+ } -+} -+ -+static __always_inline void __write_once_size(volatile void *p, void *res, int size) -+{ -+ switch (size) { -+ case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; -+ case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; -+ case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; -+ case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; -+ default: -+ barrier(); -+ __builtin_memcpy((void *)p, (const void *)res, size); -+ barrier(); -+ } -+} -+ -+#define READ_ONCE(x) \ -+({ \ -+ union { typeof(x) 
__val; char __c[1]; } __u = \ -+ { .__c = { 0 } }; \ -+ __read_once_size(&(x), __u.__c, sizeof(x)); \ -+ __u.__val; \ -+}) -+ -+#define WRITE_ONCE(x, val) \ -+({ \ -+ union { typeof(x) __val; char __c[1]; } __u = \ -+ { .__val = (val) }; \ -+ __write_once_size(&(x), __u.__c, sizeof(x)); \ -+ __u.__val; \ -+}) -+ -+/* -+ * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. -+ * @v: The value for which we're computing the base 2 logarithm. -+ */ -+static inline u32 log2_u32(u32 v) -+{ -+ u32 r; -+ u32 shift; -+ -+ r = (v > 0xFFFF) << 4; v >>= r; -+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift; -+ shift = (v > 0xF) << 2; v >>= shift; r |= shift; -+ shift = (v > 0x3) << 1; v >>= shift; r |= shift; -+ r |= (v >> 1); -+ return r; -+} -+ -+/* -+ * log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value. -+ * @v: The value for which we're computing the base 2 logarithm. -+ */ -+static inline u32 log2_u64(u64 v) -+{ -+ u32 hi = v >> 32; -+ if (hi) -+ return log2_u32(hi) + 32 + 1; -+ else -+ return log2_u32(v) + 1; -+} -+ -+#include "compat.bpf.h" -+ -+#endif /* __SCX_COMMON_BPF_H */ -diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h -new file mode 100644 -index 000000000000..5b0f90152152 ---- /dev/null -+++ b/tools/sched_ext/include/scx/common.h -@@ -0,0 +1,75 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+#ifndef __SCHED_EXT_COMMON_H -+#define __SCHED_EXT_COMMON_H -+ -+#ifdef __KERNEL__ -+#error "Should not be included by BPF programs" -+#endif -+ -+#include -+#include -+#include -+#include -+#include -+ -+typedef uint8_t u8; -+typedef uint16_t u16; -+typedef uint32_t u32; -+typedef uint64_t u64; -+typedef int8_t s8; -+typedef int16_t s16; -+typedef int32_t s32; -+typedef int64_t s64; -+ -+#define SCX_BUG(__fmt, ...) 
\ -+ do { \ -+ fprintf(stderr, "[SCX_BUG] %s:%d", __FILE__, __LINE__); \ -+ if (errno) \ -+ fprintf(stderr, " (%s)\n", strerror(errno)); \ -+ else \ -+ fprintf(stderr, "\n"); \ -+ fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ -+ fprintf(stderr, "\n"); \ -+ \ -+ exit(EXIT_FAILURE); \ -+ } while (0) -+ -+#define SCX_BUG_ON(__cond, __fmt, ...) \ -+ do { \ -+ if (__cond) \ -+ SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ -+ } while (0) -+ -+/** -+ * RESIZE_ARRAY - Convenience macro for resizing a BPF array -+ * @__skel: the skeleton containing the array -+ * @elfsec: the data section of the BPF program in which the array exists -+ * @arr: the name of the array -+ * @n: the desired array element count -+ * -+ * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two -+ * operations. It resizes the map which corresponds to the custom data -+ * section that contains the target array. As a side effect, the BTF info for -+ * the array is adjusted so that the array length is sized to cover the new -+ * data section size. The second operation is reassigning the skeleton pointer -+ * for that custom data section so that it points to the newly memory mapped -+ * region. -+ */ -+#define RESIZE_ARRAY(__skel, elfsec, arr, n) \ -+ do { \ -+ size_t __sz; \ -+ bpf_map__set_value_size((__skel)->maps.elfsec##_##arr, \ -+ sizeof((__skel)->elfsec##_##arr->arr[0]) * (n)); \ -+ (__skel)->elfsec##_##arr = \ -+ bpf_map__initial_value((__skel)->maps.elfsec##_##arr, &__sz); \ -+ } while (0) -+ -+#include "user_exit_info.h" -+#include "compat.h" -+ -+#endif /* __SCHED_EXT_COMMON_H */ -diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h -new file mode 100644 -index 000000000000..e5afe9efd3f3 ---- /dev/null -+++ b/tools/sched_ext/include/scx/compat.bpf.h -@@ -0,0 +1,47 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 Tejun Heo -+ * Copyright (c) 2024 David Vernet -+ */ -+#ifndef __SCX_COMPAT_BPF_H -+#define __SCX_COMPAT_BPF_H -+ -+#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ -+({ \ -+ __type __ret = 0; \ -+ if (bpf_core_enum_value_exists(__type, __ent)) \ -+ __ret = __ent; \ -+ __ret; \ -+}) -+ -+/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ -+#define __COMPAT_scx_bpf_task_cgroup(p) \ -+ (bpf_ksym_exists(scx_bpf_task_cgroup) ? \ -+ scx_bpf_task_cgroup((p)) : NULL) -+ -+/* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */ -+#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice) \ -+ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ? \ -+ scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0) -+#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime) \ -+ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ? \ -+ scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0) -+#define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags) \ -+ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ? \ -+ scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) -+#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags) \ -+ (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ? \ -+ scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) -+ -+/* -+ * Define sched_ext_ops. This may be expanded to define multiple variants for -+ * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). -+ */ -+#define SCX_OPS_DEFINE(__name, ...) 
\ -+ SEC(".struct_ops.link") \ -+ struct sched_ext_ops __name = { \ -+ __VA_ARGS__, \ -+ }; -+ -+#endif /* __SCX_COMPAT_BPF_H */ -diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h -new file mode 100644 -index 000000000000..cc56ff9aa252 ---- /dev/null -+++ b/tools/sched_ext/include/scx/compat.h -@@ -0,0 +1,186 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 Tejun Heo -+ * Copyright (c) 2024 David Vernet -+ */ -+#ifndef __SCX_COMPAT_H -+#define __SCX_COMPAT_H -+ -+#include -+#include -+#include -+#include -+ -+struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); -+ -+static inline void __COMPAT_load_vmlinux_btf(void) -+{ -+ if (!__COMPAT_vmlinux_btf) { -+ __COMPAT_vmlinux_btf = btf__load_vmlinux_btf(); -+ SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()"); -+ } -+} -+ -+static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v) -+{ -+ const struct btf_type *t; -+ const char *n; -+ s32 tid; -+ int i; -+ -+ __COMPAT_load_vmlinux_btf(); -+ -+ tid = btf__find_by_name(__COMPAT_vmlinux_btf, type); -+ if (tid < 0) -+ return false; -+ -+ t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); -+ SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); -+ -+ if (btf_is_enum(t)) { -+ struct btf_enum *e = btf_enum(t); -+ -+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { -+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); -+ SCX_BUG_ON(!n, "btf__name_by_offset()"); -+ if (!strcmp(n, name)) { -+ *v = e[i].val; -+ return true; -+ } -+ } -+ } else if (btf_is_enum64(t)) { -+ struct btf_enum64 *e = btf_enum64(t); -+ -+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { -+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); -+ SCX_BUG_ON(!n, "btf__name_by_offset()"); -+ if (!strcmp(n, name)) { -+ *v = btf_enum64_value(&e[i]); -+ return true; -+ } -+ } -+ } -+ -+ return false; -+} -+ -+#define __COMPAT_ENUM_OR_ZERO(__type, __ent) 
\ -+({ \ -+ u64 __val = 0; \ -+ __COMPAT_read_enum(__type, __ent, &__val); \ -+ __val; \ -+}) -+ -+static inline bool __COMPAT_has_ksym(const char *ksym) -+{ -+ __COMPAT_load_vmlinux_btf(); -+ return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0; -+} -+ -+static inline bool __COMPAT_struct_has_field(const char *type, const char *field) -+{ -+ const struct btf_type *t; -+ const struct btf_member *m; -+ const char *n; -+ s32 tid; -+ int i; -+ -+ __COMPAT_load_vmlinux_btf(); -+ tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT); -+ if (tid < 0) -+ return false; -+ -+ t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); -+ SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); -+ -+ m = btf_members(t); -+ -+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { -+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off); -+ SCX_BUG_ON(!n, "btf__name_by_offset()"); -+ if (!strcmp(n, field)) -+ return true; -+ } -+ -+ return false; -+} -+ -+#define SCX_OPS_SWITCH_PARTIAL \ -+ __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") -+ -+static inline long scx_hotplug_seq(void) -+{ -+ int fd; -+ char buf[32]; -+ ssize_t len; -+ long val; -+ -+ fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY); -+ if (fd < 0) -+ return -ENOENT; -+ -+ len = read(fd, buf, sizeof(buf) - 1); -+ SCX_BUG_ON(len <= 0, "read failed (%ld)", len); -+ buf[len] = 0; -+ close(fd); -+ -+ val = strtoul(buf, NULL, 10); -+ SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); -+ -+ return val; -+} -+ -+/* -+ * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE() -+ * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load -+ * and attach it, backward compatibility is automatically maintained where -+ * reasonable. -+ * -+ * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is -+ * the current minimum required kernel version. 
-+ */ -+#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ -+ struct __scx_name *__skel; \ -+ \ -+ SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \ -+ "sched_ext_ops.dump() missing, kernel too old?"); \ -+ \ -+ __skel = __scx_name##__open(); \ -+ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ -+ __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ -+ __skel; \ -+}) -+ -+#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ -+ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ -+ SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ -+}) -+ -+/* -+ * New versions of bpftool now emit additional link placeholders for BPF maps, -+ * and set up BPF skeleton in such a way that libbpf will auto-attach BPF maps -+ * automatically, assumming libbpf is recent enough (v1.5+). Old libbpf will do -+ * nothing with those links and won't attempt to auto-attach maps. -+ * -+ * To maintain compatibility with older libbpf while avoiding trying to attach -+ * twice, disable the autoattach feature on newer libbpf. 
-+ */ -+#if LIBBPF_MAJOR_VERSION > 1 || \ -+ (LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5) -+#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \ -+ bpf_map__set_autoattach((__skel)->maps.__ops_name, false) -+#else -+#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0) -+#endif -+ -+#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \ -+ struct bpf_link *__link; \ -+ __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name); \ -+ SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel"); \ -+ __link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \ -+ SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \ -+ __link; \ -+}) -+ -+#endif /* __SCX_COMPAT_H */ -diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h -new file mode 100644 -index 000000000000..8ce2734402e1 ---- /dev/null -+++ b/tools/sched_ext/include/scx/user_exit_info.h -@@ -0,0 +1,115 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Define struct user_exit_info which is shared between BPF and userspace parts -+ * to communicate exit status and other information. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef __USER_EXIT_INFO_H -+#define __USER_EXIT_INFO_H -+ -+enum uei_sizes { -+ UEI_REASON_LEN = 128, -+ UEI_MSG_LEN = 1024, -+ UEI_DUMP_DFL_LEN = 32768, -+}; -+ -+struct user_exit_info { -+ int kind; -+ s64 exit_code; -+ char reason[UEI_REASON_LEN]; -+ char msg[UEI_MSG_LEN]; -+}; -+ -+#ifdef __bpf__ -+ -+#ifdef LSP -+#include "../vmlinux/vmlinux.h" -+#else -+#include "vmlinux.h" -+#endif -+#include -+ -+#define UEI_DEFINE(__name) \ -+ char RESIZABLE_ARRAY(data, __name##_dump); \ -+ const volatile u32 __name##_dump_len; \ -+ struct user_exit_info __name SEC(".data") -+ -+#define UEI_RECORD(__uei_name, __ei) ({ \ -+ bpf_probe_read_kernel_str(__uei_name.reason, \ -+ sizeof(__uei_name.reason), (__ei)->reason); \ -+ bpf_probe_read_kernel_str(__uei_name.msg, \ -+ sizeof(__uei_name.msg), (__ei)->msg); \ -+ bpf_probe_read_kernel_str(__uei_name##_dump, \ -+ __uei_name##_dump_len, (__ei)->dump); \ -+ if (bpf_core_field_exists((__ei)->exit_code)) \ -+ __uei_name.exit_code = (__ei)->exit_code; \ -+ /* use __sync to force memory barrier */ \ -+ __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ -+ (__ei)->kind); \ -+}) -+ -+#else /* !__bpf__ */ -+ -+#include -+#include -+ -+/* no need to call the following explicitly if SCX_OPS_LOAD() is used */ -+#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ -+ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ -+ (__skel)->rodata->__uei_name##_dump_len = __len; \ -+ RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \ -+}) -+ -+#define UEI_EXITED(__skel, __uei_name) ({ \ -+ /* use __sync to force memory barrier */ \ -+ __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ -+}) -+ -+#define UEI_REPORT(__skel, __uei_name) ({ \ -+ struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ -+ char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ -+ if 
(__uei_dump[0] != '\0') { \ -+ fputs("\nDEBUG DUMP\n", stderr); \ -+ fputs("================================================================================\n\n", stderr); \ -+ fputs(__uei_dump, stderr); \ -+ fputs("\n================================================================================\n\n", stderr); \ -+ } \ -+ fprintf(stderr, "EXIT: %s", __uei->reason); \ -+ if (__uei->msg[0] != '\0') \ -+ fprintf(stderr, " (%s)", __uei->msg); \ -+ fputs("\n", stderr); \ -+ __uei->exit_code; \ -+}) -+ -+/* -+ * We can't import vmlinux.h while compiling user C code. Let's duplicate -+ * scx_exit_code definition. -+ */ -+enum scx_exit_code { -+ /* Reasons */ -+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, -+ -+ /* Actions */ -+ SCX_ECODE_ACT_RESTART = 1LLU << 48, -+}; -+ -+enum uei_ecode_mask { -+ UEI_ECODE_USER_MASK = ((1LLU << 32) - 1), -+ UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32, -+ UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48, -+}; -+ -+/* -+ * These macro interpret the ecode returned from UEI_REPORT(). -+ */ -+#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK) -+#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK) -+#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK) -+ -+#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) -+ -+#endif /* __bpf__ */ -+#endif /* __USER_EXIT_INFO_H */ -diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c -new file mode 100644 -index 000000000000..8dd8eb73b6b8 ---- /dev/null -+++ b/tools/sched_ext/scx_central.bpf.c -@@ -0,0 +1,361 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A central FIFO sched_ext scheduler which demonstrates the followings: -+ * -+ * a. Making all scheduling decisions from one CPU: -+ * -+ * The central CPU is the only one making scheduling decisions. All other -+ * CPUs kick the central CPU when they run out of tasks to run. 
-+ * -+ * There is one global BPF queue and the central CPU schedules all CPUs by -+ * dispatching from the global queue to each CPU's local dsq from dispatch(). -+ * This isn't the most straightforward. e.g. It'd be easier to bounce -+ * through per-CPU BPF queues. The current design is chosen to maximally -+ * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. -+ * -+ * b. Tickless operation -+ * -+ * All tasks are dispatched with the infinite slice which allows stopping the -+ * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full -+ * parameter. The tickless operation can be observed through -+ * /proc/interrupts. -+ * -+ * Periodic switching is enforced by a periodic timer checking all CPUs and -+ * preempting them as necessary. Unfortunately, BPF timer currently doesn't -+ * have a way to pin to a specific CPU, so the periodic timer isn't pinned to -+ * the central CPU. -+ * -+ * c. Preemption -+ * -+ * Kthreads are unconditionally queued to the head of a matching local dsq -+ * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always -+ * prioritized over user threads, which is required for ensuring forward -+ * progress as e.g. the periodic timer may run on a ksoftirqd and if the -+ * ksoftirqd gets starved by a user thread, there may not be anything else to -+ * vacate that user thread. -+ * -+ * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the -+ * next tasks. -+ * -+ * This scheduler is designed to maximize usage of various SCX mechanisms. A -+ * more practical implementation would likely put the scheduling loop outside -+ * the central CPU's dispatch() path and add some form of priority mechanism. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+enum { -+ FALLBACK_DSQ_ID = 0, -+ MS_TO_NS = 1000LLU * 1000, -+ TIMER_INTERVAL_NS = 1 * MS_TO_NS, -+}; -+ -+const volatile s32 central_cpu; -+const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+ -+bool timer_pinned = true; -+u64 nr_total, nr_locals, nr_queued, nr_lost_pids; -+u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; -+u64 nr_overflows; -+ -+UEI_DEFINE(uei); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 4096); -+ __type(value, s32); -+} central_q SEC(".maps"); -+ -+/* can't use percpu map due to bad lookups */ -+bool RESIZABLE_ARRAY(data, cpu_gimme_task); -+u64 RESIZABLE_ARRAY(data, cpu_started_at); -+ -+struct central_timer { -+ struct bpf_timer timer; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct central_timer); -+} central_timer SEC(".maps"); -+ -+static bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ /* -+ * Steer wakeups to the central CPU as much as possible to avoid -+ * disturbing other CPUs. It's safe to blindly return the central cpu as -+ * select_cpu() is a hint and if @p can't be on it, the kernel will -+ * automatically pick a fallback CPU. -+ */ -+ return central_cpu; -+} -+ -+void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ s32 pid = p->pid; -+ -+ __sync_fetch_and_add(&nr_total, 1); -+ -+ /* -+ * Push per-cpu kthreads at the head of local dsq's and preempt the -+ * corresponding CPU. This ensures that e.g. 
ksoftirqd isn't blocked -+ * behind other threads which is necessary for forward progress -+ * guarantee as we depend on the BPF timer which may run from ksoftirqd. -+ */ -+ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { -+ __sync_fetch_and_add(&nr_locals, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, -+ enq_flags | SCX_ENQ_PREEMPT); -+ return; -+ } -+ -+ if (bpf_map_push_elem(&central_q, &pid, 0)) { -+ __sync_fetch_and_add(&nr_overflows, 1); -+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); -+ return; -+ } -+ -+ __sync_fetch_and_add(&nr_queued, 1); -+ -+ if (!scx_bpf_task_running(p)) -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+} -+ -+static bool dispatch_to_cpu(s32 cpu) -+{ -+ struct task_struct *p; -+ s32 pid; -+ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ if (bpf_map_pop_elem(&central_q, &pid)) -+ break; -+ -+ __sync_fetch_and_sub(&nr_queued, 1); -+ -+ p = bpf_task_from_pid(pid); -+ if (!p) { -+ __sync_fetch_and_add(&nr_lost_pids, 1); -+ continue; -+ } -+ -+ /* -+ * If we can't run the task at the top, do the dumb thing and -+ * bounce it to the fallback dsq. -+ */ -+ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { -+ __sync_fetch_and_add(&nr_mismatches, 1); -+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); -+ bpf_task_release(p); -+ /* -+ * We might run out of dispatch buffer slots if we continue dispatching -+ * to the fallback DSQ, without dispatching to the local DSQ of the -+ * target CPU. In such a case, break the loop now as will fail the -+ * next dispatch operation.
-+ */ -+ if (!scx_bpf_dispatch_nr_slots()) -+ break; -+ continue; -+ } -+ -+ /* dispatch to local and mark that @cpu doesn't need more */ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); -+ -+ if (cpu != central_cpu) -+ scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); -+ -+ bpf_task_release(p); -+ return true; -+ } -+ -+ return false; -+} -+ -+void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ if (cpu == central_cpu) { -+ /* dispatch for all other CPUs first */ -+ __sync_fetch_and_add(&nr_dispatches, 1); -+ -+ bpf_for(cpu, 0, nr_cpu_ids) { -+ bool *gimme; -+ -+ if (!scx_bpf_dispatch_nr_slots()) -+ break; -+ -+ /* central's gimme is never set */ -+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (!gimme || !*gimme) -+ continue; -+ -+ if (dispatch_to_cpu(cpu)) -+ *gimme = false; -+ } -+ -+ /* -+ * Retry if we ran out of dispatch buffer slots as we might have -+ * skipped some CPUs and also need to dispatch for self. The ext -+ * core automatically retries if the local dsq is empty but we -+ * can't rely on that as we're dispatching for other CPUs too. -+ * Kick self explicitly to retry. -+ */ -+ if (!scx_bpf_dispatch_nr_slots()) { -+ __sync_fetch_and_add(&nr_retries, 1); -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+ return; -+ } -+ -+ /* look for a task to run on the central CPU */ -+ if (scx_bpf_consume(FALLBACK_DSQ_ID)) -+ return; -+ dispatch_to_cpu(central_cpu); -+ } else { -+ bool *gimme; -+ -+ if (scx_bpf_consume(FALLBACK_DSQ_ID)) -+ return; -+ -+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (gimme) -+ *gimme = true; -+ -+ /* -+ * Force dispatch on the scheduling CPU so that it finds a task -+ * to run for us. 
-+ */ -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+ } -+} -+ -+void BPF_STRUCT_OPS(central_running, struct task_struct *p) -+{ -+ s32 cpu = scx_bpf_task_cpu(p); -+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at) -+ *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ -+} -+ -+void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) -+{ -+ s32 cpu = scx_bpf_task_cpu(p); -+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at) -+ *started_at = 0; -+} -+ -+static int central_timerfn(void *map, int *key, struct bpf_timer *timer) -+{ -+ u64 now = bpf_ktime_get_ns(); -+ u64 nr_to_kick = nr_queued; -+ s32 i, curr_cpu; -+ -+ curr_cpu = bpf_get_smp_processor_id(); -+ if (timer_pinned && (curr_cpu != central_cpu)) { -+ scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", -+ curr_cpu, central_cpu); -+ return 0; -+ } -+ -+ bpf_for(i, 0, nr_cpu_ids) { -+ s32 cpu = (nr_timers + i) % nr_cpu_ids; -+ u64 *started_at; -+ -+ if (cpu == central_cpu) -+ continue; -+ -+ /* kick iff the current one exhausted its slice */ -+ started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at && *started_at && -+ vtime_before(now, *started_at + slice_ns)) -+ continue; -+ -+ /* and there's something pending */ -+ if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || -+ scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) -+ ; -+ else if (nr_to_kick) -+ nr_to_kick--; -+ else -+ continue; -+ -+ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); -+ } -+ -+ bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); -+ __sync_fetch_and_add(&nr_timers, 1); -+ return 0; -+} -+ -+int BPF_STRUCT_OPS_SLEEPABLE(central_init) -+{ -+ u32 key = 0; -+ struct bpf_timer *timer; -+ int ret; -+ -+ ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); -+ if (ret) -+ return ret; -+ -+ timer = bpf_map_lookup_elem(¢ral_timer, &key); -+ if (!timer) -+ return -ESRCH; -+ -+ if (bpf_get_smp_processor_id() != 
central_cpu) { -+ scx_bpf_error("init from non-central CPU"); -+ return -EINVAL; -+ } -+ -+ bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); -+ bpf_timer_set_callback(timer, central_timerfn); -+ -+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); -+ /* -+ * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a -+ * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. -+ * Retry without the PIN. This would be the perfect use case for -+ * bpf_core_enum_value_exists() but the enum type doesn't have a name -+ * and can't be used with bpf_core_enum_value_exists(). Oh well... -+ */ -+ if (ret == -EINVAL) { -+ timer_pinned = false; -+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); -+ } -+ if (ret) -+ scx_bpf_error("bpf_timer_start failed (%d)", ret); -+ return ret; -+} -+ -+void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(central_ops, -+ /* -+ * We are offloading all scheduling decisions to the central CPU -+ * and thus being the last task on a given CPU doesn't mean -+ * anything special. Enqueue the last tasks like any other tasks. -+ */ -+ .flags = SCX_OPS_ENQ_LAST, -+ -+ .select_cpu = (void *)central_select_cpu, -+ .enqueue = (void *)central_enqueue, -+ .dispatch = (void *)central_dispatch, -+ .running = (void *)central_running, -+ .stopping = (void *)central_stopping, -+ .init = (void *)central_init, -+ .exit = (void *)central_exit, -+ .name = "central"); -diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c -new file mode 100644 -index 000000000000..21deea320bd7 ---- /dev/null -+++ b/tools/sched_ext/scx_central.c -@@ -0,0 +1,135 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#define _GNU_SOURCE -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_central.bpf.skel.h" -+ -+const char help_fmt[] = -+"A central FIFO sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-c CPU]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -c CPU Override the central CPU (default: 0)\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_central *skel; -+ struct bpf_link *link; -+ __u64 seq = 0, ecode; -+ __s32 opt; -+ cpu_set_t *cpuset; -+ -+ libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+restart: -+ skel = SCX_OPS_OPEN(central_ops, scx_central); -+ -+ skel->rodata->central_cpu = 0; -+ skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); -+ -+ while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { -+ switch (opt) { -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'c': -+ skel->rodata->central_cpu = strtoul(optarg, NULL, 0); -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ /* Resize arrays so their element count is equal to cpu count. 
*/ -+ RESIZE_ARRAY(skel, data, cpu_gimme_task, skel->rodata->nr_cpu_ids); -+ RESIZE_ARRAY(skel, data, cpu_started_at, skel->rodata->nr_cpu_ids); -+ -+ SCX_OPS_LOAD(skel, central_ops, scx_central, uei); -+ -+ /* -+ * Affinitize the loading thread to the central CPU, as: -+ * - That's where the BPF timer is first invoked in the BPF program. -+ * - We probably don't want this user space component to take up a core -+ * from a task that would benefit from avoiding preemption on one of -+ * the tickless cores. -+ * -+ * Until BPF supports pinning the timer, it's not guaranteed that it -+ * will always be invoked on the central CPU. In practice, this -+ * suffices the majority of the time. -+ */ -+ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); -+ SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); -+ CPU_ZERO(cpuset); -+ CPU_SET(skel->rodata->central_cpu, cpuset); -+ SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), -+ "Failed to affinitize to central CPU %d (max %d)", -+ skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); -+ CPU_FREE(cpuset); -+ -+ link = SCX_OPS_ATTACH(skel, central_ops, scx_central); -+ -+ if (!skel->data->timer_pinned) -+ printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) { -+ printf("[SEQ %llu]\n", seq++); -+ printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", -+ skel->bss->nr_total, -+ skel->bss->nr_locals, -+ skel->bss->nr_queued, -+ skel->bss->nr_lost_pids); -+ printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", -+ skel->bss->nr_timers, -+ skel->bss->nr_dispatches, -+ skel->bss->nr_mismatches, -+ skel->bss->nr_retries); -+ printf("overflow:%10" PRIu64 "\n", -+ skel->bss->nr_overflows); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ ecode = UEI_REPORT(skel, uei); -+ scx_central__destroy(skel); -+ -+ if (UEI_ECODE_RESTART(ecode)) -+ goto 
restart; -+ return 0; -+} -diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c -new file mode 100644 -index 000000000000..b722baf6da4b ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.bpf.c -@@ -0,0 +1,957 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A demo sched_ext flattened cgroup hierarchy scheduler. It implements -+ * hierarchical weight-based cgroup CPU control by flattening the cgroup -+ * hierarchy into a single layer by compounding the active weight share at each -+ * level. Consider the following hierarchy with weights in parentheses: -+ * -+ * R + A (100) + B (100) -+ * | \ C (100) -+ * \ D (200) -+ * -+ * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. -+ * Let's say all three have runnable tasks. The total share that each of these -+ * three cgroups is entitled to can be calculated by compounding its share at -+ * each level. -+ * -+ * For example, B is competing against C and in that competition its share is -+ * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's -+ * share in that competition is 100/(200+100) == 1/3. B's eventual share in the -+ * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's -+ * eventual shaer is the same at 1/6. D is only competing at the top level and -+ * its share is 200/(100+200) == 2/3. -+ * -+ * So, instead of hierarchically scheduling level-by-level, we can consider it -+ * as B, C and D competing each other with respective share of 1/6, 1/6 and 2/3 -+ * and keep updating the eventual shares as the cgroups' runnable states change. -+ * -+ * This flattening of hierarchy can bring a substantial performance gain when -+ * the cgroup hierarchy is nested multiple levels. 
in a simple benchmark using -+ * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it -+ * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two -+ * apache instances competing with 2:1 weight ratio nested four level deep. -+ * -+ * However, the gain comes at the cost of not being able to properly handle -+ * thundering herd of cgroups. For example, if many cgroups which are nested -+ * behind a low priority parent cgroup wake up around the same time, they may be -+ * able to consume more CPU cycles than they are entitled to. In many use cases, -+ * this isn't a real concern especially given the performance gain. Also, there -+ * are ways to mitigate the problem further by e.g. introducing an extra -+ * scheduling layer on cgroup delegation boundaries. -+ * -+ * The scheduler first picks the cgroup to run and then schedule the tasks -+ * within by using nested weighted vtime scheduling by default. The -+ * cgroup-internal scheduling can be switched to FIFO with the -f option. -+ */ -+#include -+#include "scx_flatcg.h" -+ -+/* -+ * Maximum amount of retries to find a valid cgroup. 
-+ */ -+enum { -+ FALLBACK_DSQ = 0, -+ CGROUP_MAX_RETRIES = 1024, -+}; -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ -+const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; -+const volatile bool fifo_sched; -+ -+u64 cvtime_now; -+UEI_DEFINE(uei); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, u64); -+ __uint(max_entries, FCG_NR_STATS); -+} stats SEC(".maps"); -+ -+static void stat_inc(enum fcg_stat_idx idx) -+{ -+ u32 idx_v = idx; -+ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+struct fcg_cpu_ctx { -+ u64 cur_cgid; -+ u64 cur_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, struct fcg_cpu_ctx); -+ __uint(max_entries, 1); -+} cpu_ctx SEC(".maps"); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct fcg_cgrp_ctx); -+} cgrp_ctx SEC(".maps"); -+ -+struct cgv_node { -+ struct bpf_rb_node rb_node; -+ __u64 cvtime; -+ __u64 cgid; -+}; -+ -+private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; -+private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); -+ -+struct cgv_node_stash { -+ struct cgv_node __kptr *node; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __uint(max_entries, 16384); -+ __type(key, __u64); -+ __type(value, struct cgv_node_stash); -+} cgv_node_stash SEC(".maps"); -+ -+struct fcg_task_ctx { -+ u64 bypassed_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct fcg_task_ctx); -+} task_ctx SEC(".maps"); -+ -+/* gets inc'd on weight tree changes to expire the cached hweights */ -+u64 hweight_gen = 1; -+ -+static u64 div_round_up(u64 dividend, u64 divisor) -+{ -+ return (dividend + divisor - 1) / divisor; -+} -+ -+static 
bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) -+{ -+ struct cgv_node *cgc_a, *cgc_b; -+ -+ cgc_a = container_of(a, struct cgv_node, rb_node); -+ cgc_b = container_of(b, struct cgv_node, rb_node); -+ -+ return cgc_a->cvtime < cgc_b->cvtime; -+} -+ -+static struct fcg_cpu_ctx *find_cpu_ctx(void) -+{ -+ struct fcg_cpu_ctx *cpuc; -+ u32 idx = 0; -+ -+ cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); -+ if (!cpuc) { -+ scx_bpf_error("cpu_ctx lookup failed"); -+ return NULL; -+ } -+ return cpuc; -+} -+ -+static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (!cgc) { -+ scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id); -+ return NULL; -+ } -+ return cgc; -+} -+ -+static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ -+ cgrp = bpf_cgroup_ancestor(cgrp, level); -+ if (!cgrp) { -+ scx_bpf_error("ancestor cgroup lookup failed"); -+ return NULL; -+ } -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ scx_bpf_error("ancestor cgrp_ctx lookup failed"); -+ bpf_cgroup_release(cgrp); -+ return cgc; -+} -+ -+static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) -+{ -+ int level; -+ -+ if (!cgc->nr_active) { -+ stat_inc(FCG_STAT_HWT_SKIP); -+ return; -+ } -+ -+ if (cgc->hweight_gen == hweight_gen) { -+ stat_inc(FCG_STAT_HWT_CACHE); -+ return; -+ } -+ -+ stat_inc(FCG_STAT_HWT_UPDATES); -+ bpf_for(level, 0, cgrp->level + 1) { -+ struct fcg_cgrp_ctx *cgc; -+ bool is_active; -+ -+ cgc = find_ancestor_cgrp_ctx(cgrp, level); -+ if (!cgc) -+ break; -+ -+ if (!level) { -+ cgc->hweight = FCG_HWEIGHT_ONE; -+ cgc->hweight_gen = hweight_gen; -+ } else { -+ struct fcg_cgrp_ctx *pcgc; -+ -+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); -+ if (!pcgc) -+ break; -+ -+ /* -+ * We can be 
opportunistic here and not grab the -+ * cgv_tree_lock and deal with the occasional races. -+ * However, hweight updates are already cached and -+ * relatively low-frequency. Let's just do the -+ * straightforward thing. -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ is_active = cgc->nr_active; -+ if (is_active) { -+ cgc->hweight_gen = pcgc->hweight_gen; -+ cgc->hweight = -+ div_round_up(pcgc->hweight * cgc->weight, -+ pcgc->child_weight_sum); -+ } -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!is_active) { -+ stat_inc(FCG_STAT_HWT_RACE); -+ break; -+ } -+ } -+ } -+} -+ -+static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) -+{ -+ u64 delta, cvtime, max_budget; -+ -+ /* -+ * A node which is on the rbtree can't be pointed to from elsewhere yet -+ * and thus can't be updated and repositioned. Instead, we collect the -+ * vtime deltas separately and apply it asynchronously here. -+ */ -+ delta = __sync_fetch_and_sub(&cgc->cvtime_delta, cgc->cvtime_delta); -+ cvtime = cgv_node->cvtime + delta; -+ -+ /* -+ * Allow a cgroup to carry the maximum budget proportional to its -+ * hweight such that a full-hweight cgroup can immediately take up half -+ * of the CPUs at the most while staying at the front of the rbtree. 
-+ */ -+ max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / -+ (2 * FCG_HWEIGHT_ONE); -+ if (vtime_before(cvtime, cvtime_now - max_budget)) -+ cvtime = cvtime_now - max_budget; -+ -+ cgv_node->cvtime = cvtime; -+} -+ -+static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) -+{ -+ struct cgv_node_stash *stash; -+ struct cgv_node *cgv_node; -+ u64 cgid = cgrp->kn->id; -+ -+ /* paired with cmpxchg in try_pick_next_cgroup() */ -+ if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { -+ stat_inc(FCG_STAT_ENQ_SKIP); -+ return; -+ } -+ -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash) { -+ scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); -+ return; -+ } -+ -+ /* NULL if the node is already on the rbtree */ -+ cgv_node = bpf_kptr_xchg(&stash->node, NULL); -+ if (!cgv_node) { -+ stat_inc(FCG_STAT_ENQ_RACE); -+ return; -+ } -+ -+ bpf_spin_lock(&cgv_tree_lock); -+ cgrp_cap_budget(cgv_node, cgc); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+} -+ -+static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc) -+{ -+ /* -+ * Tell fcg_stopping() that this bypassed the regular scheduling path -+ * and should be force charged to the cgroup. 0 is used to indicate that -+ * the task isn't bypassing, so if the current runtime is 0, go back by -+ * one nanosecond. -+ */ -+ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; -+} -+ -+s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) -+{ -+ struct fcg_task_ctx *taskc; -+ bool is_idle = false; -+ s32 cpu; -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return cpu; -+ } -+ -+ /* -+ * If select_cpu_dfl() is recommending local enqueue, the target CPU is -+ * idle. 
Follow it and charge the cgroup later in fcg_stopping() after -+ * the fact. -+ */ -+ if (is_idle) { -+ set_bypassed_at(p, taskc); -+ stat_inc(FCG_STAT_LOCAL); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); -+ } -+ -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct fcg_task_ctx *taskc; -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ /* -+ * Use the direct dispatching and force charging to deal with tasks with -+ * custom affinities so that we don't have to worry about per-cgroup -+ * dq's containing tasks that can't be executed from some CPUs. -+ */ -+ if (p->nr_cpus_allowed != nr_cpus) { -+ set_bypassed_at(p, taskc); -+ -+ /* -+ * The global dq is deprioritized as we don't want to let tasks -+ * to boost themselves by constraining its cpumask. The -+ * deprioritization is rather severe, so let's not apply that to -+ * per-cpu kernel threads. This is ham-fisted. We probably wanna -+ * implement per-cgroup fallback dq's instead so that we have -+ * more control over when tasks with custom cpumask get issued. -+ */ -+ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { -+ stat_inc(FCG_STAT_LOCAL); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -+ } else { -+ stat_inc(FCG_STAT_GLOBAL); -+ scx_bpf_dispatch(p, FALLBACK_DSQ, SCX_SLICE_DFL, enq_flags); -+ } -+ return; -+ } -+ -+ cgrp = __COMPAT_scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ goto out_release; -+ -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); -+ } else { -+ u64 tvtime = p->scx.dsq_vtime; -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. 
-+ */ -+ if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) -+ tvtime = cgc->tvtime_now - SCX_SLICE_DFL; -+ -+ scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, -+ tvtime, enq_flags); -+ } -+ -+ cgrp_enqueued(cgrp, cgc); -+out_release: -+ bpf_cgroup_release(cgrp); -+} -+ -+/* -+ * Walk the cgroup tree to update the active weight sums as tasks wake up and -+ * sleep. The weight sums are used as the base when calculating the proportion a -+ * given cgroup or task is entitled to at each level. -+ */ -+static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ bool updated = false; -+ int idx; -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ return; -+ -+ /* -+ * In most cases, a hot cgroup would have multiple threads going to -+ * sleep and waking up while the whole cgroup stays active. In leaf -+ * cgroups, ->nr_runnable which is updated with __sync operations gates -+ * ->nr_active updates, so that we don't have to grab the cgv_tree_lock -+ * repeatedly for a busy cgroup which is staying active. -+ */ -+ if (runnable) { -+ if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) -+ return; -+ stat_inc(FCG_STAT_ACT); -+ } else { -+ if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) -+ return; -+ stat_inc(FCG_STAT_DEACT); -+ } -+ -+ /* -+ * If @cgrp is becoming runnable, its hweight should be refreshed after -+ * it's added to the weight tree so that enqueue has the up-to-date -+ * value. If @cgrp is becoming quiescent, the hweight should be -+ * refreshed before it's removed from the weight tree so that the usage -+ * charging which happens afterwards has access to the latest value. 
-+ */ -+ if (!runnable) -+ cgrp_refresh_hweight(cgrp, cgc); -+ -+ /* propagate upwards */ -+ bpf_for(idx, 0, cgrp->level) { -+ int level = cgrp->level - idx; -+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; -+ bool propagate = false; -+ -+ cgc = find_ancestor_cgrp_ctx(cgrp, level); -+ if (!cgc) -+ break; -+ if (level) { -+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); -+ if (!pcgc) -+ break; -+ } -+ -+ /* -+ * We need the propagation protected by a lock to synchronize -+ * against weight changes. There's no reason to drop the lock at -+ * each level but bpf_spin_lock() doesn't want any function -+ * calls while locked. -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ -+ if (runnable) { -+ if (!cgc->nr_active++) { -+ updated = true; -+ if (pcgc) { -+ propagate = true; -+ pcgc->child_weight_sum += cgc->weight; -+ } -+ } -+ } else { -+ if (!--cgc->nr_active) { -+ updated = true; -+ if (pcgc) { -+ propagate = true; -+ pcgc->child_weight_sum -= cgc->weight; -+ } -+ } -+ } -+ -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!propagate) -+ break; -+ } -+ -+ if (updated) -+ __sync_fetch_and_add(&hweight_gen, 1); -+ -+ if (runnable) -+ cgrp_refresh_hweight(cgrp, cgc); -+} -+ -+void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) -+{ -+ struct cgroup *cgrp; -+ -+ cgrp = __COMPAT_scx_bpf_task_cgroup(p); -+ update_active_weight_sums(cgrp, true); -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) -+{ -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ if (fifo_sched) -+ return; -+ -+ cgrp = __COMPAT_scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (cgc) { -+ /* -+ * @cgc->tvtime_now always progresses forward as tasks start -+ * executing. The test and update can be performed concurrently -+ * from multiple CPUs and thus racy. Any error should be -+ * contained and temporary. Let's just live with it. 
-+ */ -+ if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) -+ cgc->tvtime_now = p->scx.dsq_vtime; -+ } -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) -+{ -+ struct fcg_task_ctx *taskc; -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ /* -+ * Scale the execution time by the inverse of the weight and charge. -+ * -+ * Note that the default yield implementation yields by setting -+ * @p->scx.slice to zero and the following would treat the yielding task -+ * as if it has consumed all its slice. If this penalizes yielding tasks -+ * too much, determine the execution time by taking explicit timestamps -+ * instead of depending on @p->scx.slice. -+ */ -+ if (!fifo_sched) -+ p->scx.dsq_vtime += -+ (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ if (!taskc->bypassed_at) -+ return; -+ -+ cgrp = __COMPAT_scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (cgc) { -+ __sync_fetch_and_add(&cgc->cvtime_delta, -+ p->se.sum_exec_runtime - taskc->bypassed_at); -+ taskc->bypassed_at = 0; -+ } -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) -+{ -+ struct cgroup *cgrp; -+ -+ cgrp = __COMPAT_scx_bpf_task_cgroup(p); -+ update_active_weight_sums(cgrp, false); -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) -+{ -+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ return; -+ -+ if (cgrp->level) { -+ pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); -+ if (!pcgc) -+ return; -+ } -+ -+ bpf_spin_lock(&cgv_tree_lock); -+ if (pcgc && cgc->nr_active) -+ pcgc->child_weight_sum += (s64)weight - cgc->weight; -+ cgc->weight = weight; -+ bpf_spin_unlock(&cgv_tree_lock); -+} -+ -+static bool 
try_pick_next_cgroup(u64 *cgidp) -+{ -+ struct bpf_rb_node *rb_node; -+ struct cgv_node_stash *stash; -+ struct cgv_node *cgv_node; -+ struct fcg_cgrp_ctx *cgc; -+ struct cgroup *cgrp; -+ u64 cgid; -+ -+ /* pop the front cgroup and wind cvtime_now accordingly */ -+ bpf_spin_lock(&cgv_tree_lock); -+ -+ rb_node = bpf_rbtree_first(&cgv_tree); -+ if (!rb_node) { -+ bpf_spin_unlock(&cgv_tree_lock); -+ stat_inc(FCG_STAT_PNC_NO_CGRP); -+ *cgidp = 0; -+ return true; -+ } -+ -+ rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!rb_node) { -+ /* -+ * This should never happen. bpf_rbtree_first() was called -+ * above while the tree lock was held, so the node should -+ * always be present. -+ */ -+ scx_bpf_error("node could not be removed"); -+ return true; -+ } -+ -+ cgv_node = container_of(rb_node, struct cgv_node, rb_node); -+ cgid = cgv_node->cgid; -+ -+ if (vtime_before(cvtime_now, cgv_node->cvtime)) -+ cvtime_now = cgv_node->cvtime; -+ -+ /* -+ * If lookup fails, the cgroup's gone. Free and move on. See -+ * fcg_cgroup_exit(). -+ */ -+ cgrp = bpf_cgroup_from_id(cgid); -+ if (!cgrp) { -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (!cgc) { -+ bpf_cgroup_release(cgrp); -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ if (!scx_bpf_consume(cgid)) { -+ bpf_cgroup_release(cgrp); -+ stat_inc(FCG_STAT_PNC_EMPTY); -+ goto out_stash; -+ } -+ -+ /* -+ * Successfully consumed from the cgroup. This will be our current -+ * cgroup for the new slice. Refresh its hweight. -+ */ -+ cgrp_refresh_hweight(cgrp, cgc); -+ -+ bpf_cgroup_release(cgrp); -+ -+ /* -+ * As the cgroup may have more tasks, add it back to the rbtree. Note -+ * that here we charge the full slice upfront and then exact later -+ * according to the actual consumption. This prevents lowpri thundering -+ * herd from saturating the machine. 
-+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1); -+ cgrp_cap_budget(cgv_node, cgc); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ *cgidp = cgid; -+ stat_inc(FCG_STAT_PNC_NEXT); -+ return true; -+ -+out_stash: -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash) { -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ /* -+ * Paired with cmpxchg in cgrp_enqueued(). If they see the following -+ * transition, they'll enqueue the cgroup. If they are earlier, we'll -+ * see their task in the dq below and requeue the cgroup. -+ */ -+ __sync_val_compare_and_swap(&cgc->queued, 1, 0); -+ -+ if (scx_bpf_dsq_nr_queued(cgid)) { -+ bpf_spin_lock(&cgv_tree_lock); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+ stat_inc(FCG_STAT_PNC_RACE); -+ } else { -+ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); -+ if (cgv_node) { -+ scx_bpf_error("unexpected !NULL cgv_node stash"); -+ goto out_free; -+ } -+ } -+ -+ return false; -+ -+out_free: -+ bpf_obj_drop(cgv_node); -+ return false; -+} -+ -+void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ struct fcg_cpu_ctx *cpuc; -+ struct fcg_cgrp_ctx *cgc; -+ struct cgroup *cgrp; -+ u64 now = bpf_ktime_get_ns(); -+ bool picked_next = false; -+ -+ cpuc = find_cpu_ctx(); -+ if (!cpuc) -+ return; -+ -+ if (!cpuc->cur_cgid) -+ goto pick_next_cgroup; -+ -+ if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { -+ if (scx_bpf_consume(cpuc->cur_cgid)) { -+ stat_inc(FCG_STAT_CNS_KEEP); -+ return; -+ } -+ stat_inc(FCG_STAT_CNS_EMPTY); -+ } else { -+ stat_inc(FCG_STAT_CNS_EXPIRE); -+ } -+ -+ /* -+ * The current cgroup is expiring. It was already charged a full slice. -+ * Calculate the actual usage and accumulate the delta. 
-+ */ -+ cgrp = bpf_cgroup_from_id(cpuc->cur_cgid); -+ if (!cgrp) { -+ stat_inc(FCG_STAT_CNS_GONE); -+ goto pick_next_cgroup; -+ } -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (cgc) { -+ /* -+ * We want to update the vtime delta and then look for the next -+ * cgroup to execute but the latter needs to be done in a loop -+ * and we can't keep the lock held. Oh well... -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ __sync_fetch_and_add(&cgc->cvtime_delta, -+ (cpuc->cur_at + cgrp_slice_ns - now) * -+ FCG_HWEIGHT_ONE / (cgc->hweight ?: 1)); -+ bpf_spin_unlock(&cgv_tree_lock); -+ } else { -+ stat_inc(FCG_STAT_CNS_GONE); -+ } -+ -+ bpf_cgroup_release(cgrp); -+ -+pick_next_cgroup: -+ cpuc->cur_at = now; -+ -+ if (scx_bpf_consume(FALLBACK_DSQ)) { -+ cpuc->cur_cgid = 0; -+ return; -+ } -+ -+ bpf_repeat(CGROUP_MAX_RETRIES) { -+ if (try_pick_next_cgroup(&cpuc->cur_cgid)) { -+ picked_next = true; -+ break; -+ } -+ } -+ -+ /* -+ * This only happens if try_pick_next_cgroup() races against enqueue -+ * path for more than CGROUP_MAX_RETRIES times, which is extremely -+ * unlikely and likely indicates an underlying bug. There shouldn't be -+ * any stall risk as the race is against enqueue. -+ */ -+ if (!picked_next) -+ stat_inc(FCG_STAT_PNC_FAIL); -+} -+ -+s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ struct fcg_task_ctx *taskc; -+ struct fcg_cgrp_ctx *cgc; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. 
-+ */ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!taskc) -+ return -ENOMEM; -+ -+ taskc->bypassed_at = 0; -+ -+ if (!(cgc = find_cgrp_ctx(args->cgroup))) -+ return -ENOENT; -+ -+ p->scx.dsq_vtime = cgc->tvtime_now; -+ -+ return 0; -+} -+ -+int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, -+ struct scx_cgroup_init_args *args) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ struct cgv_node *cgv_node; -+ struct cgv_node_stash empty_stash = {}, *stash; -+ u64 cgid = cgrp->kn->id; -+ int ret; -+ -+ /* -+ * Technically incorrect as cgroup ID is full 64bit while dsq ID is -+ * 63bit. Should not be a problem in practice and easy to spot in the -+ * unlikely case that it breaks. -+ */ -+ ret = scx_bpf_create_dsq(cgid, -1); -+ if (ret) -+ return ret; -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!cgc) { -+ ret = -ENOMEM; -+ goto err_destroy_dsq; -+ } -+ -+ cgc->weight = args->weight; -+ cgc->hweight = FCG_HWEIGHT_ONE; -+ -+ ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash, -+ BPF_NOEXIST); -+ if (ret) { -+ if (ret != -ENOMEM) -+ scx_bpf_error("unexpected stash creation error (%d)", -+ ret); -+ goto err_destroy_dsq; -+ } -+ -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash) { -+ scx_bpf_error("unexpected cgv_node stash lookup failure"); -+ ret = -ENOENT; -+ goto err_destroy_dsq; -+ } -+ -+ cgv_node = bpf_obj_new(struct cgv_node); -+ if (!cgv_node) { -+ ret = -ENOMEM; -+ goto err_del_cgv_node; -+ } -+ -+ cgv_node->cgid = cgid; -+ cgv_node->cvtime = cvtime_now; -+ -+ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); -+ if (cgv_node) { -+ scx_bpf_error("unexpected !NULL cgv_node stash"); -+ ret = -EBUSY; -+ goto err_drop; -+ } -+ -+ return 0; -+ -+err_drop: -+ bpf_obj_drop(cgv_node); -+err_del_cgv_node: -+ bpf_map_delete_elem(&cgv_node_stash, &cgid); -+err_destroy_dsq: -+ scx_bpf_destroy_dsq(cgid); -+ return ret; -+} -+ -+void 
BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) -+{ -+ u64 cgid = cgrp->kn->id; -+ -+ /* -+ * For now, there's no way find and remove the cgv_node if it's on the -+ * cgv_tree. Let's drain them in the dispatch path as they get popped -+ * off the front of the tree. -+ */ -+ bpf_map_delete_elem(&cgv_node_stash, &cgid); -+ scx_bpf_destroy_dsq(cgid); -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{ -+ struct fcg_cgrp_ctx *from_cgc, *to_cgc; -+ s64 vtime_delta; -+ -+ /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */ -+ if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to))) -+ return; -+ -+ vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now; -+ p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init) -+{ -+ return scx_bpf_create_dsq(FALLBACK_DSQ, -1); -+} -+ -+void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(flatcg_ops, -+ .select_cpu = (void *)fcg_select_cpu, -+ .enqueue = (void *)fcg_enqueue, -+ .dispatch = (void *)fcg_dispatch, -+ .runnable = (void *)fcg_runnable, -+ .running = (void *)fcg_running, -+ .stopping = (void *)fcg_stopping, -+ .quiescent = (void *)fcg_quiescent, -+ .init_task = (void *)fcg_init_task, -+ .cgroup_set_weight = (void *)fcg_cgroup_set_weight, -+ .cgroup_init = (void *)fcg_cgroup_init, -+ .cgroup_exit = (void *)fcg_cgroup_exit, -+ .cgroup_move = (void *)fcg_cgroup_move, -+ .init = (void *)fcg_init, -+ .exit = (void *)fcg_exit, -+ .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, -+ .name = "flatcg"); -diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c -new file mode 100644 -index 000000000000..5d24ca9c29d9 ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.c -@@ -0,0 +1,233 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_flatcg.h" -+#include "scx_flatcg.bpf.skel.h" -+ -+#ifndef FILEID_KERNFS -+#define FILEID_KERNFS 0xfe -+#endif -+ -+const char help_fmt[] = -+"A flattened cgroup hierarchy sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -i INTERVAL Report interval\n" -+" -f Use FIFO scheduling instead of weighted vtime scheduling\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) -+{ -+ FILE *fp; -+ char buf[4096]; -+ char *line, *cur = NULL, *tok; -+ __u64 sum = 0, idle = 0; -+ __u64 delta_sum, delta_idle; -+ int idx; -+ -+ fp = fopen("/proc/stat", "r"); -+ if (!fp) { -+ perror("fopen(\"/proc/stat\")"); -+ return 0.0; -+ } -+ -+ if (!fgets(buf, sizeof(buf), fp)) { -+ perror("fgets(\"/proc/stat\")"); -+ fclose(fp); -+ return 0.0; -+ } -+ fclose(fp); -+ -+ line = buf; -+ for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) { -+ char *endp = NULL; -+ __u64 v; -+ -+ if (idx == 0) { -+ line = NULL; -+ continue; -+ } -+ v = strtoull(tok, &endp, 0); -+ if (!endp || *endp != '\0') { -+ fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n", -+ idx, tok); -+ continue; -+ } -+ sum += v; -+ if (idx == 4) -+ idle = v; -+ } -+ -+ delta_sum = sum - *last_sum; -+ delta_idle = idle - *last_idle; -+ *last_sum 
= sum; -+ *last_idle = idle; -+ -+ return delta_sum ? (float)(delta_sum - delta_idle) / delta_sum : 0.0; -+} -+ -+static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats) -+{ -+ __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; -+ __u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS); -+ -+ for (idx = 0; idx < FCG_NR_STATS; idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_flatcg *skel; -+ struct bpf_link *link; -+ struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; -+ bool dump_cgrps = false; -+ __u64 last_cpu_sum = 0, last_cpu_idle = 0; -+ __u64 last_stats[FCG_NR_STATS] = {}; -+ unsigned long seq = 0; -+ __s32 opt; -+ __u64 ecode; -+ -+ libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+restart: -+ skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); -+ -+ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); -+ -+ while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { -+ double v; -+ -+ switch (opt) { -+ case 's': -+ v = strtod(optarg, NULL); -+ skel->rodata->cgrp_slice_ns = v * 1000; -+ break; -+ case 'i': -+ v = strtod(optarg, NULL); -+ intv_ts.tv_sec = v; -+ intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; -+ break; -+ case 'd': -+ dump_cgrps = true; -+ break; -+ case 'f': -+ skel->rodata->fifo_sched = true; -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ case 'h': -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", -+ (double)skel->rodata->cgrp_slice_ns / 1000000.0, -+ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, -+ dump_cgrps); -+ -+ SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei); -+ link = SCX_OPS_ATTACH(skel, 
flatcg_ops, scx_flatcg); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) { -+ __u64 acc_stats[FCG_NR_STATS]; -+ __u64 stats[FCG_NR_STATS]; -+ float cpu_util; -+ int i; -+ -+ cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle); -+ -+ fcg_read_stats(skel, acc_stats); -+ for (i = 0; i < FCG_NR_STATS; i++) -+ stats[i] = acc_stats[i] - last_stats[i]; -+ -+ memcpy(last_stats, acc_stats, sizeof(acc_stats)); -+ -+ printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", -+ seq++, cpu_util * 100.0, skel->data->hweight_gen); -+ printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n", -+ stats[FCG_STAT_ACT], -+ stats[FCG_STAT_DEACT], -+ stats[FCG_STAT_GLOBAL], -+ stats[FCG_STAT_LOCAL]); -+ printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n", -+ stats[FCG_STAT_HWT_CACHE], -+ stats[FCG_STAT_HWT_UPDATES], -+ stats[FCG_STAT_HWT_SKIP], -+ stats[FCG_STAT_HWT_RACE]); -+ printf("ENQ skip:%6llu race:%6llu\n", -+ stats[FCG_STAT_ENQ_SKIP], -+ stats[FCG_STAT_ENQ_RACE]); -+ printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", -+ stats[FCG_STAT_CNS_KEEP], -+ stats[FCG_STAT_CNS_EXPIRE], -+ stats[FCG_STAT_CNS_EMPTY], -+ stats[FCG_STAT_CNS_GONE]); -+ printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n", -+ stats[FCG_STAT_PNC_NEXT], -+ stats[FCG_STAT_PNC_EMPTY], -+ stats[FCG_STAT_PNC_NO_CGRP], -+ stats[FCG_STAT_PNC_GONE], -+ stats[FCG_STAT_PNC_RACE], -+ stats[FCG_STAT_PNC_FAIL]); -+ printf("BAD remove:%6llu\n", -+ acc_stats[FCG_STAT_BAD_REMOVAL]); -+ fflush(stdout); -+ -+ nanosleep(&intv_ts, NULL); -+ } -+ -+ bpf_link__destroy(link); -+ ecode = UEI_REPORT(skel, uei); -+ scx_flatcg__destroy(skel); -+ -+ if (UEI_ECODE_RESTART(ecode)) -+ goto restart; -+ return 0; -+} -diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h -new file mode 100644 -index 000000000000..6f2ea50acb1c ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.h -@@ -0,0 +1,51 @@ -+#ifndef __SCX_EXAMPLE_FLATCG_H -+#define __SCX_EXAMPLE_FLATCG_H -+ 
-+enum { -+ FCG_HWEIGHT_ONE = 1LLU << 16, -+}; -+ -+enum fcg_stat_idx { -+ FCG_STAT_ACT, -+ FCG_STAT_DEACT, -+ FCG_STAT_LOCAL, -+ FCG_STAT_GLOBAL, -+ -+ FCG_STAT_HWT_UPDATES, -+ FCG_STAT_HWT_CACHE, -+ FCG_STAT_HWT_SKIP, -+ FCG_STAT_HWT_RACE, -+ -+ FCG_STAT_ENQ_SKIP, -+ FCG_STAT_ENQ_RACE, -+ -+ FCG_STAT_CNS_KEEP, -+ FCG_STAT_CNS_EXPIRE, -+ FCG_STAT_CNS_EMPTY, -+ FCG_STAT_CNS_GONE, -+ -+ FCG_STAT_PNC_NO_CGRP, -+ FCG_STAT_PNC_NEXT, -+ FCG_STAT_PNC_EMPTY, -+ FCG_STAT_PNC_GONE, -+ FCG_STAT_PNC_RACE, -+ FCG_STAT_PNC_FAIL, -+ -+ FCG_STAT_BAD_REMOVAL, -+ -+ FCG_NR_STATS, -+}; -+ -+struct fcg_cgrp_ctx { -+ u32 nr_active; -+ u32 nr_runnable; -+ u32 queued; -+ u32 weight; -+ u32 hweight; -+ u64 child_weight_sum; -+ u64 hweight_gen; -+ s64 cvtime_delta; -+ u64 tvtime_now; -+}; -+ -+#endif /* __SCX_EXAMPLE_FLATCG_H */ -diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c -new file mode 100644 -index 000000000000..5b39bee9eb23 ---- /dev/null -+++ b/tools/sched_ext/scx_qmap.bpf.c -@@ -0,0 +1,813 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A simple five-level FIFO queue scheduler. -+ * -+ * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets -+ * assigned to one depending on its compound weight. Each CPU round robins -+ * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from -+ * queue0, 2 from queue1, 4 from queue2 and so on. -+ * -+ * This scheduler demonstrates: -+ * -+ * - BPF-side queueing using PIDs. -+ * - Sleepable per-task storage allocation using ops.prep_enable(). -+ * - Using ops.cpu_release() to handle a higher priority scheduling class taking -+ * the CPU away. -+ * - Core-sched support. -+ * -+ * This scheduler is primarily for demonstration and testing of sched_ext -+ * features and unlikely to be useful for actual workloads. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+ -+enum consts { -+ ONE_SEC_IN_NS = 1000000000, -+ SHARED_DSQ = 0, -+ HIGHPRI_DSQ = 1, -+ HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */ -+}; -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+const volatile u32 stall_user_nth; -+const volatile u32 stall_kernel_nth; -+const volatile u32 dsp_inf_loop_after; -+const volatile u32 dsp_batch; -+const volatile bool highpri_boosting; -+const volatile bool print_shared_dsq; -+const volatile s32 disallow_tgid; -+const volatile bool suppress_dump; -+ -+u64 nr_highpri_queued; -+u32 test_error_cnt; -+ -+UEI_DEFINE(uei); -+ -+struct qmap { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 4096); -+ __type(value, u32); -+} queue0 SEC(".maps"), -+ queue1 SEC(".maps"), -+ queue2 SEC(".maps"), -+ queue3 SEC(".maps"), -+ queue4 SEC(".maps"); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); -+ __uint(max_entries, 5); -+ __type(key, int); -+ __array(values, struct qmap); -+} queue_arr SEC(".maps") = { -+ .values = { -+ [0] = &queue0, -+ [1] = &queue1, -+ [2] = &queue2, -+ [3] = &queue3, -+ [4] = &queue4, -+ }, -+}; -+ -+/* -+ * If enabled, CPU performance target is set according to the queue index -+ * according to the following table. -+ */ -+static const u32 qidx_to_cpuperf_target[] = { -+ [0] = SCX_CPUPERF_ONE * 0 / 4, -+ [1] = SCX_CPUPERF_ONE * 1 / 4, -+ [2] = SCX_CPUPERF_ONE * 2 / 4, -+ [3] = SCX_CPUPERF_ONE * 3 / 4, -+ [4] = SCX_CPUPERF_ONE * 4 / 4, -+}; -+ -+/* -+ * Per-queue sequence numbers to implement core-sched ordering. -+ * -+ * Tail seq is assigned to each queued task and incremented. Head seq tracks the -+ * sequence number of the latest dispatched task. The distance between the a -+ * task's seq and the associated queue's head seq is called the queue distance -+ * and used when comparing two tasks for ordering. See qmap_core_sched_before(). 
-+ */ -+static u64 core_sched_head_seqs[5]; -+static u64 core_sched_tail_seqs[5]; -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ bool force_local; /* Dispatch directly to local_dsq */ -+ bool highpri; -+ u64 core_sched_seq; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+struct cpu_ctx { -+ u64 dsp_idx; /* dispatch index */ -+ u64 dsp_cnt; /* remaining count */ -+ u32 avg_weight; -+ u32 cpuperf_target; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct cpu_ctx); -+} cpu_ctx_stor SEC(".maps"); -+ -+/* Statistics */ -+u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq; -+u64 nr_core_sched_execed; -+u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer; -+u32 cpuperf_min, cpuperf_avg, cpuperf_max; -+u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; -+ -+static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu) -+{ -+ s32 cpu; -+ -+ if (p->nr_cpus_allowed == 1 || -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) -+ return prev_cpu; -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ return cpu; -+ -+ return -1; -+} -+ -+static struct task_ctx *lookup_task_ctx(struct task_struct *p) -+{ -+ struct task_ctx *tctx; -+ -+ if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return NULL; -+ } -+ return tctx; -+} -+ -+s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ struct task_ctx *tctx; -+ s32 cpu; -+ -+ if (!(tctx = lookup_task_ctx(p))) -+ return -ESRCH; -+ -+ cpu = pick_direct_dispatch_cpu(p, prev_cpu); -+ -+ if (cpu >= 0) { -+ tctx->force_local = true; -+ return cpu; -+ } else { -+ return prev_cpu; -+ } -+} -+ -+static 
int weight_to_idx(u32 weight) -+{ -+ /* Coarsely map the compound weight to a FIFO. */ -+ if (weight <= 25) -+ return 0; -+ else if (weight <= 50) -+ return 1; -+ else if (weight < 200) -+ return 2; -+ else if (weight < 400) -+ return 3; -+ else -+ return 4; -+} -+ -+void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ static u32 user_cnt, kernel_cnt; -+ struct task_ctx *tctx; -+ u32 pid = p->pid; -+ int idx = weight_to_idx(p->scx.weight); -+ void *ring; -+ s32 cpu; -+ -+ if (p->flags & PF_KTHREAD) { -+ if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) -+ return; -+ } else { -+ if (stall_user_nth && !(++user_cnt % stall_user_nth)) -+ return; -+ } -+ -+ if (test_error_cnt && !--test_error_cnt) -+ scx_bpf_error("test triggering error"); -+ -+ if (!(tctx = lookup_task_ctx(p))) -+ return; -+ -+ /* -+ * All enqueued tasks must have their core_sched_seq updated for correct -+ * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in -+ * qmap_ops.flags. -+ */ -+ tctx->core_sched_seq = core_sched_tail_seqs[idx]++; -+ -+ /* -+ * If qmap_select_cpu() is telling us to or this is the last runnable -+ * task on the CPU, enqueue locally. -+ */ -+ if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { -+ tctx->force_local = false; -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */ -+ if (!(enq_flags & SCX_ENQ_WAKEUP) && -+ (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { -+ __sync_fetch_and_add(&nr_ddsp_from_enq, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); -+ return; -+ } -+ -+ /* -+ * If the task was re-enqueued due to the CPU being preempted by a -+ * higher priority scheduling class, just re-enqueue the task directly -+ * on the global DSQ. As we want another CPU to pick it up, find and -+ * kick an idle CPU. 
-+ */ -+ if (enq_flags & SCX_ENQ_REENQ) { -+ s32 cpu; -+ -+ scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags); -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); -+ return; -+ } -+ -+ ring = bpf_map_lookup_elem(&queue_arr, &idx); -+ if (!ring) { -+ scx_bpf_error("failed to find ring %d", idx); -+ return; -+ } -+ -+ /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ -+ if (bpf_map_push_elem(ring, &pid, 0)) { -+ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags); -+ return; -+ } -+ -+ if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) { -+ tctx->highpri = true; -+ __sync_fetch_and_add(&nr_highpri_queued, 1); -+ } -+ __sync_fetch_and_add(&nr_enqueued, 1); -+} -+ -+/* -+ * The BPF queue map doesn't support removal and sched_ext can handle spurious -+ * dispatches. qmap_dequeue() is only used to collect statistics. -+ */ -+void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) -+{ -+ __sync_fetch_and_add(&nr_dequeued, 1); -+ if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) -+ __sync_fetch_and_add(&nr_core_sched_execed, 1); -+} -+ -+static void update_core_sched_head_seq(struct task_struct *p) -+{ -+ int idx = weight_to_idx(p->scx.weight); -+ struct task_ctx *tctx; -+ -+ if ((tctx = lookup_task_ctx(p))) -+ core_sched_head_seqs[idx] = tctx->core_sched_seq; -+} -+ -+/* -+ * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly -+ * selective priority boosting mechanism by scanning SHARED_DSQ looking for -+ * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This -+ * makes minor difference only when dsp_batch is larger than 1. -+ * -+ * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and -+ * non-rq-lock holding BPF programs. As demonstration, this function is called -+ * from qmap_dispatch() and monitor_timerfn(). 
-+ */ -+static bool dispatch_highpri(bool from_timer) -+{ -+ struct task_struct *p; -+ s32 this_cpu = bpf_get_smp_processor_id(); -+ -+ /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */ -+ bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) { -+ static u64 highpri_seq; -+ struct task_ctx *tctx; -+ -+ if (!(tctx = lookup_task_ctx(p))) -+ return false; -+ -+ if (tctx->highpri) { -+ /* exercise the set_*() and vtime interface too */ -+ __COMPAT_scx_bpf_dispatch_from_dsq_set_slice( -+ BPF_FOR_EACH_ITER, slice_ns * 2); -+ __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime( -+ BPF_FOR_EACH_ITER, highpri_seq++); -+ __COMPAT_scx_bpf_dispatch_vtime_from_dsq( -+ BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); -+ } -+ } -+ -+ /* -+ * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU -+ * is found. -+ */ -+ bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) { -+ bool dispatched = false; -+ s32 cpu; -+ -+ if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr)) -+ cpu = this_cpu; -+ else -+ cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); -+ -+ if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, -+ SCX_DSQ_LOCAL_ON | cpu, -+ SCX_ENQ_PREEMPT)) { -+ if (cpu == this_cpu) { -+ dispatched = true; -+ __sync_fetch_and_add(&nr_expedited_local, 1); -+ } else { -+ __sync_fetch_and_add(&nr_expedited_remote, 1); -+ } -+ if (from_timer) -+ __sync_fetch_and_add(&nr_expedited_from_timer, 1); -+ } else { -+ __sync_fetch_and_add(&nr_expedited_lost, 1); -+ } -+ -+ if (dispatched) -+ return true; -+ } -+ -+ return false; -+} -+ -+void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ struct task_struct *p; -+ struct cpu_ctx *cpuc; -+ u32 zero = 0, batch = dsp_batch ?: 1; -+ void *fifo; -+ s32 i, pid; -+ -+ if (dispatch_highpri(false)) -+ return; -+ -+ if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ)) -+ return; -+ -+ if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { -+ /* -+ * PID 2 should be kthreadd which should mostly be idle and off -+ * the scheduler. 
Let's keep dispatching it to force the kernel -+ * to call this function over and over again. -+ */ -+ p = bpf_task_from_pid(2); -+ if (p) { -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); -+ bpf_task_release(p); -+ return; -+ } -+ } -+ -+ if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { -+ scx_bpf_error("failed to look up cpu_ctx"); -+ return; -+ } -+ -+ for (i = 0; i < 5; i++) { -+ /* Advance the dispatch cursor and pick the fifo. */ -+ if (!cpuc->dsp_cnt) { -+ cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5; -+ cpuc->dsp_cnt = 1 << cpuc->dsp_idx; -+ } -+ -+ fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx); -+ if (!fifo) { -+ scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx); -+ return; -+ } -+ -+ /* Dispatch or advance. */ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ struct task_ctx *tctx; -+ -+ if (bpf_map_pop_elem(fifo, &pid)) -+ break; -+ -+ p = bpf_task_from_pid(pid); -+ if (!p) -+ continue; -+ -+ if (!(tctx = lookup_task_ctx(p))) { -+ bpf_task_release(p); -+ return; -+ } -+ -+ if (tctx->highpri) -+ __sync_fetch_and_sub(&nr_highpri_queued, 1); -+ -+ update_core_sched_head_seq(p); -+ __sync_fetch_and_add(&nr_dispatched, 1); -+ -+ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); -+ bpf_task_release(p); -+ -+ batch--; -+ cpuc->dsp_cnt--; -+ if (!batch || !scx_bpf_dispatch_nr_slots()) { -+ if (dispatch_highpri(false)) -+ return; -+ scx_bpf_consume(SHARED_DSQ); -+ return; -+ } -+ if (!cpuc->dsp_cnt) -+ break; -+ } -+ -+ cpuc->dsp_cnt = 0; -+ } -+} -+ -+void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) -+{ -+ struct cpu_ctx *cpuc; -+ u32 zero = 0; -+ int idx; -+ -+ if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { -+ scx_bpf_error("failed to look up cpu_ctx"); -+ return; -+ } -+ -+ /* -+ * Use the running avg of weights to select the target cpuperf level. -+ * This is a demonstration of the cpuperf feature rather than a -+ * practical strategy to regulate CPU frequency. 
-+ */ -+ cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4; -+ idx = weight_to_idx(cpuc->avg_weight); -+ cpuc->cpuperf_target = qidx_to_cpuperf_target[idx]; -+ -+ scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target); -+} -+ -+/* -+ * The distance from the head of the queue scaled by the weight of the queue. -+ * The lower the number, the older the task and the higher the priority. -+ */ -+static s64 task_qdist(struct task_struct *p) -+{ -+ int idx = weight_to_idx(p->scx.weight); -+ struct task_ctx *tctx; -+ s64 qdist; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return 0; -+ } -+ -+ qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; -+ -+ /* -+ * As queue index increments, the priority doubles. The queue w/ index 3 -+ * is dispatched twice more frequently than 2. Reflect the difference by -+ * scaling qdists accordingly. Note that the shift amount needs to be -+ * flipped depending on the sign to avoid flipping priority direction. -+ */ -+ if (qdist >= 0) -+ return qdist << (4 - idx); -+ else -+ return qdist << idx; -+} -+ -+/* -+ * This is called to determine the task ordering when core-sched is picking -+ * tasks to execute on SMT siblings and should encode about the same ordering as -+ * the regular scheduling path. Use the priority-scaled distances from the head -+ * of the queues to compare the two tasks which should be consistent with the -+ * dispatch path behavior. -+ */ -+bool BPF_STRUCT_OPS(qmap_core_sched_before, -+ struct task_struct *a, struct task_struct *b) -+{ -+ return task_qdist(a) > task_qdist(b); -+} -+ -+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) -+{ -+ u32 cnt; -+ -+ /* -+ * Called when @cpu is taken by a higher priority scheduling class. This -+ * makes @cpu no longer available for executing sched_ext tasks. 
As we -+ * don't want the tasks in @cpu's local dsq to sit there until @cpu -+ * becomes available again, re-enqueue them into the global dsq. See -+ * %SCX_ENQ_REENQ handling in qmap_enqueue(). -+ */ -+ cnt = scx_bpf_reenqueue_local(); -+ if (cnt) -+ __sync_fetch_and_add(&nr_reenqueued, cnt); -+} -+ -+s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ if (p->tgid == disallow_tgid) -+ p->scx.disallow = true; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. -+ */ -+ if (bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE)) -+ return 0; -+ else -+ return -ENOMEM; -+} -+ -+void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) -+{ -+ s32 i, pid; -+ -+ if (suppress_dump) -+ return; -+ -+ bpf_for(i, 0, 5) { -+ void *fifo; -+ -+ if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) -+ return; -+ -+ scx_bpf_dump("QMAP FIFO[%d]:", i); -+ bpf_repeat(4096) { -+ if (bpf_map_pop_elem(fifo, &pid)) -+ break; -+ scx_bpf_dump(" %d", pid); -+ } -+ scx_bpf_dump("\n"); -+ } -+} -+ -+void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) -+{ -+ u32 zero = 0; -+ struct cpu_ctx *cpuc; -+ -+ if (suppress_dump || idle) -+ return; -+ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) -+ return; -+ -+ scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u", -+ cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight, -+ cpuc->cpuperf_target); -+} -+ -+void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) -+{ -+ struct task_ctx *taskc; -+ -+ if (suppress_dump) -+ return; -+ if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) -+ return; -+ -+ scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu", -+ taskc->force_local, taskc->core_sched_seq); -+} -+ -+/* -+ * Print out the online and possible CPU map using 
bpf_printk() as a -+ * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). -+ */ -+static void print_cpus(void) -+{ -+ const struct cpumask *possible, *online; -+ s32 cpu; -+ char buf[128] = "", *p; -+ int idx; -+ -+ possible = scx_bpf_get_possible_cpumask(); -+ online = scx_bpf_get_online_cpumask(); -+ -+ idx = 0; -+ bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { -+ if (!(p = MEMBER_VPTR(buf, [idx++]))) -+ break; -+ if (bpf_cpumask_test_cpu(cpu, online)) -+ *p++ = 'O'; -+ else if (bpf_cpumask_test_cpu(cpu, possible)) -+ *p++ = 'X'; -+ else -+ *p++ = ' '; -+ -+ if ((cpu & 7) == 7) { -+ if (!(p = MEMBER_VPTR(buf, [idx++]))) -+ break; -+ *p++ = '|'; -+ } -+ } -+ buf[sizeof(buf) - 1] = '\0'; -+ -+ scx_bpf_put_cpumask(online); -+ scx_bpf_put_cpumask(possible); -+ -+ bpf_printk("CPUS: |%s", buf); -+} -+ -+void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) -+{ -+ bpf_printk("CPU %d coming online", cpu); -+ /* @cpu is already online at this point */ -+ print_cpus(); -+} -+ -+void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) -+{ -+ bpf_printk("CPU %d going offline", cpu); -+ /* @cpu is still online at this point */ -+ print_cpus(); -+} -+ -+struct monitor_timer { -+ struct bpf_timer timer; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct monitor_timer); -+} monitor_timer SEC(".maps"); -+ -+/* -+ * Print out the min, avg and max performance levels of CPUs every second to -+ * demonstrate the cpuperf interface. 
-+ */ -+static void monitor_cpuperf(void) -+{ -+ u32 zero = 0, nr_cpu_ids; -+ u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0; -+ u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0; -+ const struct cpumask *online; -+ int i, nr_online_cpus = 0; -+ -+ nr_cpu_ids = scx_bpf_nr_cpu_ids(); -+ online = scx_bpf_get_online_cpumask(); -+ -+ bpf_for(i, 0, nr_cpu_ids) { -+ struct cpu_ctx *cpuc; -+ u32 cap, cur; -+ -+ if (!bpf_cpumask_test_cpu(i, online)) -+ continue; -+ nr_online_cpus++; -+ -+ /* collect the capacity and current cpuperf */ -+ cap = scx_bpf_cpuperf_cap(i); -+ cur = scx_bpf_cpuperf_cur(i); -+ -+ cur_min = cur < cur_min ? cur : cur_min; -+ cur_max = cur > cur_max ? cur : cur_max; -+ -+ /* -+ * $cur is relative to $cap. Scale it down accordingly so that -+ * it's in the same scale as other CPUs and $cur_sum/$cap_sum -+ * makes sense. -+ */ -+ cur_sum += cur * cap / SCX_CPUPERF_ONE; -+ cap_sum += cap; -+ -+ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) { -+ scx_bpf_error("failed to look up cpu_ctx"); -+ goto out; -+ } -+ -+ /* collect target */ -+ cur = cpuc->cpuperf_target; -+ target_sum += cur; -+ target_min = cur < target_min ? cur : target_min; -+ target_max = cur > target_max ? cur : target_max; -+ } -+ -+ cpuperf_min = cur_min; -+ cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; -+ cpuperf_max = cur_max; -+ -+ cpuperf_target_min = target_min; -+ cpuperf_target_avg = target_sum / nr_online_cpus; -+ cpuperf_target_max = target_max; -+out: -+ scx_bpf_put_cpumask(online); -+} -+ -+/* -+ * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of -+ * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to -+ * see meaningful dumps in the trace pipe. 
-+ */ -+static void dump_shared_dsq(void) -+{ -+ struct task_struct *p; -+ s32 nr; -+ -+ if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ))) -+ return; -+ -+ bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr); -+ -+ bpf_rcu_read_lock(); -+ bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV) -+ bpf_printk("%s[%d]", p->comm, p->pid); -+ bpf_rcu_read_unlock(); -+} -+ -+static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) -+{ -+ bpf_rcu_read_lock(); -+ dispatch_highpri(true); -+ bpf_rcu_read_unlock(); -+ -+ monitor_cpuperf(); -+ -+ if (print_shared_dsq) -+ dump_shared_dsq(); -+ -+ bpf_timer_start(timer, ONE_SEC_IN_NS, 0); -+ return 0; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) -+{ -+ u32 key = 0; -+ struct bpf_timer *timer; -+ s32 ret; -+ -+ print_cpus(); -+ -+ ret = scx_bpf_create_dsq(SHARED_DSQ, -1); -+ if (ret) -+ return ret; -+ -+ ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1); -+ if (ret) -+ return ret; -+ -+ timer = bpf_map_lookup_elem(&monitor_timer, &key); -+ if (!timer) -+ return -ESRCH; -+ -+ bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC); -+ bpf_timer_set_callback(timer, monitor_timerfn); -+ -+ return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); -+} -+ -+void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(qmap_ops, -+ .select_cpu = (void *)qmap_select_cpu, -+ .enqueue = (void *)qmap_enqueue, -+ .dequeue = (void *)qmap_dequeue, -+ .dispatch = (void *)qmap_dispatch, -+ .tick = (void *)qmap_tick, -+ .core_sched_before = (void *)qmap_core_sched_before, -+ .cpu_release = (void *)qmap_cpu_release, -+ .init_task = (void *)qmap_init_task, -+ .dump = (void *)qmap_dump, -+ .dump_cpu = (void *)qmap_dump_cpu, -+ .dump_task = (void *)qmap_dump_task, -+ .cpu_online = (void *)qmap_cpu_online, -+ .cpu_offline = (void *)qmap_cpu_offline, -+ .init = (void *)qmap_init, -+ .exit = (void *)qmap_exit, -+ .flags = SCX_OPS_ENQ_LAST, -+ .timeout_ms = 5000U, -+ .name = "qmap"); -diff --git 
a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c -new file mode 100644 -index 000000000000..ac45a02b4055 ---- /dev/null -+++ b/tools/sched_ext/scx_qmap.c -@@ -0,0 +1,153 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_qmap.bpf.skel.h" -+ -+const char help_fmt[] = -+"A simple five-level FIFO queue sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -+" [-P] [-d PID] [-D LEN] [-p] [-v]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" -+" -t COUNT Stall every COUNT'th user thread\n" -+" -T COUNT Stall every COUNT'th kernel thread\n" -+" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" -+" -b COUNT Dispatch upto COUNT tasks together\n" -+" -P Print out DSQ content to trace_pipe every second, use with -b\n" -+" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" -+" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" -+" -D LEN Set scx_exit_info.dump buffer length\n" -+" -S Suppress qmap-specific debug dump\n" -+" -p Switch only tasks on SCHED_EXT policy instead of all\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_qmap *skel; -+ struct bpf_link *link; -+ int opt; -+ -+ 
libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); -+ -+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { -+ switch (opt) { -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'e': -+ skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); -+ break; -+ case 't': -+ skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); -+ break; -+ case 'T': -+ skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); -+ break; -+ case 'l': -+ skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); -+ break; -+ case 'b': -+ skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); -+ break; -+ case 'P': -+ skel->rodata->print_shared_dsq = true; -+ break; -+ case 'H': -+ skel->rodata->highpri_boosting = true; -+ break; -+ case 'd': -+ skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); -+ if (skel->rodata->disallow_tgid < 0) -+ skel->rodata->disallow_tgid = getpid(); -+ break; -+ case 'D': -+ skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0); -+ break; -+ case 'S': -+ skel->rodata->suppress_dump = true; -+ break; -+ case 'p': -+ skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL; -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); -+ link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) { -+ long nr_enqueued = skel->bss->nr_enqueued; -+ long nr_dispatched = skel->bss->nr_dispatched; -+ -+ printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n", -+ nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, -+ skel->bss->nr_reenqueued, skel->bss->nr_dequeued, -+ skel->bss->nr_core_sched_execed, -+ skel->bss->nr_ddsp_from_enq); -+ printf(" exp_local=%"PRIu64" 
exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n", -+ skel->bss->nr_expedited_local, -+ skel->bss->nr_expedited_remote, -+ skel->bss->nr_expedited_from_timer, -+ skel->bss->nr_expedited_lost); -+ if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) -+ printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", -+ skel->bss->cpuperf_min, -+ skel->bss->cpuperf_avg, -+ skel->bss->cpuperf_max, -+ skel->bss->cpuperf_target_min, -+ skel->bss->cpuperf_target_avg, -+ skel->bss->cpuperf_target_max); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ UEI_REPORT(skel, uei); -+ scx_qmap__destroy(skel); -+ /* -+ * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart -+ * on CPU hotplug events. -+ */ -+ return 0; -+} -diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py -new file mode 100644 -index 000000000000..8bc626ede1c4 ---- /dev/null -+++ b/tools/sched_ext/scx_show_state.py -@@ -0,0 +1,40 @@ -+#!/usr/bin/env drgn -+# -+# Copyright (C) 2024 Tejun Heo -+# Copyright (C) 2024 Meta Platforms, Inc. and affiliates. -+ -+desc = """ -+This is a drgn script to show the current sched_ext state. -+For more info on drgn, visit https://github.com/osandov/drgn. 
-+""" -+ -+import drgn -+import sys -+ -+def err(s): -+ print(s, file=sys.stderr, flush=True) -+ sys.exit(1) -+ -+def read_int(name): -+ return int(prog[name].value_()) -+ -+def read_atomic(name): -+ return prog[name].counter.value_() -+ -+def read_static_key(name): -+ return prog[name].key.enabled.counter.value_() -+ -+def ops_state_str(state): -+ return prog['scx_ops_enable_state_str'][state].string_().decode() -+ -+ops = prog['scx_ops'] -+enable_state = read_atomic("scx_ops_enable_state_var") -+ -+print(f'ops : {ops.name.string_().decode()}') -+print(f'enabled : {read_static_key("__scx_ops_enabled")}') -+print(f'switching_all : {read_int("scx_switching_all")}') -+print(f'switched_all : {read_static_key("__scx_switched_all")}') -+print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') -+print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') -+print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') -+print(f'enable_seq : {read_atomic("scx_enable_seq")}') -diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c -new file mode 100644 -index 000000000000..ed7e8d535fc5 ---- /dev/null -+++ b/tools/sched_ext/scx_simple.bpf.c -@@ -0,0 +1,156 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A simple scheduler. -+ * -+ * By default, it operates as a simple global weighted vtime scheduler and can -+ * be switched to FIFO scheduling. It also demonstrates the following niceties. -+ * -+ * - Statistics tracking how many tasks are queued to local and global dsq's. -+ * - Termination notification for userspace. -+ * -+ * While very simple, this scheduler should work reasonably well on CPUs with a -+ * uniform L3 cache topology. While preemption is not implemented, the fact that -+ * the scheduling queue is shared across all CPUs means that whatever is at the -+ * front of the queue is likely to be executed fairly quickly given enough -+ * number of CPUs. 
The FIFO scheduling mode may be beneficial to some workloads -+ * but comes with the usual problems with FIFO scheduling where saturating -+ * threads can easily drown out interactive ones. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile bool fifo_sched; -+ -+static u64 vtime_now; -+UEI_DEFINE(uei); -+ -+/* -+ * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues -+ * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We -+ * therefore create a separate DSQ with ID 0 that we dispatch to and consume -+ * from. If scx_simple only supported global FIFO scheduling, then we could -+ * just use SCX_DSQ_GLOBAL. -+ */ -+#define SHARED_DSQ 0 -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(key_size, sizeof(u32)); -+ __uint(value_size, sizeof(u64)); -+ __uint(max_entries, 2); /* [local, global] */ -+} stats SEC(".maps"); -+ -+static void stat_inc(u32 idx) -+{ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) -+{ -+ bool is_idle = false; -+ s32 cpu; -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); -+ if (is_idle) { -+ stat_inc(0); /* count local queueing */ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); -+ } -+ -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ stat_inc(1); /* count global queueing */ -+ -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); -+ } else { -+ u64 vtime = p->scx.dsq_vtime; -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. 
-+ */ -+ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) -+ vtime = vtime_now - SCX_SLICE_DFL; -+ -+ scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, -+ enq_flags); -+ } -+} -+ -+void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ scx_bpf_consume(SHARED_DSQ); -+} -+ -+void BPF_STRUCT_OPS(simple_running, struct task_struct *p) -+{ -+ if (fifo_sched) -+ return; -+ -+ /* -+ * Global vtime always progresses forward as tasks start executing. The -+ * test and update can be performed concurrently from multiple CPUs and -+ * thus racy. Any error should be contained and temporary. Let's just -+ * live with it. -+ */ -+ if (vtime_before(vtime_now, p->scx.dsq_vtime)) -+ vtime_now = p->scx.dsq_vtime; -+} -+ -+void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) -+{ -+ if (fifo_sched) -+ return; -+ -+ /* -+ * Scale the execution time by the inverse of the weight and charge. -+ * -+ * Note that the default yield implementation yields by setting -+ * @p->scx.slice to zero and the following would treat the yielding task -+ * as if it has consumed all its slice. If this penalizes yielding tasks -+ * too much, determine the execution time by taking explicit timestamps -+ * instead of depending on @p->scx.slice. 
-+ */ -+ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+} -+ -+void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) -+{ -+ p->scx.dsq_vtime = vtime_now; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) -+{ -+ return scx_bpf_create_dsq(SHARED_DSQ, -1); -+} -+ -+void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(simple_ops, -+ .select_cpu = (void *)simple_select_cpu, -+ .enqueue = (void *)simple_enqueue, -+ .dispatch = (void *)simple_dispatch, -+ .running = (void *)simple_running, -+ .stopping = (void *)simple_stopping, -+ .enable = (void *)simple_enable, -+ .init = (void *)simple_init, -+ .exit = (void *)simple_exit, -+ .name = "simple"); -diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c -new file mode 100644 -index 000000000000..76d83199545c ---- /dev/null -+++ b/tools/sched_ext/scx_simple.c -@@ -0,0 +1,107 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_simple.bpf.skel.h" -+ -+const char help_fmt[] = -+"A simple sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-f] [-v]\n" -+"\n" -+" -f Use FIFO scheduling instead of weighted vtime scheduling\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int simple) -+{ -+ exit_req = 1; -+} -+ -+static void read_stats(struct scx_simple *skel, __u64 *stats) -+{ -+ int nr_cpus = libbpf_num_possible_cpus(); -+ __u64 cnts[2][nr_cpus]; -+ __u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * 2); -+ -+ for (idx = 0; idx < 2; idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_simple *skel; -+ struct bpf_link *link; -+ __u32 opt; -+ __u64 ecode; -+ -+ libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+restart: -+ skel = SCX_OPS_OPEN(simple_ops, scx_simple); -+ -+ while ((opt = getopt(argc, argv, "fvh")) != -1) { -+ switch (opt) { -+ case 'f': -+ skel->rodata->fifo_sched = true; -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei); -+ link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) 
{ -+ __u64 stats[2]; -+ -+ read_stats(skel, stats); -+ printf("local=%llu global=%llu\n", stats[0], stats[1]); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ ecode = UEI_REPORT(skel, uei); -+ scx_simple__destroy(skel); -+ -+ if (UEI_ECODE_RESTART(ecode)) -+ goto restart; -+ return 0; -+} -diff --git a/tools/testing/selftests/sched_ext/.gitignore b/tools/testing/selftests/sched_ext/.gitignore -new file mode 100644 -index 000000000000..ae5491a114c0 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/.gitignore -@@ -0,0 +1,6 @@ -+* -+!*.c -+!*.h -+!Makefile -+!.gitignore -+!config -diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile -new file mode 100644 -index 000000000000..0754a2c110a1 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/Makefile -@@ -0,0 +1,218 @@ -+# SPDX-License-Identifier: GPL-2.0 -+# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+include ../../../build/Build.include -+include ../../../scripts/Makefile.arch -+include ../../../scripts/Makefile.include -+include ../lib.mk -+ -+ifneq ($(LLVM),) -+ifneq ($(filter %/,$(LLVM)),) -+LLVM_PREFIX := $(LLVM) -+else ifneq ($(filter -%,$(LLVM)),) -+LLVM_SUFFIX := $(LLVM) -+endif -+ -+CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as -+else -+CC := gcc -+endif # LLVM -+ -+ifneq ($(CROSS_COMPILE),) -+$(error CROSS_COMPILE not supported for scx selftests) -+endif # CROSS_COMPILE -+ -+CURDIR := $(abspath .) -+REPOROOT := $(abspath ../../../..) 
-+TOOLSDIR := $(REPOROOT)/tools -+LIBDIR := $(TOOLSDIR)/lib -+BPFDIR := $(LIBDIR)/bpf -+TOOLSINCDIR := $(TOOLSDIR)/include -+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool -+APIDIR := $(TOOLSINCDIR)/uapi -+GENDIR := $(REPOROOT)/include/generated -+GENHDR := $(GENDIR)/autoconf.h -+SCXTOOLSDIR := $(TOOLSDIR)/sched_ext -+SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include -+ -+OUTPUT_DIR := $(CURDIR)/build -+OBJ_DIR := $(OUTPUT_DIR)/obj -+INCLUDE_DIR := $(OUTPUT_DIR)/include -+BPFOBJ_DIR := $(OBJ_DIR)/libbpf -+SCXOBJ_DIR := $(OBJ_DIR)/sched_ext -+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a -+LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a -+DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool -+HOST_BUILD_DIR := $(OBJ_DIR) -+HOST_OUTPUT_DIR := $(OUTPUT_DIR) -+ -+VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ -+ /sys/kernel/btf/vmlinux \ -+ /boot/vmlinux-$(shell uname -r) -+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) -+ifeq ($(VMLINUX_BTF),) -+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") -+endif -+ -+BPFTOOL ?= $(DEFAULT_BPFTOOL) -+ -+ifneq ($(wildcard $(GENHDR)),) -+ GENFLAGS := -DHAVE_GENHDR -+endif -+ -+CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -+ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -+ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -I$(SCXTOOLSINCDIR) -+ -+# Silence some warnings when compiled with clang -+ifneq ($(LLVM),) -+CFLAGS += -Wno-unused-command-line-argument -+endif -+ -+LDFLAGS = -lelf -lz -lpthread -lzstd -+ -+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ -+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -+$(shell $(1) -dM -E - $@ -+else -+ $(call msg,CP,,$@) -+ $(Q)cp "$(VMLINUX_H)" $@ -+endif -+ -+$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h | $(BPFOBJ) $(SCXOBJ_DIR) -+ $(call msg,CLNG-BPF,,$(notdir $@)) -+ $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ -+ -+$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h 
$(BPFTOOL) | $(INCLUDE_DIR) -+ $(eval sched=$(notdir $@)) -+ $(call msg,GEN-SKEL,,$(sched)) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) -+ $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) -+ $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ -+ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) -+ -+################ -+# C schedulers # -+################ -+ -+override define CLEAN -+ rm -rf $(OUTPUT_DIR) -+ rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h -+ rm -f $(TEST_GEN_PROGS) -+ rm -f runner -+endef -+ -+# Every testcase takes all of the BPF progs are dependencies by default. This -+# allows testcases to load any BPF scheduler, which is useful for testcases -+# that don't need their own prog to run their test. -+all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubst %.c,%.skel.h,$(prog))) -+ -+auto-test-targets := \ -+ create_dsq \ -+ enq_last_no_enq_fails \ -+ enq_select_cpu_fails \ -+ ddsp_bogus_dsq_fail \ -+ ddsp_vtimelocal_fail \ -+ dsp_local_on \ -+ exit \ -+ hotplug \ -+ init_enable_count \ -+ maximal \ -+ maybe_null \ -+ minimal \ -+ prog_run \ -+ reload_loop \ -+ select_cpu_dfl \ -+ select_cpu_dfl_nodispatch \ -+ select_cpu_dispatch \ -+ select_cpu_dispatch_bad_dsq \ -+ select_cpu_dispatch_dbl_dsp \ -+ select_cpu_vtime \ -+ test_example \ -+ -+testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) -+ -+$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) -+ $(CC) $(CFLAGS) -c $< -o $@ -+ -+# Create all of the test targets object files, whose testcase objects will be -+# registered into the runner in ELF constructors. 
-+# -+# Note that we must do double expansion here in order to support conditionally -+# compiling BPF object files only if one is present, as the wildcard Make -+# function doesn't support using implicit rules otherwise. -+$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o $(all_test_bpfprogs) | $(SCXOBJ_DIR) -+ $(eval test=$(patsubst %.o,%.c,$(notdir $@))) -+ $(CC) $(CFLAGS) -c $< -o $@ $(SCXOBJ_DIR)/runner.o -+ -+$(SCXOBJ_DIR)/util.o: util.c | $(SCXOBJ_DIR) -+ $(CC) $(CFLAGS) -c $< -o $@ -+ -+runner: $(SCXOBJ_DIR)/runner.o $(SCXOBJ_DIR)/util.o $(BPFOBJ) $(testcase-targets) -+ @echo "$(testcase-targets)" -+ $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) -+ -+TEST_GEN_PROGS := runner -+ -+all: runner -+ -+.PHONY: all clean help -+ -+.DEFAULT_GOAL := all -+ -+.DELETE_ON_ERROR: -+ -+.SECONDARY: -diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config -new file mode 100644 -index 000000000000..0de9b4ee249d ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/config -@@ -0,0 +1,9 @@ -+CONFIG_SCHED_DEBUG=y -+CONFIG_SCHED_CLASS_EXT=y -+CONFIG_CGROUPS=y -+CONFIG_CGROUP_SCHED=y -+CONFIG_EXT_GROUP_SCHED=y -+CONFIG_BPF=y -+CONFIG_BPF_SYSCALL=y -+CONFIG_DEBUG_INFO=y -+CONFIG_DEBUG_INFO_BTF=y -diff --git a/tools/testing/selftests/sched_ext/create_dsq.bpf.c b/tools/testing/selftests/sched_ext/create_dsq.bpf.c -new file mode 100644 -index 000000000000..23f79ed343f0 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/create_dsq.bpf.c -@@ -0,0 +1,58 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Create and destroy DSQs in a loop. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+void BPF_STRUCT_OPS(create_dsq_exit_task, struct task_struct *p, -+ struct scx_exit_task_args *args) -+{ -+ scx_bpf_destroy_dsq(p->pid); -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ s32 err; -+ -+ err = scx_bpf_create_dsq(p->pid, -1); -+ if (err) -+ scx_bpf_error("Failed to create DSQ for %s[%d]", -+ p->comm, p->pid); -+ -+ return err; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init) -+{ -+ u32 i; -+ s32 err; -+ -+ bpf_for(i, 0, 1024) { -+ err = scx_bpf_create_dsq(i, -1); -+ if (err) { -+ scx_bpf_error("Failed to create DSQ %d", i); -+ return 0; -+ } -+ } -+ -+ bpf_for(i, 0, 1024) { -+ scx_bpf_destroy_dsq(i); -+ } -+ -+ return 0; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops create_dsq_ops = { -+ .init_task = create_dsq_init_task, -+ .exit_task = create_dsq_exit_task, -+ .init = create_dsq_init, -+ .name = "create_dsq", -+}; -diff --git a/tools/testing/selftests/sched_ext/create_dsq.c b/tools/testing/selftests/sched_ext/create_dsq.c -new file mode 100644 -index 000000000000..fa946d9146d4 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/create_dsq.c -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include "create_dsq.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct create_dsq *skel; -+ -+ skel = create_dsq__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct create_dsq *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.create_dsq_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct create_dsq *skel = ctx; -+ -+ create_dsq__destroy(skel); -+} -+ -+struct scx_test create_dsq = { -+ .name = "create_dsq", -+ .description = "Create and destroy a dsq in a loop", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&create_dsq) -diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c -new file mode 100644 -index 000000000000..e97ad41d354a ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c -@@ -0,0 +1,42 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ -+ if (cpu >= 0) { -+ /* -+ * If we dispatch to a bogus DSQ that will fall back to the -+ * builtin global DSQ, we fail gracefully. 
-+ */ -+ scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, -+ p->scx.dsq_vtime, 0); -+ return cpu; -+ } -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { -+ .select_cpu = ddsp_bogus_dsq_fail_select_cpu, -+ .exit = ddsp_bogus_dsq_fail_exit, -+ .name = "ddsp_bogus_dsq_fail", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c -new file mode 100644 -index 000000000000..e65d22f23f3b ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "ddsp_bogus_dsq_fail.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct ddsp_bogus_dsq_fail *skel; -+ -+ skel = ddsp_bogus_dsq_fail__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct ddsp_bogus_dsq_fail *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.ddsp_bogus_dsq_fail_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct ddsp_bogus_dsq_fail *skel = ctx; -+ -+ ddsp_bogus_dsq_fail__destroy(skel); -+} -+ -+struct scx_test ddsp_bogus_dsq_fail = { -+ .name = "ddsp_bogus_dsq_fail", -+ .description = "Verify we gracefully fail, and fall back to using a " -+ "built-in DSQ, if we do a 
direct dispatch to an invalid" -+ " DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&ddsp_bogus_dsq_fail) -diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c -new file mode 100644 -index 000000000000..dde7e7dafbfb ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c -@@ -0,0 +1,39 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ -+ if (cpu >= 0) { -+ /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. */ -+ scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, -+ p->scx.dsq_vtime, 0); -+ return cpu; -+ } -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops ddsp_vtimelocal_fail_ops = { -+ .select_cpu = ddsp_vtimelocal_fail_select_cpu, -+ .exit = ddsp_vtimelocal_fail_exit, -+ .name = "ddsp_vtimelocal_fail", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c -new file mode 100644 -index 000000000000..abafee587cd6 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c -@@ -0,0 +1,56 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+#include -+#include -+#include "ddsp_vtimelocal_fail.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct ddsp_vtimelocal_fail *skel; -+ -+ skel = ddsp_vtimelocal_fail__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct ddsp_vtimelocal_fail *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.ddsp_vtimelocal_fail_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct ddsp_vtimelocal_fail *skel = ctx; -+ -+ ddsp_vtimelocal_fail__destroy(skel); -+} -+ -+struct scx_test ddsp_vtimelocal_fail = { -+ .name = "ddsp_vtimelocal_fail", -+ .description = "Verify we gracefully fail, and fall back to using a " -+ "built-in DSQ, if we do a direct vtime dispatch to a " -+ "built-in DSQ from DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&ddsp_vtimelocal_fail) -diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c -new file mode 100644 -index 000000000000..efb4672decb4 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c -@@ -0,0 +1,65 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+const volatile s32 nr_cpus; -+ -+UEI_DEFINE(uei); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 8192); -+ __type(value, s32); -+} queue SEC(".maps"); -+ -+s32 BPF_STRUCT_OPS(dsp_local_on_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(dsp_local_on_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ s32 pid = p->pid; -+ -+ if (bpf_map_push_elem(&queue, &pid, 0)) -+ scx_bpf_error("Failed to enqueue %s[%d]", p->comm, p->pid); -+} -+ -+void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ s32 pid, target; -+ struct task_struct *p; -+ -+ if (bpf_map_pop_elem(&queue, &pid)) -+ return; -+ -+ p = bpf_task_from_pid(pid); -+ if (!p) -+ return; -+ -+ target = bpf_get_prandom_u32() % nr_cpus; -+ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); -+ bpf_task_release(p); -+} -+ -+void BPF_STRUCT_OPS(dsp_local_on_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops dsp_local_on_ops = { -+ .select_cpu = dsp_local_on_select_cpu, -+ .enqueue = dsp_local_on_enqueue, -+ .dispatch = dsp_local_on_dispatch, -+ .exit = dsp_local_on_exit, -+ .name = "dsp_local_on", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c -new file mode 100644 -index 000000000000..472851b56854 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/dsp_local_on.c -@@ -0,0 +1,58 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include "dsp_local_on.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct dsp_local_on *skel; -+ -+ skel = dsp_local_on__open(); -+ SCX_FAIL_IF(!skel, "Failed to open"); -+ -+ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); -+ SCX_FAIL_IF(dsp_local_on__load(skel), "Failed to load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct dsp_local_on *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.dsp_local_on_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ /* Just sleeping is fine, plenty of scheduling events happening */ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct dsp_local_on *skel = ctx; -+ -+ dsp_local_on__destroy(skel); -+} -+ -+struct scx_test dsp_local_on = { -+ .name = "dsp_local_on", -+ .description = "Verify we can directly dispatch tasks to a local DSQs " -+ "from osp.dispatch()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&dsp_local_on) -diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c -new file mode 100644 -index 000000000000..b0b99531d5d5 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c -@@ -0,0 +1,21 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops enq_last_no_enq_fails_ops = { -+ .name = "enq_last_no_enq_fails", -+ /* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */ -+ .flags = SCX_OPS_ENQ_LAST, -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c -new file mode 100644 -index 000000000000..2a3eda5e2c0b ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c -@@ -0,0 +1,60 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "enq_last_no_enq_fails.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct enq_last_no_enq_fails *skel; -+ -+ skel = enq_last_no_enq_fails__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct enq_last_no_enq_fails *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops); -+ if (link) { -+ SCX_ERR("Incorrectly succeeded in to attaching scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct enq_last_no_enq_fails *skel = ctx; -+ -+ enq_last_no_enq_fails__destroy(skel); -+} -+ -+struct scx_test enq_last_no_enq_fails = { -+ .name = "enq_last_no_enq_fails", -+ .description = "Verify we fail to load a scheduler if we specify " -+ "the SCX_OPS_ENQ_LAST flag without defining " -+ "ops.enqueue()", -+ .setup = setup, -+ .run = 
run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&enq_last_no_enq_fails) -diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c -new file mode 100644 -index 000000000000..b3dfc1033cd6 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+/* Manually specify the signature until the kfunc is added to the scx repo. */ -+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, -+ bool *found) __ksym; -+ -+s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ /* -+ * Need to initialize the variable or the verifier will fail to load. -+ * Improving these semantics is actively being worked on. -+ */ -+ bool found = false; -+ -+ /* Can only call from ops.select_cpu() */ -+ scx_bpf_select_cpu_dfl(p, 0, 0, &found); -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops enq_select_cpu_fails_ops = { -+ .select_cpu = enq_select_cpu_fails_select_cpu, -+ .enqueue = enq_select_cpu_fails_enqueue, -+ .name = "enq_select_cpu_fails", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c -new file mode 100644 -index 000000000000..dd1350e5f002 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c -@@ -0,0 +1,61 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. 
and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "enq_select_cpu_fails.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct enq_select_cpu_fails *skel; -+ -+ skel = enq_select_cpu_fails__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct enq_select_cpu_fails *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ sleep(1); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct enq_select_cpu_fails *skel = ctx; -+ -+ enq_select_cpu_fails__destroy(skel); -+} -+ -+struct scx_test enq_select_cpu_fails = { -+ .name = "enq_select_cpu_fails", -+ .description = "Verify we fail to call scx_bpf_select_cpu_dfl() " -+ "from ops.enqueue()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&enq_select_cpu_fails) -diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c -new file mode 100644 -index 000000000000..ae12ddaac921 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/exit.bpf.c -@@ -0,0 +1,84 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+#include "exit_test.h" -+ -+const volatile int exit_point; -+UEI_DEFINE(uei); -+ -+#define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point) -+ -+s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ bool found; -+ -+ if (exit_point == EXIT_SELECT_CPU) -+ EXIT_CLEANLY(); -+ -+ return scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found); -+} -+ -+void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ if (exit_point == EXIT_ENQUEUE) -+ EXIT_CLEANLY(); -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) -+{ -+ if (exit_point == EXIT_DISPATCH) -+ EXIT_CLEANLY(); -+ -+ scx_bpf_consume(SCX_DSQ_GLOBAL); -+} -+ -+void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) -+{ -+ if (exit_point == EXIT_ENABLE) -+ EXIT_CLEANLY(); -+} -+ -+s32 BPF_STRUCT_OPS(exit_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ if (exit_point == EXIT_INIT_TASK) -+ EXIT_CLEANLY(); -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(exit_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init) -+{ -+ if (exit_point == EXIT_INIT) -+ EXIT_CLEANLY(); -+ -+ return 0; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops exit_ops = { -+ .select_cpu = exit_select_cpu, -+ .enqueue = exit_enqueue, -+ .dispatch = exit_dispatch, -+ .init_task = exit_init_task, -+ .enable = exit_enable, -+ .exit = exit_exit, -+ .init = exit_init, -+ .name = "exit", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c -new file mode 100644 -index 000000000000..31bcd06e21cd ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/exit.c -@@ -0,0 +1,55 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * 
Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "exit.bpf.skel.h" -+#include "scx_test.h" -+ -+#include "exit_test.h" -+ -+static enum scx_test_status run(void *ctx) -+{ -+ enum exit_test_case tc; -+ -+ for (tc = 0; tc < NUM_EXITS; tc++) { -+ struct exit *skel; -+ struct bpf_link *link; -+ char buf[16]; -+ -+ skel = exit__open(); -+ skel->rodata->exit_point = tc; -+ exit__load(skel); -+ link = bpf_map__attach_struct_ops(skel->maps.exit_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ exit__destroy(skel); -+ return SCX_TEST_FAIL; -+ } -+ -+ /* Assumes uei.kind is written last */ -+ while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) -+ sched_yield(); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); -+ SCX_EQ(skel->data->uei.exit_code, tc); -+ sprintf(buf, "%d", tc); -+ SCX_ASSERT(!strcmp(skel->data->uei.msg, buf)); -+ bpf_link__destroy(link); -+ exit__destroy(skel); -+ } -+ -+ return SCX_TEST_PASS; -+} -+ -+struct scx_test exit_test = { -+ .name = "exit", -+ .description = "Verify we can cleanly exit a scheduler in multiple places", -+ .run = run, -+}; -+REGISTER_SCX_TEST(&exit_test) -diff --git a/tools/testing/selftests/sched_ext/exit_test.h b/tools/testing/selftests/sched_ext/exit_test.h -new file mode 100644 -index 000000000000..94f0268b9cb8 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/exit_test.h -@@ -0,0 +1,20 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#ifndef __EXIT_TEST_H__ -+#define __EXIT_TEST_H__ -+ -+enum exit_test_case { -+ EXIT_SELECT_CPU, -+ EXIT_ENQUEUE, -+ EXIT_DISPATCH, -+ EXIT_ENABLE, -+ EXIT_INIT_TASK, -+ EXIT_INIT, -+ NUM_EXITS, -+}; -+ -+#endif // # __EXIT_TEST_H__ -diff --git a/tools/testing/selftests/sched_ext/hotplug.bpf.c b/tools/testing/selftests/sched_ext/hotplug.bpf.c -new file mode 100644 -index 000000000000..8f2601db39f3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/hotplug.bpf.c -@@ -0,0 +1,61 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+#include "hotplug_test.h" -+ -+UEI_DEFINE(uei); -+ -+void BPF_STRUCT_OPS(hotplug_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+static void exit_from_hotplug(s32 cpu, bool onlining) -+{ -+ /* -+ * Ignored, just used to verify that we can invoke blocking kfuncs -+ * from the hotplug path. -+ */ -+ scx_bpf_create_dsq(0, -1); -+ -+ s64 code = SCX_ECODE_ACT_RESTART | HOTPLUG_EXIT_RSN; -+ -+ if (onlining) -+ code |= HOTPLUG_ONLINING; -+ -+ scx_bpf_exit(code, "hotplug event detected (%d going %s)", cpu, -+ onlining ? 
"online" : "offline"); -+} -+ -+void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_online, s32 cpu) -+{ -+ exit_from_hotplug(cpu, true); -+} -+ -+void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_offline, s32 cpu) -+{ -+ exit_from_hotplug(cpu, false); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops hotplug_cb_ops = { -+ .cpu_online = hotplug_cpu_online, -+ .cpu_offline = hotplug_cpu_offline, -+ .exit = hotplug_exit, -+ .name = "hotplug_cbs", -+ .timeout_ms = 1000U, -+}; -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops hotplug_nocb_ops = { -+ .exit = hotplug_exit, -+ .name = "hotplug_nocbs", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c -new file mode 100644 -index 000000000000..87bf220b1bce ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/hotplug.c -@@ -0,0 +1,168 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "hotplug_test.h" -+#include "hotplug.bpf.skel.h" -+#include "scx_test.h" -+#include "util.h" -+ -+const char *online_path = "/sys/devices/system/cpu/cpu1/online"; -+ -+static bool is_cpu_online(void) -+{ -+ return file_read_long(online_path) > 0; -+} -+ -+static void toggle_online_status(bool online) -+{ -+ long val = online ? 1 : 0; -+ int ret; -+ -+ ret = file_write_long(online_path, val); -+ if (ret != 0) -+ fprintf(stderr, "Failed to bring CPU %s (%s)", -+ online ? 
"online" : "offline", strerror(errno)); -+} -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ if (!is_cpu_online()) -+ return SCX_TEST_SKIP; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) -+{ -+ struct hotplug *skel; -+ struct bpf_link *link; -+ long kind, code; -+ -+ SCX_ASSERT(is_cpu_online()); -+ -+ skel = hotplug__open_and_load(); -+ SCX_ASSERT(skel); -+ -+ /* Testing the offline -> online path, so go offline before starting */ -+ if (onlining) -+ toggle_online_status(0); -+ -+ if (cbs_defined) { -+ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_BPF); -+ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | HOTPLUG_EXIT_RSN; -+ if (onlining) -+ code |= HOTPLUG_ONLINING; -+ } else { -+ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); -+ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | -+ SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); -+ } -+ -+ if (cbs_defined) -+ link = bpf_map__attach_struct_ops(skel->maps.hotplug_cb_ops); -+ else -+ link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); -+ -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ hotplug__destroy(skel); -+ return SCX_TEST_FAIL; -+ } -+ -+ toggle_online_status(onlining ? 
1 : 0); -+ -+ while (!UEI_EXITED(skel, uei)) -+ sched_yield(); -+ -+ SCX_EQ(skel->data->uei.kind, kind); -+ SCX_EQ(UEI_REPORT(skel, uei), code); -+ -+ if (!onlining) -+ toggle_online_status(1); -+ -+ bpf_link__destroy(link); -+ hotplug__destroy(skel); -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status test_hotplug_attach(void) -+{ -+ struct hotplug *skel; -+ struct bpf_link *link; -+ enum scx_test_status status = SCX_TEST_PASS; -+ long kind, code; -+ -+ SCX_ASSERT(is_cpu_online()); -+ SCX_ASSERT(scx_hotplug_seq() > 0); -+ -+ skel = SCX_OPS_OPEN(hotplug_nocb_ops, hotplug); -+ SCX_ASSERT(skel); -+ -+ SCX_OPS_LOAD(skel, hotplug_nocb_ops, hotplug, uei); -+ -+ /* -+ * Take the CPU offline to increment the global hotplug seq, which -+ * should cause attach to fail due to us setting the hotplug seq above -+ */ -+ toggle_online_status(0); -+ link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); -+ -+ toggle_online_status(1); -+ -+ SCX_ASSERT(link); -+ while (!UEI_EXITED(skel, uei)) -+ sched_yield(); -+ -+ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); -+ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | -+ SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); -+ SCX_EQ(skel->data->uei.kind, kind); -+ SCX_EQ(UEI_REPORT(skel, uei), code); -+ -+ bpf_link__destroy(link); -+ hotplug__destroy(skel); -+ -+ return status; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ -+#define HP_TEST(__onlining, __cbs_defined) ({ \ -+ if (test_hotplug(__onlining, __cbs_defined) != SCX_TEST_PASS) \ -+ return SCX_TEST_FAIL; \ -+}) -+ -+ HP_TEST(true, true); -+ HP_TEST(false, true); -+ HP_TEST(true, false); -+ HP_TEST(false, false); -+ -+#undef HP_TEST -+ -+ return test_hotplug_attach(); -+} -+ -+static void cleanup(void *ctx) -+{ -+ toggle_online_status(1); -+} -+ -+struct scx_test hotplug_test = { -+ .name = "hotplug", -+ .description = "Verify hotplug behavior", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&hotplug_test) -diff --git 
a/tools/testing/selftests/sched_ext/hotplug_test.h b/tools/testing/selftests/sched_ext/hotplug_test.h -new file mode 100644 -index 000000000000..73d236f90787 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/hotplug_test.h -@@ -0,0 +1,15 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#ifndef __HOTPLUG_TEST_H__ -+#define __HOTPLUG_TEST_H__ -+ -+enum hotplug_test_flags { -+ HOTPLUG_EXIT_RSN = 1LLU << 0, -+ HOTPLUG_ONLINING = 1LLU << 1, -+}; -+ -+#endif // # __HOTPLUG_TEST_H__ -diff --git a/tools/testing/selftests/sched_ext/init_enable_count.bpf.c b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c -new file mode 100644 -index 000000000000..47ea89a626c3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c -@@ -0,0 +1,53 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that verifies that we do proper counting of init, enable, etc -+ * callbacks. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+u64 init_task_cnt, exit_task_cnt, enable_cnt, disable_cnt; -+u64 init_fork_cnt, init_transition_cnt; -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(cnt_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ __sync_fetch_and_add(&init_task_cnt, 1); -+ -+ if (args->fork) -+ __sync_fetch_and_add(&init_fork_cnt, 1); -+ else -+ __sync_fetch_and_add(&init_transition_cnt, 1); -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(cnt_exit_task, struct task_struct *p) -+{ -+ __sync_fetch_and_add(&exit_task_cnt, 1); -+} -+ -+void BPF_STRUCT_OPS(cnt_enable, struct task_struct *p) -+{ -+ __sync_fetch_and_add(&enable_cnt, 1); -+} -+ -+void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p) -+{ -+ __sync_fetch_and_add(&disable_cnt, 1); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops init_enable_count_ops = { -+ .init_task = cnt_init_task, -+ .exit_task = cnt_exit_task, -+ .enable = cnt_enable, -+ .disable = cnt_disable, -+ .name = "init_enable_count", -+}; -diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c -new file mode 100644 -index 000000000000..97d45f1e5597 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/init_enable_count.c -@@ -0,0 +1,166 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_test.h" -+#include "init_enable_count.bpf.skel.h" -+ -+#define SCHED_EXT 7 -+ -+static struct init_enable_count * -+open_load_prog(bool global) -+{ -+ struct init_enable_count *skel; -+ -+ skel = init_enable_count__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ if (!global) -+ skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; -+ -+ SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); -+ -+ return skel; -+} -+ -+static enum scx_test_status run_test(bool global) -+{ -+ struct init_enable_count *skel; -+ struct bpf_link *link; -+ const u32 num_children = 5, num_pre_forks = 1024; -+ int ret, i, status; -+ struct sched_param param = {}; -+ pid_t pids[num_pre_forks]; -+ -+ skel = open_load_prog(global); -+ -+ /* -+ * Fork a bunch of children before we attach the scheduler so that we -+ * ensure (at least in practical terms) that there are more tasks that -+ * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that -+ * take the fork() path either below or in other processes. 
-+ */ -+ for (i = 0; i < num_pre_forks; i++) { -+ pids[i] = fork(); -+ SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ for (i = 0; i < num_pre_forks; i++) { -+ SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], -+ "Failed to wait for pre-forked child\n"); -+ -+ SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, -+ status); -+ } -+ -+ bpf_link__destroy(link); -+ SCX_GE(skel->bss->init_task_cnt, num_pre_forks); -+ SCX_GE(skel->bss->exit_task_cnt, num_pre_forks); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ /* SCHED_EXT children */ -+ for (i = 0; i < num_children; i++) { -+ pids[i] = fork(); -+ SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); -+ -+ if (pids[i] == 0) { -+ ret = sched_setscheduler(0, SCHED_EXT, &param); -+ SCX_BUG_ON(ret, "Failed to set sched to sched_ext"); -+ -+ /* -+ * Reset to SCHED_OTHER for half of them. Counts for -+ * everything should still be the same regardless, as -+ * ops.disable() is invoked even if a task is still on -+ * SCHED_EXT before it exits. 
-+ */ -+ if (i % 2 == 0) { -+ ret = sched_setscheduler(0, SCHED_OTHER, &param); -+ SCX_BUG_ON(ret, "Failed to reset sched to normal"); -+ } -+ exit(0); -+ } -+ } -+ for (i = 0; i < num_children; i++) { -+ SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], -+ "Failed to wait for SCX child\n"); -+ -+ SCX_FAIL_IF(status != 0, "SCX child %d exited with status %d\n", i, -+ status); -+ } -+ -+ /* SCHED_OTHER children */ -+ for (i = 0; i < num_children; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) -+ exit(0); -+ } -+ -+ for (i = 0; i < num_children; i++) { -+ SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], -+ "Failed to wait for normal child\n"); -+ -+ SCX_FAIL_IF(status != 0, "Normal child %d exited with status %d\n", i, -+ status); -+ } -+ -+ bpf_link__destroy(link); -+ -+ SCX_GE(skel->bss->init_task_cnt, 2 * num_children); -+ SCX_GE(skel->bss->exit_task_cnt, 2 * num_children); -+ -+ if (global) { -+ SCX_GE(skel->bss->enable_cnt, 2 * num_children); -+ SCX_GE(skel->bss->disable_cnt, 2 * num_children); -+ } else { -+ SCX_EQ(skel->bss->enable_cnt, num_children); -+ SCX_EQ(skel->bss->disable_cnt, num_children); -+ } -+ /* -+ * We forked a ton of tasks before we attached the scheduler above, so -+ * this should be fine. Technically it could be flaky if a ton of forks -+ * are happening at the same time in other processes, but that should -+ * be exceedingly unlikely. 
-+ */ -+ SCX_GT(skel->bss->init_transition_cnt, skel->bss->init_fork_cnt); -+ SCX_GE(skel->bss->init_fork_cnt, 2 * num_children); -+ -+ init_enable_count__destroy(skel); -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ enum scx_test_status status; -+ -+ status = run_test(true); -+ if (status != SCX_TEST_PASS) -+ return status; -+ -+ return run_test(false); -+} -+ -+struct scx_test init_enable_count = { -+ .name = "init_enable_count", -+ .description = "Verify we do the correct amount of counting of init, " -+ "enable, etc callbacks.", -+ .run = run, -+}; -+REGISTER_SCX_TEST(&init_enable_count) -diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c -new file mode 100644 -index 000000000000..00bfa9cb95d3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c -@@ -0,0 +1,164 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler with every callback defined. -+ * -+ * This scheduler defines every callback. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags) -+{ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) -+{} -+ -+void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ scx_bpf_consume(SCX_DSQ_GLOBAL); -+} -+ -+void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) -+{} -+ -+void BPF_STRUCT_OPS(maximal_running, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maximal_stopping, struct task_struct *p, bool runnable) -+{} -+ -+void BPF_STRUCT_OPS(maximal_quiescent, struct task_struct *p, u64 deq_flags) -+{} -+ -+bool BPF_STRUCT_OPS(maximal_yield, struct task_struct *from, -+ struct task_struct *to) -+{ -+ return false; -+} -+ -+bool BPF_STRUCT_OPS(maximal_core_sched_before, struct task_struct *a, -+ struct task_struct *b) -+{ -+ return false; -+} -+ -+void BPF_STRUCT_OPS(maximal_set_weight, struct task_struct *p, u32 weight) -+{} -+ -+void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p, -+ const struct cpumask *cpumask) -+{} -+ -+void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu, -+ struct scx_cpu_acquire_args *args) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu, -+ struct scx_cpu_release_args *args) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_offline, s32 cpu) -+{} -+ -+s32 BPF_STRUCT_OPS(maximal_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(maximal_enable, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maximal_exit_task, struct task_struct 
*p, -+ struct scx_exit_task_args *args) -+{} -+ -+void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) -+{} -+ -+s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp, -+ struct scx_cgroup_init_args *args) -+{ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp) -+{} -+ -+s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) -+{} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) -+{ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) -+{} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops maximal_ops = { -+ .select_cpu = maximal_select_cpu, -+ .enqueue = maximal_enqueue, -+ .dequeue = maximal_dequeue, -+ .dispatch = maximal_dispatch, -+ .runnable = maximal_runnable, -+ .running = maximal_running, -+ .stopping = maximal_stopping, -+ .quiescent = maximal_quiescent, -+ .yield = maximal_yield, -+ .core_sched_before = maximal_core_sched_before, -+ .set_weight = maximal_set_weight, -+ .set_cpumask = maximal_set_cpumask, -+ .update_idle = maximal_update_idle, -+ .cpu_acquire = maximal_cpu_acquire, -+ .cpu_release = maximal_cpu_release, -+ .cpu_online = maximal_cpu_online, -+ .cpu_offline = maximal_cpu_offline, -+ .init_task = maximal_init_task, -+ .enable = maximal_enable, -+ .exit_task = maximal_exit_task, -+ .disable = maximal_disable, -+ .cgroup_init = maximal_cgroup_init, -+ .cgroup_exit = maximal_cgroup_exit, -+ .cgroup_prep_move = maximal_cgroup_prep_move, -+ .cgroup_move = maximal_cgroup_move, -+ .cgroup_cancel_move = maximal_cgroup_cancel_move, -+ .cgroup_set_weight = maximal_cgroup_set_weight, 
-+ .init = maximal_init, -+ .exit = maximal_exit, -+ .name = "maximal", -+}; -diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c -new file mode 100644 -index 000000000000..f38fc973c380 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maximal.c -@@ -0,0 +1,51 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include "maximal.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct maximal *skel; -+ -+ skel = maximal__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct maximal *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct maximal *skel = ctx; -+ -+ maximal__destroy(skel); -+} -+ -+struct scx_test maximal = { -+ .name = "maximal", -+ .description = "Verify we can load a scheduler with every callback defined", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&maximal) -diff --git a/tools/testing/selftests/sched_ext/maybe_null.bpf.c b/tools/testing/selftests/sched_ext/maybe_null.bpf.c -new file mode 100644 -index 000000000000..27d0f386acfb ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maybe_null.bpf.c -@@ -0,0 +1,36 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+u64 vtime_test; -+ -+void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maybe_null_success_dispatch, s32 cpu, struct task_struct *p) -+{ -+ if (p != NULL) -+ vtime_test = p->scx.dsq_vtime; -+} -+ -+bool BPF_STRUCT_OPS(maybe_null_success_yield, struct task_struct *from, -+ struct task_struct *to) -+{ -+ if (to) -+ bpf_printk("Yielding to %s[%d]", to->comm, to->pid); -+ -+ return false; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops maybe_null_success = { -+ .dispatch = maybe_null_success_dispatch, -+ .yield = maybe_null_success_yield, -+ .enable = maybe_null_running, -+ .name = "minimal", -+}; -diff --git a/tools/testing/selftests/sched_ext/maybe_null.c b/tools/testing/selftests/sched_ext/maybe_null.c -new file mode 100644 -index 000000000000..31cfafb0cf65 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maybe_null.c -@@ -0,0 +1,49 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ */ -+#include -+#include -+#include -+#include -+#include "maybe_null.bpf.skel.h" -+#include "maybe_null_fail_dsp.bpf.skel.h" -+#include "maybe_null_fail_yld.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct maybe_null *skel; -+ struct maybe_null_fail_dsp *fail_dsp; -+ struct maybe_null_fail_yld *fail_yld; -+ -+ skel = maybe_null__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load maybe_null skel"); -+ return SCX_TEST_FAIL; -+ } -+ maybe_null__destroy(skel); -+ -+ fail_dsp = maybe_null_fail_dsp__open_and_load(); -+ if (fail_dsp) { -+ maybe_null_fail_dsp__destroy(fail_dsp); -+ SCX_ERR("Should failed to open and load maybe_null_fail_dsp skel"); -+ return SCX_TEST_FAIL; -+ } -+ -+ fail_yld = maybe_null_fail_yld__open_and_load(); -+ if (fail_yld) { -+ maybe_null_fail_yld__destroy(fail_yld); -+ SCX_ERR("Should failed to open and load maybe_null_fail_yld skel"); -+ return SCX_TEST_FAIL; -+ } -+ -+ return SCX_TEST_PASS; -+} -+ -+struct scx_test maybe_null = { -+ .name = "maybe_null", -+ .description = "Verify if PTR_MAYBE_NULL work for .dispatch", -+ .run = run, -+}; -+REGISTER_SCX_TEST(&maybe_null) -diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c -new file mode 100644 -index 000000000000..c0641050271d ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c -@@ -0,0 +1,25 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+u64 vtime_test; -+ -+void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p) -+{ -+ vtime_test = p->scx.dsq_vtime; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops maybe_null_fail = { -+ .dispatch = maybe_null_fail_dispatch, -+ .enable = maybe_null_running, -+ .name = "maybe_null_fail_dispatch", -+}; -diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c -new file mode 100644 -index 000000000000..3c1740028e3b ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c -@@ -0,0 +1,28 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+u64 vtime_test; -+ -+void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) -+{} -+ -+bool BPF_STRUCT_OPS(maybe_null_fail_yield, struct task_struct *from, -+ struct task_struct *to) -+{ -+ bpf_printk("Yielding to %s[%d]", to->comm, to->pid); -+ -+ return false; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops maybe_null_fail = { -+ .yield = maybe_null_fail_yield, -+ .enable = maybe_null_running, -+ .name = "maybe_null_fail_yield", -+}; -diff --git a/tools/testing/selftests/sched_ext/minimal.bpf.c b/tools/testing/selftests/sched_ext/minimal.bpf.c -new file mode 100644 -index 000000000000..6a7eccef0104 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/minimal.bpf.c -@@ -0,0 +1,21 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A completely minimal scheduler. -+ * -+ * This scheduler defines the absolute minimal set of struct sched_ext_ops -+ * fields: its name. It should _not_ fail to be loaded, and can be used to -+ * exercise the default scheduling paths in ext.c. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. 
and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops minimal_ops = { -+ .name = "minimal", -+}; -diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c -new file mode 100644 -index 000000000000..6c5db8ebbf8a ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/minimal.c -@@ -0,0 +1,58 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "minimal.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct minimal *skel; -+ -+ skel = minimal__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct minimal *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct minimal *skel = ctx; -+ -+ minimal__destroy(skel); -+} -+ -+struct scx_test minimal = { -+ .name = "minimal", -+ .description = "Verify we can load a fully minimal scheduler", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&minimal) -diff --git a/tools/testing/selftests/sched_ext/prog_run.bpf.c b/tools/testing/selftests/sched_ext/prog_run.bpf.c -new file mode 100644 -index 000000000000..6a4d7c48e3f2 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/prog_run.bpf.c -@@ -0,0 +1,33 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler 
that validates that we can invoke sched_ext kfuncs in -+ * BPF_PROG_TYPE_SYSCALL programs. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+UEI_DEFINE(uei); -+ -+char _license[] SEC("license") = "GPL"; -+ -+SEC("syscall") -+int BPF_PROG(prog_run_syscall) -+{ -+ scx_bpf_create_dsq(0, -1); -+ scx_bpf_exit(0xdeadbeef, "Exited from PROG_RUN"); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops prog_run_ops = { -+ .exit = prog_run_exit, -+ .name = "prog_run", -+}; -diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c -new file mode 100644 -index 000000000000..3cd57ef8daaa ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/prog_run.c -@@ -0,0 +1,78 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "prog_run.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct prog_run *skel; -+ -+ skel = prog_run__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct prog_run *skel = ctx; -+ struct bpf_link *link; -+ int prog_fd, err = 0; -+ -+ prog_fd = bpf_program__fd(skel->progs.prog_run_syscall); -+ if (prog_fd < 0) { -+ SCX_ERR("Failed to get BPF_PROG_RUN prog"); -+ return SCX_TEST_FAIL; -+ } -+ -+ LIBBPF_OPTS(bpf_test_run_opts, topts); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.prog_run_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ close(prog_fd); -+ return SCX_TEST_FAIL; -+ } -+ -+ err = bpf_prog_test_run_opts(prog_fd, &topts); -+ SCX_EQ(err, 0); -+ -+ /* Assumes uei.kind is written last */ -+ while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) -+ sched_yield(); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); -+ SCX_EQ(skel->data->uei.exit_code, 0xdeadbeef); -+ close(prog_fd); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct prog_run *skel = ctx; -+ -+ prog_run__destroy(skel); -+} -+ -+struct scx_test prog_run = { -+ .name = "prog_run", -+ .description = "Verify we can call into a scheduler with BPF_PROG_RUN, and invoke kfuncs", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&prog_run) -diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c -new file mode 100644 -index 000000000000..5cfba2d6e056 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/reload_loop.c -@@ -0,0 +1,75 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 
Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "maximal.bpf.skel.h" -+#include "scx_test.h" -+ -+static struct maximal *skel; -+static pthread_t threads[2]; -+ -+bool force_exit = false; -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ skel = maximal__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ -+ return SCX_TEST_PASS; -+} -+ -+static void *do_reload_loop(void *arg) -+{ -+ u32 i; -+ -+ for (i = 0; i < 1024 && !force_exit; i++) { -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); -+ if (link) -+ bpf_link__destroy(link); -+ } -+ -+ return NULL; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ int err; -+ void *ret; -+ -+ err = pthread_create(&threads[0], NULL, do_reload_loop, NULL); -+ SCX_FAIL_IF(err, "Failed to create thread 0"); -+ -+ err = pthread_create(&threads[1], NULL, do_reload_loop, NULL); -+ SCX_FAIL_IF(err, "Failed to create thread 1"); -+ -+ SCX_FAIL_IF(pthread_join(threads[0], &ret), "thread 0 failed"); -+ SCX_FAIL_IF(pthread_join(threads[1], &ret), "thread 1 failed"); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ force_exit = true; -+ maximal__destroy(skel); -+} -+ -+struct scx_test reload_loop = { -+ .name = "reload_loop", -+ .description = "Stress test loading and unloading schedulers repeatedly in a tight loop", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&reload_loop) -diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c -new file mode 100644 -index 000000000000..eab48c7ff309 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/runner.c -@@ -0,0 +1,201 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "scx_test.h" -+ -+const char help_fmt[] = -+"The runner for sched_ext tests.\n" -+"\n" -+"The runner is statically linked against all testcases, and runs them all serially.\n" -+"It's required for the testcases to be serial, as only a single host-wide sched_ext\n" -+"scheduler may be loaded at any given time." -+"\n" -+"Usage: %s [-t TEST] [-h]\n" -+"\n" -+" -t TEST Only run tests whose name includes this string\n" -+" -s Include print output for skipped tests\n" -+" -q Don't print the test descriptions during run\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+static bool quiet, print_skipped; -+ -+#define MAX_SCX_TESTS 2048 -+ -+static struct scx_test __scx_tests[MAX_SCX_TESTS]; -+static unsigned __scx_num_tests = 0; -+ -+static void sigint_handler(int simple) -+{ -+ exit_req = 1; -+} -+ -+static void print_test_preamble(const struct scx_test *test, bool quiet) -+{ -+ printf("===== START =====\n"); -+ printf("TEST: %s\n", test->name); -+ if (!quiet) -+ printf("DESCRIPTION: %s\n", test->description); -+ printf("OUTPUT:\n"); -+} -+ -+static const char *status_to_result(enum scx_test_status status) -+{ -+ switch (status) { -+ case SCX_TEST_PASS: -+ case SCX_TEST_SKIP: -+ return "ok"; -+ case SCX_TEST_FAIL: -+ return "not ok"; -+ default: -+ return ""; -+ } -+} -+ -+static void print_test_result(const struct scx_test *test, -+ enum scx_test_status status, -+ unsigned int testnum) -+{ -+ const char *result = status_to_result(status); -+ const char *directive = status == SCX_TEST_SKIP ? 
"SKIP " : ""; -+ -+ printf("%s %u %s # %s\n", result, testnum, test->name, directive); -+ printf("===== END =====\n"); -+} -+ -+static bool should_skip_test(const struct scx_test *test, const char * filter) -+{ -+ return !strstr(test->name, filter); -+} -+ -+static enum scx_test_status run_test(const struct scx_test *test) -+{ -+ enum scx_test_status status; -+ void *context = NULL; -+ -+ if (test->setup) { -+ status = test->setup(&context); -+ if (status != SCX_TEST_PASS) -+ return status; -+ } -+ -+ status = test->run(context); -+ -+ if (test->cleanup) -+ test->cleanup(context); -+ -+ return status; -+} -+ -+static bool test_valid(const struct scx_test *test) -+{ -+ if (!test) { -+ fprintf(stderr, "NULL test detected\n"); -+ return false; -+ } -+ -+ if (!test->name) { -+ fprintf(stderr, -+ "Test with no name found. Must specify test name.\n"); -+ return false; -+ } -+ -+ if (!test->description) { -+ fprintf(stderr, "Test %s requires description.\n", test->name); -+ return false; -+ } -+ -+ if (!test->run) { -+ fprintf(stderr, "Test %s has no run() callback\n", test->name); -+ return false; -+ } -+ -+ return true; -+} -+ -+int main(int argc, char **argv) -+{ -+ const char *filter = NULL; -+ unsigned testnum = 0, i; -+ unsigned passed = 0, skipped = 0, failed = 0; -+ int opt; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ while ((opt = getopt(argc, argv, "qst:h")) != -1) { -+ switch (opt) { -+ case 'q': -+ quiet = true; -+ break; -+ case 's': -+ print_skipped = true; -+ break; -+ case 't': -+ filter = optarg; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ for (i = 0; i < __scx_num_tests; i++) { -+ enum scx_test_status status; -+ struct scx_test *test = &__scx_tests[i]; -+ -+ if (filter && should_skip_test(test, filter)) { -+ /* -+ * Printing the skipped tests and their preambles can -+ * add a lot of noise to the runner 
output. Printing -+ * this is only really useful for CI, so let's skip it -+ * by default. -+ */ -+ if (print_skipped) { -+ print_test_preamble(test, quiet); -+ print_test_result(test, SCX_TEST_SKIP, ++testnum); -+ } -+ continue; -+ } -+ -+ print_test_preamble(test, quiet); -+ status = run_test(test); -+ print_test_result(test, status, ++testnum); -+ switch (status) { -+ case SCX_TEST_PASS: -+ passed++; -+ break; -+ case SCX_TEST_SKIP: -+ skipped++; -+ break; -+ case SCX_TEST_FAIL: -+ failed++; -+ break; -+ } -+ } -+ printf("\n\n=============================\n\n"); -+ printf("RESULTS:\n\n"); -+ printf("PASSED: %u\n", passed); -+ printf("SKIPPED: %u\n", skipped); -+ printf("FAILED: %u\n", failed); -+ -+ return 0; -+} -+ -+void scx_test_register(struct scx_test *test) -+{ -+ SCX_BUG_ON(!test_valid(test), "Invalid test found"); -+ SCX_BUG_ON(__scx_num_tests >= MAX_SCX_TESTS, "Maximum tests exceeded"); -+ -+ __scx_tests[__scx_num_tests++] = *test; -+} -diff --git a/tools/testing/selftests/sched_ext/scx_test.h b/tools/testing/selftests/sched_ext/scx_test.h -new file mode 100644 -index 000000000000..90b8d6915bb7 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/scx_test.h -@@ -0,0 +1,131 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+ -+#ifndef __SCX_TEST_H__ -+#define __SCX_TEST_H__ -+ -+#include -+#include -+#include -+ -+enum scx_test_status { -+ SCX_TEST_PASS = 0, -+ SCX_TEST_SKIP, -+ SCX_TEST_FAIL, -+}; -+ -+#define EXIT_KIND(__ent) __COMPAT_ENUM_OR_ZERO("scx_exit_kind", #__ent) -+ -+struct scx_test { -+ /** -+ * name - The name of the testcase. -+ */ -+ const char *name; -+ -+ /** -+ * description - A description of your testcase: what it tests and is -+ * meant to validate. -+ */ -+ const char *description; -+ -+ /* -+ * setup - Setup the test. 
-+ * @ctx: A pointer to a context object that will be passed to run and -+ * cleanup. -+ * -+ * An optional callback that allows a testcase to perform setup for its -+ * run. A test may return SCX_TEST_SKIP to skip the run. -+ */ -+ enum scx_test_status (*setup)(void **ctx); -+ -+ /* -+ * run - Run the test. -+ * @ctx: Context set in the setup() callback. If @ctx was not set in -+ * setup(), it is NULL. -+ * -+ * The main test. Callers should return one of: -+ * -+ * - SCX_TEST_PASS: Test passed -+ * - SCX_TEST_SKIP: Test should be skipped -+ * - SCX_TEST_FAIL: Test failed -+ * -+ * This callback must be defined. -+ */ -+ enum scx_test_status (*run)(void *ctx); -+ -+ /* -+ * cleanup - Perform cleanup following the test -+ * @ctx: Context set in the setup() callback. If @ctx was not set in -+ * setup(), it is NULL. -+ * -+ * An optional callback that allows a test to perform cleanup after -+ * being run. This callback is run even if the run() callback returns -+ * SCX_TEST_SKIP or SCX_TEST_FAIL. It is not run if setup() returns -+ * SCX_TEST_SKIP or SCX_TEST_FAIL. -+ */ -+ void (*cleanup)(void *ctx); -+}; -+ -+void scx_test_register(struct scx_test *test); -+ -+#define REGISTER_SCX_TEST(__test) \ -+ __attribute__((constructor)) \ -+ static void ___scxregister##__LINE__(void) \ -+ { \ -+ scx_test_register(__test); \ -+ } -+ -+#define SCX_ERR(__fmt, ...) \ -+ do { \ -+ fprintf(stderr, "ERR: %s:%d\n", __FILE__, __LINE__); \ -+ fprintf(stderr, __fmt"\n", ##__VA_ARGS__); \ -+ } while (0) -+ -+#define SCX_FAIL(__fmt, ...) \ -+ do { \ -+ SCX_ERR(__fmt, ##__VA_ARGS__); \ -+ return SCX_TEST_FAIL; \ -+ } while (0) -+ -+#define SCX_FAIL_IF(__cond, __fmt, ...) 
\ -+ do { \ -+ if (__cond) \ -+ SCX_FAIL(__fmt, ##__VA_ARGS__); \ -+ } while (0) -+ -+#define SCX_GT(_x, _y) SCX_FAIL_IF((_x) <= (_y), "Expected %s > %s (%lu > %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_GE(_x, _y) SCX_FAIL_IF((_x) < (_y), "Expected %s >= %s (%lu >= %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_LT(_x, _y) SCX_FAIL_IF((_x) >= (_y), "Expected %s < %s (%lu < %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_LE(_x, _y) SCX_FAIL_IF((_x) > (_y), "Expected %s <= %s (%lu <= %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_EQ(_x, _y) SCX_FAIL_IF((_x) != (_y), "Expected %s == %s (%lu == %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_ASSERT(_x) SCX_FAIL_IF(!(_x), "Expected %s to be true (%lu)", \ -+ #_x, (u64)(_x)) -+ -+#define SCX_ECODE_VAL(__ecode) ({ \ -+ u64 __val = 0; \ -+ bool __found = false; \ -+ \ -+ __found = __COMPAT_read_enum("scx_exit_code", #__ecode, &__val); \ -+ SCX_ASSERT(__found); \ -+ (s64)__val; \ -+}) -+ -+#define SCX_KIND_VAL(__kind) ({ \ -+ u64 __val = 0; \ -+ bool __found = false; \ -+ \ -+ __found = __COMPAT_read_enum("scx_exit_kind", #__kind, &__val); \ -+ SCX_ASSERT(__found); \ -+ __val; \ -+}) -+ -+#endif // # __SCX_TEST_H__ -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c -new file mode 100644 -index 000000000000..2ed2991afafe ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c -@@ -0,0 +1,40 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+bool saw_local = false; -+ -+static bool task_is_test(const struct task_struct *p) -+{ -+ return !bpf_strncmp(p->comm, 9, "select_cpu"); -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask(); -+ -+ if (task_is_test(p) && -+ bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) { -+ saw_local = true; -+ } -+ scx_bpf_put_idle_cpumask(idle_mask); -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dfl_ops = { -+ .enqueue = select_cpu_dfl_enqueue, -+ .name = "select_cpu_dfl", -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c -new file mode 100644 -index 000000000000..a53a40c2d2f0 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c -@@ -0,0 +1,72 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dfl.bpf.skel.h" -+#include "scx_test.h" -+ -+#define NUM_CHILDREN 1028 -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dfl *skel; -+ -+ skel = select_cpu_dfl__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dfl *skel = ctx; -+ struct bpf_link *link; -+ pid_t pids[NUM_CHILDREN]; -+ int i, status; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); -+ SCX_EQ(status, 0); -+ } -+ -+ SCX_ASSERT(!skel->bss->saw_local); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dfl *skel = ctx; -+ -+ select_cpu_dfl__destroy(skel); -+} -+ -+struct scx_test select_cpu_dfl = { -+ .name = "select_cpu_dfl", -+ .description = "Verify the default ops.select_cpu() dispatches tasks " -+ "when idles cores are found, and skips ops.enqueue()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dfl) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c -new file mode 100644 -index 000000000000..4bb5abb2d369 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c -@@ -0,0 +1,89 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation, and with the 
SCX_OPS_ENQ_DFL_NO_DISPATCH ops flag -+ * specified. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+bool saw_local = false; -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ bool force_local; /* CPU changed by ops.select_cpu() */ -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+/* Manually specify the signature until the kfunc is added to the scx repo. */ -+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, -+ bool *found) __ksym; -+ -+s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ struct task_ctx *tctx; -+ s32 cpu; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return -ESRCH; -+ } -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, -+ &tctx->force_local); -+ -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ u64 dsq_id = SCX_DSQ_GLOBAL; -+ struct task_ctx *tctx; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ if (tctx->force_local) { -+ dsq_id = SCX_DSQ_LOCAL; -+ tctx->force_local = false; -+ saw_local = true; -+ } -+ -+ scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); -+} -+ -+s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, -+ struct task_struct *p, struct scx_init_task_args *args) -+{ -+ if (bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE)) -+ return 0; -+ else -+ return -ENOMEM; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops 
select_cpu_dfl_nodispatch_ops = { -+ .select_cpu = select_cpu_dfl_nodispatch_select_cpu, -+ .enqueue = select_cpu_dfl_nodispatch_enqueue, -+ .init_task = select_cpu_dfl_nodispatch_init_task, -+ .name = "select_cpu_dfl_nodispatch", -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c -new file mode 100644 -index 000000000000..1d85bf4bf3a3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c -@@ -0,0 +1,72 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dfl_nodispatch.bpf.skel.h" -+#include "scx_test.h" -+ -+#define NUM_CHILDREN 1028 -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dfl_nodispatch *skel; -+ -+ skel = select_cpu_dfl_nodispatch__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dfl_nodispatch *skel = ctx; -+ struct bpf_link *link; -+ pid_t pids[NUM_CHILDREN]; -+ int i, status; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); -+ SCX_EQ(status, 0); -+ } -+ -+ SCX_ASSERT(skel->bss->saw_local); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dfl_nodispatch *skel = ctx; -+ -+ select_cpu_dfl_nodispatch__destroy(skel); -+} -+ -+struct scx_test select_cpu_dfl_nodispatch = { -+ .name = 
"select_cpu_dfl_nodispatch", -+ .description = "Verify behavior of scx_bpf_select_cpu_dfl() in " -+ "ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dfl_nodispatch) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c -new file mode 100644 -index 000000000000..f0b96a4a04b2 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c -@@ -0,0 +1,41 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ u64 dsq_id = SCX_DSQ_LOCAL; -+ s32 cpu = prev_cpu; -+ -+ if (scx_bpf_test_and_clear_cpu_idle(cpu)) -+ goto dispatch; -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ goto dispatch; -+ -+ dsq_id = SCX_DSQ_GLOBAL; -+ cpu = prev_cpu; -+ -+dispatch: -+ scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); -+ return cpu; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dispatch_ops = { -+ .select_cpu = select_cpu_dispatch_select_cpu, -+ .name = "select_cpu_dispatch", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c -new file mode 100644 -index 000000000000..0309ca8785b3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c -@@ -0,0 +1,70 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dispatch.bpf.skel.h" -+#include "scx_test.h" -+ -+#define NUM_CHILDREN 1028 -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dispatch *skel; -+ -+ skel = select_cpu_dispatch__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dispatch *skel = ctx; -+ struct bpf_link *link; -+ pid_t pids[NUM_CHILDREN]; -+ int i, status; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); -+ SCX_EQ(status, 0); -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dispatch *skel = ctx; -+ -+ select_cpu_dispatch__destroy(skel); -+} -+ -+struct scx_test select_cpu_dispatch = { -+ .name = "select_cpu_dispatch", -+ .description = "Test direct dispatching to built-in DSQs from " -+ "ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dispatch) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c -new file mode 100644 -index 000000000000..7b42ddce0f56 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c -@@ -0,0 +1,37 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. 
and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ /* Dispatching to a random DSQ should fail. */ -+ scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = { -+ .select_cpu = select_cpu_dispatch_bad_dsq_select_cpu, -+ .exit = select_cpu_dispatch_bad_dsq_exit, -+ .name = "select_cpu_dispatch_bad_dsq", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c -new file mode 100644 -index 000000000000..47eb6ed7627d ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c -@@ -0,0 +1,56 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dispatch_bad_dsq.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dispatch_bad_dsq *skel; -+ -+ skel = select_cpu_dispatch_bad_dsq__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dispatch_bad_dsq *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dispatch_bad_dsq *skel = ctx; -+ -+ select_cpu_dispatch_bad_dsq__destroy(skel); -+} -+ -+struct scx_test select_cpu_dispatch_bad_dsq = { -+ .name = "select_cpu_dispatch_bad_dsq", -+ .description = "Verify graceful failure if we direct-dispatch to a " -+ "bogus DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dispatch_bad_dsq) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c -new file mode 100644 -index 000000000000..653e3dc0b4dc ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c -@@ -0,0 +1,38 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ /* Dispatching twice in a row is disallowed. */ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = { -+ .select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu, -+ .exit = select_cpu_dispatch_dbl_dsp_exit, -+ .name = "select_cpu_dispatch_dbl_dsp", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c -new file mode 100644 -index 000000000000..48ff028a3c46 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c -@@ -0,0 +1,56 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dispatch_dbl_dsp *skel; -+ -+ skel = select_cpu_dispatch_dbl_dsp__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dispatch_dbl_dsp *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dispatch_dbl_dsp *skel = ctx; -+ -+ select_cpu_dispatch_dbl_dsp__destroy(skel); -+} -+ -+struct scx_test select_cpu_dispatch_dbl_dsp = { -+ .name = "select_cpu_dispatch_dbl_dsp", -+ .description = "Verify graceful failure if we dispatch twice to a " -+ "DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dispatch_dbl_dsp) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c -new file mode 100644 -index 000000000000..7f3ebf4fc2ea ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c -@@ -0,0 +1,92 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates that enqueue flags are properly stored and -+ * applied at dispatch time when a task is directly dispatched from -+ * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and -+ * making the test a very basic vtime scheduler. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet
-+ * Copyright (c) 2024 Tejun Heo
-+ */
-+
-+#include <scx/common.bpf.h>
-+
-+char _license[] SEC("license") = "GPL";
-+
-+/* Set to true once ops.dispatch() has consumed a task from VTIME_DSQ. */
-+volatile bool consumed;
-+
-+/* Highest dsq_vtime observed so far in ops.running(). */
-+static u64 vtime_now;
-+
-+#define VTIME_DSQ 0
-+
-+/* Return true if vtime @a is before @b; the signed delta tolerates wraparound. */
-+static inline bool vtime_before(u64 a, u64 b)
-+{
-+	return (s64)(a - b) < 0;
-+}
-+
-+/*
-+ * Return @p's dsq_vtime, clamped so that it is never more than one
-+ * SCX_SLICE_DFL behind vtime_now.
-+ */
-+static inline u64 task_vtime(const struct task_struct *p)
-+{
-+	u64 vtime = p->scx.dsq_vtime;
-+
-+	if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
-+		return vtime_now - SCX_SLICE_DFL;
-+	else
-+		return vtime;
-+}
-+
-+/*
-+ * ops.select_cpu() callback: pick an idle CPU from @p's allowed mask, falling
-+ * back to @prev_cpu, and in either case dispatch @p directly to VTIME_DSQ
-+ * with its (clamped) vtime. Returns the chosen CPU.
-+ */
-+s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p,
-+		   s32 prev_cpu, u64 wake_flags)
-+{
-+	s32 cpu;
-+
-+	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
-+	if (cpu >= 0)
-+		goto ddsp;
-+
-+	cpu = prev_cpu;
-+	scx_bpf_test_and_clear_cpu_idle(cpu);
-+ddsp:
-+	scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0);
-+	return cpu;
-+}
-+
-+/* ops.dispatch() callback: consume from VTIME_DSQ and flag success for the test. */
-+void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p)
-+{
-+	if (scx_bpf_consume(VTIME_DSQ))
-+		consumed = true;
-+}
-+
-+/* ops.running() callback: advance vtime_now to @p's dsq_vtime if it is later. */
-+void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p)
-+{
-+	if (vtime_before(vtime_now, p->scx.dsq_vtime))
-+		vtime_now = p->scx.dsq_vtime;
-+}
-+
-+/*
-+ * ops.stopping() callback: charge @p for the part of the default slice it
-+ * used, scaled by 100 / weight.
-+ */
-+void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p,
-+		    bool runnable)
-+{
-+	p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
-+}
-+
-+/* ops.enable() callback: start @p at the current vtime_now. */
-+void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p)
-+{
-+	p->scx.dsq_vtime = vtime_now;
-+}
-+
-+/* ops.init() callback: create VTIME_DSQ; returns scx_bpf_create_dsq()'s result. */
-+s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init)
-+{
-+	return scx_bpf_create_dsq(VTIME_DSQ, -1);
-+}
-+
-+SEC(".struct_ops.link")
-+struct sched_ext_ops select_cpu_vtime_ops = {
-+	.select_cpu		= select_cpu_vtime_select_cpu,
-+	.dispatch		= select_cpu_vtime_dispatch,
-+	.running		= select_cpu_vtime_running,
-+	.stopping		= select_cpu_vtime_stopping,
-+	.enable			= select_cpu_vtime_enable,
-+	.init			= select_cpu_vtime_init,
-+	.name			= "select_cpu_vtime",
-+	.timeout_ms		= 1000U,
-+};
-diff
--git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c
-new file mode 100644
-index 000000000000..b4629c2364f5
---- /dev/null
-+++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c
-@@ -0,0 +1,59 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
-+ * Copyright (c) 2024 David Vernet
-+ * Copyright (c) 2024 Tejun Heo
-+ */
-+#include <bpf/bpf.h>
-+#include <scx/common.h>
-+#include <sys/wait.h>
-+#include <unistd.h>
-+#include "select_cpu_vtime.bpf.skel.h"
-+#include "scx_test.h"
-+
-+/* Open and load the BPF skeleton and hand it to the harness through @ctx. */
-+static enum scx_test_status setup(void **ctx)
-+{
-+	struct select_cpu_vtime *skel;
-+
-+	skel = select_cpu_vtime__open_and_load();
-+	SCX_FAIL_IF(!skel, "Failed to open and load skel");
-+	*ctx = skel;
-+
-+	return SCX_TEST_PASS;
-+}
-+
-+/*
-+ * Check that the BPF side's "consumed" flag is clear before attaching, attach
-+ * the scheduler, wait one second, then check that the flag has been set.
-+ */
-+static enum scx_test_status run(void *ctx)
-+{
-+	struct select_cpu_vtime *skel = ctx;
-+	struct bpf_link *link;
-+
-+	SCX_ASSERT(!skel->bss->consumed);
-+
-+	link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops);
-+	SCX_FAIL_IF(!link, "Failed to attach scheduler");
-+
-+	sleep(1);
-+
-+	/*
-+	 * NOTE(review): if SCX_ASSERT() returns early on failure, the link
-+	 * below is never destroyed - confirm against scx_test.h.
-+	 */
-+	SCX_ASSERT(skel->bss->consumed);
-+
-+	bpf_link__destroy(link);
-+
-+	return SCX_TEST_PASS;
-+}
-+
-+/* Destroy the skeleton created by setup(). */
-+static void cleanup(void *ctx)
-+{
-+	struct select_cpu_vtime *skel = ctx;
-+
-+	select_cpu_vtime__destroy(skel);
-+}
-+
-+struct scx_test select_cpu_vtime = {
-+	.name = "select_cpu_vtime",
-+	.description = "Test doing direct vtime-dispatching from "
-+		       "ops.select_cpu(), to a non-built-in DSQ",
-+	.setup = setup,
-+	.run = run,
-+	.cleanup = cleanup,
-+};
-+REGISTER_SCX_TEST(&select_cpu_vtime)
-diff --git a/tools/testing/selftests/sched_ext/test_example.c b/tools/testing/selftests/sched_ext/test_example.c
-new file mode 100644
-index 000000000000..ce36cdf03cdc
---- /dev/null
-+++ b/tools/testing/selftests/sched_ext/test_example.c
-@@ -0,0 +1,49 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
-+ * Copyright (c) 2024 Tejun Heo
-+ * Copyright (c) 2024 David Vernet
-+ */
-+#include <bpf/bpf.h>
-+#include <scx/common.h>
-+#include "scx_test.h"
-+
-+/* Flags recording which callbacks the harness has invoked so far. */
-+static bool setup_called = false;
-+static bool run_called = false;
-+static bool cleanup_called = false;
-+
-+static int context = 10;
-+
-+/* Mark setup as called and pass the address of context through @ctx. */
-+static enum scx_test_status setup(void **ctx)
-+{
-+	setup_called = true;
-+	*ctx = &context;
-+
-+	return SCX_TEST_PASS;
-+}
-+
-+/*
-+ * Verify that only setup() has run so far and that @ctx points at the value
-+ * setup() stored, then mark run as called.
-+ */
-+static enum scx_test_status run(void *ctx)
-+{
-+	int *arg = ctx;
-+
-+	SCX_ASSERT(setup_called);
-+	SCX_ASSERT(!run_called && !cleanup_called);
-+	SCX_EQ(*arg, context);
-+
-+	run_called = true;
-+	return SCX_TEST_PASS;
-+}
-+
-+/* Bug out unless run() has been called and cleanup() has not. */
-+static void cleanup(void *ctx)
-+{
-+	SCX_BUG_ON(!run_called || cleanup_called, "Wrong callbacks invoked");
-+}
-+
-+struct scx_test example = {
-+	.name = "example",
-+	.description = "Validate the basic function of the test suite itself",
-+	.setup = setup,
-+	.run = run,
-+	.cleanup = cleanup,
-+};
-+REGISTER_SCX_TEST(&example)
-diff --git a/tools/testing/selftests/sched_ext/util.c b/tools/testing/selftests/sched_ext/util.c
-new file mode 100644
-index 000000000000..e47769c91918
---- /dev/null
-+++ b/tools/testing/selftests/sched_ext/util.c
-@@ -0,0 +1,71 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
-+ * Copyright (c) 2024 David Vernet
-+ */
-+#include <errno.h>
-+#include <fcntl.h>
-+#include <stdio.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <unistd.h>
-+
-+/*
-+ * Read up to @max_len - 1 bytes from @path into @buf and NUL-terminate it.
-+ *
-+ * Returns read len on success, or -errno on failure.
-+ */
-+static ssize_t read_text(const char *path, char *buf, size_t max_len)
-+{
-+	ssize_t len;
-+	int err = 0;
-+	int fd;
-+
-+	/* max_len - 1 below would wrap around for an empty buffer. */
-+	if (!max_len)
-+		return -EINVAL;
-+
-+	fd = open(path, O_RDONLY);
-+	if (fd < 0)
-+		return -errno;
-+
-+	len = read(fd, buf, max_len - 1);
-+	if (len < 0)
-+		err = errno;	/* save it: close() may overwrite errno */
-+	else
-+		buf[len] = 0;
-+
-+	close(fd);
-+	return len < 0 ? -err : len;
-+}
-+
-+/* Returns written len on success, or -errno on failure.
*/ -+static ssize_t write_text(const char *path, char *buf, ssize_t len) -+{ -+ int fd; -+ ssize_t written; -+ -+ fd = open(path, O_WRONLY | O_APPEND); -+ if (fd < 0) -+ return -errno; -+ -+ written = write(fd, buf, len); -+ close(fd); -+ return written < 0 ? -errno : written; -+} -+ -+long file_read_long(const char *path) -+{ -+ char buf[128]; -+ -+ -+ if (read_text(path, buf, sizeof(buf)) <= 0) -+ return -1; -+ -+ return atol(buf); -+} -+ -+int file_write_long(const char *path, long val) -+{ -+ char buf[64]; -+ int ret; -+ -+ ret = sprintf(buf, "%lu", val); -+ if (ret < 0) -+ return ret; -+ -+ if (write_text(path, buf, sizeof(buf)) <= 0) -+ return -1; -+ -+ return 0; -+} -diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h -new file mode 100644 -index 000000000000..bc13dfec1267 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/util.h -@@ -0,0 +1,13 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#ifndef __SCX_TEST_UTIL_H__ -+#define __SCX_TEST_UTIL_H__ -+ -+long file_read_long(const char *path); -+int file_write_long(const char *path, long val); -+ -+#endif // __SCX_TEST_H__ --- -2.47.0.rc0