From 1b97bf39299053e524957ef188122ba3a912b722 Mon Sep 17 00:00:00 2001
From: ferrreo
Date: Sun, 12 Mar 2023 20:10:24 +0000
Subject: [PATCH] Remove nice patch as included in eevdf

---
 patches/0002-cfs-nice.patch                  | 990 ------------------
 .../{0003-eevdf.patch => 0002-eevdf.patch}   |   0
 patches/{0004-bore.patch => 0003-bore.patch} |   0
 patches/{0005-hdr.patch => 0004-hdr.patch}   |   0
 scripts/patch.sh                             |   8 +-
 5 files changed, 3 insertions(+), 995 deletions(-)
 delete mode 100644 patches/0002-cfs-nice.patch
 rename patches/{0003-eevdf.patch => 0002-eevdf.patch} (100%)
 rename patches/{0004-bore.patch => 0003-bore.patch} (100%)
 rename patches/{0005-hdr.patch => 0004-hdr.patch} (100%)

diff --git a/patches/0002-cfs-nice.patch b/patches/0002-cfs-nice.patch
deleted file mode 100644
index 1010e26..0000000
--- a/patches/0002-cfs-nice.patch
+++ /dev/null
@@ -1,990 +0,0 @@
-From 10300b929dc0a52e458b6bcd9af801f6df967d42 Mon Sep 17 00:00:00 2001
-From: Peter Jung
-Date: Fri, 24 Feb 2023 11:16:15 +0100
-Subject: [PATCH] Add latency priority for CFS class
-
-This patchset restarts the work about adding a latency priority to describe
-the latency tolerance of cfs tasks.
-
-Patch [1] is a new one that has been added with v6. It fixes an
-unfairness for low prio tasks because of wakeup_gran() being bigger
-than the maximum vruntime credit that a waking task can keep after
-sleeping.
-
-The patches [2-4] have been done by Parth:
-https://lore.kernel.org/lkml/20200228090755.22829-1-parth@linux.ibm.com/
-
-I have just rebased and moved the setting of latency priority outside the
-priority update. I have removed the reviewed tag because the patches
-are 2 years old.
-
-This aims to be a generic interface, and the following patches are one use
-of it to improve the scheduling latency of cfs tasks.
-
-Patch [5] uses latency nice priority to define a latency offset
-and then decide if a cfs task can or should preempt the currently
-running task. The patch gives some test results with cyclictest and
-hackbench to highlight the benefit of latency priority for short
-interactive tasks or long intensive tasks.
-
-Patch [6] adds support of latency nice priority to task groups by
-adding a cpu.latency.nice field. The range is [-20:19], as for setting task
-latency priority.
-
-Patch [7] makes sched_core take the latency offset into account.
-
-Patch [8] adds an rb tree to cover some corner cases where a latency
-sensitive task (priority < 0) is preempted by a high priority task (RT/DL)
-or fails to preempt one. This patch ensures that tasks will have at least
-a slice of sched_min_granularity in priority at wakeup.
-
-I have also backported the patchset on a dragonboard RB3 with an android
-mainline kernel based on v5.18 for a quick test. I have used the
-TouchLatency app, which is part of AOSP and described as a very good
-test to highlight jitter and jank frame sources of a system [1].
-In addition to the app, I have added some short running tasks waking up
-regularly (to use the 8 cpus for 4 ms every 37777us) to stress the system
-without overloading it (and disabling EAS). The first results show that the
-patchset helps to reduce the missed deadline frames from 5% to less than
-0.1% when the cpu.latency.nice of the task groups is set. I haven't rerun
-the test with the latest version.
-
-I have also tested the patchset with the modified version of the alsa
-latency test that has been shared by Tim. The test quickly xruns with
-the default latency nice priority of 0 but is able to run without underruns
-with a latency nice of -20 and hackbench running simultaneously.
-
-While preparing version 8, I evaluated the benefit of using an
-augmented rbtree instead of adding an rbtree for latency sensitive entities,
-which was a relevant suggestion made by PeterZ. Although the augmented
-rbtree makes it possible to sort additional information in the tree with
-limited overhead, it has more impact on legacy use cases (latency_nice >= 0)
-because the augmented callbacks are always called to maintain this
-additional information even when there are no sensitive tasks. In such
-cases, the dedicated rbtree remains empty and the overhead is reduced to
-loading a cached null node pointer. Nevertheless, we might want to
-reconsider the augmented rbtree once the use of negative latency_nice is
-more widely deployed. For now, the different tests that I have done
-have not shown improvements with the augmented rbtree.
-
-Below are some hackbench results (from v10):
-                     2 rbtrees    augmented rbtree     augmented rbtree
-                                  sorted by vruntime   sorted by wakeup_vruntime
-sched pipe
-avg                  26311.000    25976.667            25839.556
-stdev                0.15 %       0.28 %               0.24 %
-vs tip               0.50 %       -0.78 %              -1.31 %
-hackbench 1 group
-avg                  1.315        1.344                1.359
-stdev                0.88 %       1.55 %               1.82 %
-vs tip               -0.47 %      -2.68 %              -3.87 %
-hackbench 4 groups
-avg                  1.339        1.365                1.367
-stdev                2.39 %       2.26 %               3.58 %
-vs tip               -0.08 %      -2.01 %              -2.22 %
-hackbench 8 groups
-avg                  1.233        1.286                1.301
-stdev                0.74 %       1.09 %               1.52 %
-vs tip               0.29 %       -4.05 %              -5.27 %
-hackbench 16 groups
-avg                  1.268        1.313                1.319
-stdev                0.85 %       1.60 %               0.68 %
-vs tip               -0.02 %      -3.56 %              -4.01 %
-
-[1] https://source.android.com/docs/core/debug/eval_perf#touchlatency
-
-Change since v11:
-- init latency_node of the task group entity
-
-Change since v10:
-- remove the sched_latency_to_weight array and use calc_latency_offset() instead
-- save latency_prio for task groups instead of the latency offset
-- enqueue back an entity when changing the latency nice prio of a task group
-
-Change since v9:
-- Rebase
-- add tags
-
-Change since v8:
-- Rename get_sched_latency to get_sleep_latency
-- move the latency nice defines to sched/prio.h and fix the latency_prio init value
-- Fix typos and comments
-
-Change since v7:
-- Replaced se->on_latency by using RB_CLEAR_NODE() and RB_EMPTY_NODE()
-- Clarify the limit behavior of the cgroup cpu.latency.nice
-
-Change since v6:
-- Fix compilation error for !CONFIG_SCHED_DEBUG
-
-Change since v5:
-- Add patch 1 to fix unfairness for low prio tasks. This has been
-  discovered while studying Youssef's test results with latency nice,
-  which were hitting the same problem.
-- Fixed the latency_offset computation to take into account
-  GENTLE_FAIR_SLEEPERS. This had disappeared with v2 and has been raised
-  by Youssef's tests.
-- Reworked and optimized how latency_offset is used to check for
-  preempting the current task at wakeup and tick. This covers more cases too.
-- Add patch 9 to remove check_preempt_from_others(), which is not needed
-  anymore with the rb tree.
-
-Change since v4:
-- Removed permission checks for setting the latency priority. This enables
-  users without elevated privileges, like audio applications, to set their
-  latency priority, as requested by Tim.
-- Removed cpu.latency and replaced it by cpu.latency.nice so we keep a
-  generic interface not tied to latency_offset, which can be used to
-  implement other latency features.
-- Added an entry in Documentation/admin-guide/cgroup-v2.rst to describe
-  cpu.latency.nice.
-- Fix some typos.
-
-Change since v3:
-- Fix 2 compilation warnings raised by kernel test robot
-
-Change since v2:
-- Set a latency_offset field instead of saving a weight and computing it
-  on the fly.
-- Make latency_offset available for task groups: cpu.latency
-- Fix some corner cases to make latency sensitive tasks schedule first and
-  add an rb tree for latency sensitive tasks.
-
-Change since v1:
-- fix typos
-- move some code into the right patch to make bisect happy
-- simplify and fix how the weight is computed
-- added support of sched core (patch 7)
-
-Parth Shah (3):
-  sched: Introduce latency-nice as a per-task attribute
-  sched/core: Propagate parent task's latency requirements to the child
-    task
-  sched: Allow sched_{get,set}attr to change latency_nice of the task
-
-Vincent Guittot (5):
-  sched/fair: fix unfairness at wakeup
-  sched/fair: Take into account latency priority at wakeup
-  sched/fair: Add sched group latency support
-  sched/core: Support latency priority with sched core
-  sched/fair: Add latency list
-
-Signed-off-by: Peter Jung
----
- Documentation/admin-guide/cgroup-v2.rst |  10 ++
- include/linux/sched.h                   |   4 +
- include/linux/sched/prio.h              |  27 +++
- include/uapi/linux/sched.h              |   4 +-
- include/uapi/linux/sched/types.h        |  19 ++
- init/init_task.c                        |   1 +
- kernel/sched/core.c                     |  65 +++++++
- kernel/sched/debug.c                    |   1 +
- kernel/sched/fair.c                     | 222 ++++++++++++++++++++----
- kernel/sched/sched.h                    |  50 +++++-
- tools/include/uapi/linux/sched.h        |   4 +-
- 11 files changed, 371 insertions(+), 36 deletions(-)
-
-diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
-index 74cec76be9f2..2e511d4a4c6a 100644
---- a/Documentation/admin-guide/cgroup-v2.rst
-+++ b/Documentation/admin-guide/cgroup-v2.rst
-@@ -1118,6 +1118,16 @@ All time durations are in microseconds.
- 	values similar to the sched_setattr(2). This maximum utilization
- 	value is used to clamp the task specific maximum utilization clamp.
- 
-+  cpu.latency.nice
-+	A read-write single value file which exists on non-root
-+	cgroups. The default is "0".
-+
-+	The nice value is in the range [-20, 19].
-+
-+	This interface file allows reading and setting latency using the
-+	same values used by sched_setattr(2). The latency_nice of a group is
-+	used to limit the impact of the latency_nice of a task outside the
-+	group.
- 
- 
- Memory
-diff --git a/include/linux/sched.h b/include/linux/sched.h
-index 28ce1be0ba47..df219c7cd6aa 100644
---- a/include/linux/sched.h
-+++ b/include/linux/sched.h
-@@ -548,6 +548,7 @@ struct sched_entity {
- 	/* For load-balancing: */
- 	struct load_weight	load;
- 	struct rb_node		run_node;
-+	struct rb_node		latency_node;
- 	struct list_head	group_node;
- 	unsigned int		on_rq;
- 
-@@ -571,6 +572,8 @@ struct sched_entity {
- 	/* cached value of my_q->h_nr_running */
- 	unsigned long		runnable_weight;
- #endif
-+	/* preemption offset in ns */
-+	long			latency_offset;
- 
- #ifdef CONFIG_SMP
- 	/*
-@@ -787,6 +790,7 @@ struct task_struct {
- 	int			static_prio;
- 	int			normal_prio;
- 	unsigned int		rt_priority;
-+	int			latency_prio;
- 
- 	struct sched_entity	se;
- 	struct sched_rt_entity	rt;
-diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
-index ab83d85e1183..be79503d86af 100644
---- a/include/linux/sched/prio.h
-+++ b/include/linux/sched/prio.h
-@@ -42,4 +42,31 @@ static inline long rlimit_to_nice(long prio)
- 	return (MAX_NICE - prio + 1);
- }
- 
-+/*
-+ * Latency nice is meant to provide scheduler hints about the relative
-+ * latency requirements of a task with respect to other tasks.
-+ * Thus a task with latency_nice == 19 can be hinted as a task with no
-+ * latency requirements, in contrast to a task with latency_nice == -20,
-+ * which should be given priority in terms of lower latency.
-+ */
-+#define MAX_LATENCY_NICE	19
-+#define MIN_LATENCY_NICE	-20
-+
-+#define LATENCY_NICE_WIDTH	\
-+	(MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1)
-+
-+/*
-+ * Default tasks should be treated as a task with latency_nice = 0.
-+ */
-+#define DEFAULT_LATENCY_NICE	0
-+#define DEFAULT_LATENCY_PRIO	(DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
-+
-+/*
-+ * Convert user-nice values [ -20 ... 0 ... 19 ]
-+ * to static latency [ 0..39 ],
-+ * and back.
-+ */
-+#define NICE_TO_LATENCY(nice)	((nice) + DEFAULT_LATENCY_PRIO)
-+#define LATENCY_TO_NICE(prio)	((prio) - DEFAULT_LATENCY_PRIO)
-+
- #endif /* _LINUX_SCHED_PRIO_H */
-diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
-index 3bac0a8ceab2..b2e932c25be6 100644
---- a/include/uapi/linux/sched.h
-+++ b/include/uapi/linux/sched.h
-@@ -132,6 +132,7 @@ struct clone_args {
- #define SCHED_FLAG_KEEP_PARAMS		0x10
- #define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
- #define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
-+#define SCHED_FLAG_LATENCY_NICE		0x80
- 
- #define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
- 				 SCHED_FLAG_KEEP_PARAMS)
-@@ -143,6 +144,7 @@ struct clone_args {
- 			 SCHED_FLAG_RECLAIM		| \
- 			 SCHED_FLAG_DL_OVERRUN		| \
- 			 SCHED_FLAG_KEEP_ALL		| \
--			 SCHED_FLAG_UTIL_CLAMP)
-+			 SCHED_FLAG_UTIL_CLAMP		| \
-+			 SCHED_FLAG_LATENCY_NICE)
- 
- #endif /* _UAPI_LINUX_SCHED_H */
-diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
-index f2c4589d4dbf..db1e8199e8c8 100644
---- a/include/uapi/linux/sched/types.h
-+++ b/include/uapi/linux/sched/types.h
-@@ -10,6 +10,7 @@ struct sched_param {
- 
- #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
- #define SCHED_ATTR_SIZE_VER1	56	/* add: util_{min,max} */
-+#define SCHED_ATTR_SIZE_VER2	60	/* add: latency_nice */
- 
- /*
-  * Extended scheduling parameters data structure.
-@@ -98,6 +99,22 @@ struct sched_param {
-  * scheduled on a CPU with no more capacity than the specified value.
-  *
-  * A task utilization boundary can be reset by setting the attribute to -1.
-+ *
-+ * Latency Tolerance Attributes
-+ * ===========================
-+ *
-+ * A subset of sched_attr attributes makes it possible to specify the relative
-+ * latency requirements of a task with respect to the other tasks running/queued
-+ * in the system.
-+ *
-+ * @ sched_latency_nice	task's latency_nice value
-+ *
-+ * The latency_nice of a task can have any value in the range
-+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
-+ *
-+ * A task with latency_nice set to MIN_LATENCY_NICE can be considered
-+ * as a task requiring lower latency, as opposed to a task with a
-+ * higher latency_nice.
- */
- struct sched_attr {
- 	__u32 size;
-@@ -120,6 +137,8 @@ struct sched_attr {
- 	__u32 sched_util_min;
- 	__u32 sched_util_max;
- 
-+	/* latency requirement hints */
-+	__s32 sched_latency_nice;
- };
- 
- #endif /* _UAPI_LINUX_SCHED_TYPES_H */
-diff --git a/init/init_task.c b/init/init_task.c
-index ff6c4b9bfe6b..071deff8dbd1 100644
---- a/init/init_task.c
-+++ b/init/init_task.c
-@@ -78,6 +78,7 @@ struct task_struct init_task
- 	.prio		= MAX_PRIO - 20,
- 	.static_prio	= MAX_PRIO - 20,
- 	.normal_prio	= MAX_PRIO - 20,
-+	.latency_prio	= DEFAULT_LATENCY_PRIO,
- 	.policy		= SCHED_NORMAL,
- 	.cpus_ptr	= &init_task.cpus_mask,
- 	.user_cpus_ptr	= NULL,
-diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index 5237639786b7..e1a9f9898b30 100644
---- a/kernel/sched/core.c
-+++ b/kernel/sched/core.c
-@@ -1283,6 +1283,11 @@ static void set_load_weight(struct task_struct *p, bool update_load)
- 	}
- }
- 
-+static void set_latency_offset(struct task_struct *p)
-+{
-+	p->se.latency_offset = calc_latency_offset(p->latency_prio);
-+}
-+
- #ifdef CONFIG_UCLAMP_TASK
- /*
-  * Serializes updates of utilization clamp values
-@@ -4432,6 +4437,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
- 	p->se.dur_avg			= 0;
- 	p->se.prev_sleep_sum_runtime	= 0;
- 	INIT_LIST_HEAD(&p->se.group_node);
-+	RB_CLEAR_NODE(&p->se.latency_node);
- 
- #ifdef CONFIG_FAIR_GROUP_SCHED
- 	p->se.cfs_rq			= NULL;
-@@ -4684,6 +4690,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
- 		p->prio = p->normal_prio = p->static_prio;
- 		set_load_weight(p, false);
- 
-+		p->latency_prio = NICE_TO_LATENCY(0);
-+		set_latency_offset(p);
-+
- 		/*
- 		 * We don't need the reset flag anymore after the fork. It has
- 		 * fulfilled its duty:
-@@ -7444,6 +7453,16 @@ static void __setscheduler_params(struct task_struct *p,
- 	p->rt_priority = attr->sched_priority;
- 	p->normal_prio = normal_prio(p);
- 	set_load_weight(p, true);
-+
-+}
-+
-+static void __setscheduler_latency(struct task_struct *p,
-+				   const struct sched_attr *attr)
-+{
-+	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
-+		p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice);
-+		set_latency_offset(p);
-+	}
- }
- 
- /*
-@@ -7586,6 +7605,13 @@ static int __sched_setscheduler(struct task_struct *p,
- 			return retval;
- 	}
- 
-+	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
-+		if (attr->sched_latency_nice > MAX_LATENCY_NICE)
-+			return -EINVAL;
-+		if (attr->sched_latency_nice < MIN_LATENCY_NICE)
-+			return -EINVAL;
-+	}
-+
- 	if (pi)
- 		cpuset_read_lock();
- 
-@@ -7620,6 +7646,9 @@ static int __sched_setscheduler(struct task_struct *p,
- 			goto change;
- 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
- 			goto change;
-+		if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
-+		    attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio))
-+			goto change;
- 
- 		p->sched_reset_on_fork = reset_on_fork;
- 		retval = 0;
-@@ -7708,6 +7737,7 @@ static int __sched_setscheduler(struct task_struct *p,
- 		__setscheduler_params(p, attr);
- 		__setscheduler_prio(p, newprio);
- 	}
-+	__setscheduler_latency(p, attr);
- 	__setscheduler_uclamp(p, attr);
- 
- 	if (queued) {
-@@ -7918,6 +7948,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
- 	    size < SCHED_ATTR_SIZE_VER1)
- 		return -EINVAL;
- 
-+	if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
-+	    size < SCHED_ATTR_SIZE_VER2)
-+		return -EINVAL;
- 	/*
- 	 * XXX: Do we want to be lenient like existing syscalls; or do we want
- 	 * to be strict and return an error on out-of-bounds values?
-@@ -8155,6 +8188,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- 	get_params(p, &kattr);
- 	kattr.sched_flags &= SCHED_FLAG_ALL;
- 
-+	kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio);
-+
- #ifdef CONFIG_UCLAMP_TASK
- 	/*
- 	 * This could race with another potential updater, but this is fine
-@@ -11027,6 +11062,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
- {
- 	return sched_group_set_idle(css_tg(css), idle);
- }
-+
-+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
-+				     struct cftype *cft)
-+{
-+	return LATENCY_TO_NICE(css_tg(css)->latency_prio);
-+}
-+
-+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
-+				      struct cftype *cft, s64 nice)
-+{
-+	int prio;
-+
-+	if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
-+		return -ERANGE;
-+
-+	prio = NICE_TO_LATENCY(nice);
-+
-+	return sched_group_set_latency(css_tg(css), prio);
-+}
- #endif
- 
- static struct cftype cpu_legacy_files[] = {
-@@ -11041,6 +11095,11 @@ static struct cftype cpu_legacy_files[] = {
- 		.read_s64 = cpu_idle_read_s64,
- 		.write_s64 = cpu_idle_write_s64,
- 	},
-+	{
-+		.name = "latency.nice",
-+		.read_s64 = cpu_latency_nice_read_s64,
-+		.write_s64 = cpu_latency_nice_write_s64,
-+	},
- #endif
- #ifdef CONFIG_CFS_BANDWIDTH
- 	{
-@@ -11258,6 +11317,12 @@ static struct cftype cpu_files[] = {
- 		.read_s64 = cpu_idle_read_s64,
- 		.write_s64 = cpu_idle_write_s64,
- 	},
-+	{
-+		.name = "latency.nice",
-+		.flags = CFTYPE_NOT_ON_ROOT,
-+		.read_s64 = cpu_latency_nice_read_s64,
-+		.write_s64 = cpu_latency_nice_write_s64,
-+	},
- #endif
- #ifdef CONFIG_CFS_BANDWIDTH
- 	{
-diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
-index 8d64fba16cfe..177934290ec4 100644
---- a/kernel/sched/debug.c
-+++ b/kernel/sched/debug.c
-@@ -1044,6 +1044,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
- #endif
- 	P(policy);
- 	P(prio);
-+	P(latency_prio);
- 	if (task_has_dl_policy(p)) {
- 		P(dl.runtime);
- 		P(dl.deadline);
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index b38a1ce1be49..e0a5049f6b80 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -698,7 +698,85 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
- 
- 	return __node_2_se(last);
- }
-+#endif
-+
-+/**************************************************************
-+ * Scheduling class tree data structure manipulation methods:
-+ * for latency
-+ */
-+
-+static inline bool latency_before(struct sched_entity *a,
-+				  struct sched_entity *b)
-+{
-+	return (s64)(a->vruntime + a->latency_offset - b->vruntime - b->latency_offset) < 0;
-+}
-+
-+#define __latency_node_2_se(node) \
-+	rb_entry((node), struct sched_entity, latency_node)
-+
-+static inline bool __latency_less(struct rb_node *a, const struct rb_node *b)
-+{
-+	return latency_before(__latency_node_2_se(a), __latency_node_2_se(b));
-+}
-+
-+/*
-+ * Enqueue an entity into the latency rb-tree:
-+ */
-+static void __enqueue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-+{
-+
-+	/* Only a latency sensitive entity can be added to the list */
-+	if (se->latency_offset >= 0)
-+		return;
-+
-+	if (!RB_EMPTY_NODE(&se->latency_node))
-+		return;
-+
-+	/*
-+	 * The entity is always added to the latency list at wakeup.
-+	 * Then, a non-waking entity that is put back in the list after an
-+	 * execution time of less than sysctl_sched_min_granularity means that
-+	 * the entity has been preempted by a higher sched class or by an
-+	 * entity with a higher latency constraint.
-+	 * In this case, the entity is also put back in the latency list so
-+	 * that it gets a chance to run first during the next slice.
-+	 */
-+	if (!(flags & ENQUEUE_WAKEUP)) {
-+		u64 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
-+
-+		if (delta_exec >= sysctl_sched_min_granularity)
-+			return;
-+	}
-+
-+	rb_add_cached(&se->latency_node, &cfs_rq->latency_timeline, __latency_less);
-+}
-+
-+/*
-+ * Dequeue an entity from the latency rb-tree and return true if it was really
-+ * part of the rb-tree:
-+ */
-+static bool __dequeue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se)
-+{
-+	if (!RB_EMPTY_NODE(&se->latency_node)) {
-+		rb_erase_cached(&se->latency_node, &cfs_rq->latency_timeline);
-+		RB_CLEAR_NODE(&se->latency_node);
-+		return true;
-+	}
-+
-+	return false;
-+}
-+
-+static struct sched_entity *__pick_first_latency(struct cfs_rq *cfs_rq)
-+{
-+	struct rb_node *left = rb_first_cached(&cfs_rq->latency_timeline);
-+
-+	if (!left)
-+		return NULL;
-+
-+	return __latency_node_2_se(left);
-+}
-+
-+#ifdef CONFIG_SCHED_DEBUG
- /**************************************************************
-  * Scheduling class statistics methods:
-  */
-@@ -4672,33 +4750,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
- 	u64 vruntime = cfs_rq->min_vruntime;
- 	u64 sleep_time;
- 
--	/*
--	 * The 'current' period is already promised to the current tasks,
--	 * however the extra weight of the new task will slow them down a
--	 * little, place the new task so that it fits in the slot that
--	 * stays open at the end.
--	 */
--	if (initial && sched_feat(START_DEBIT))
--		vruntime += sched_vslice(cfs_rq, se);
--
--	/* sleeps up to a single latency don't count. */
--	if (!initial) {
--		unsigned long thresh;
--
--		if (se_is_idle(se))
--			thresh = sysctl_sched_min_granularity;
--		else
--			thresh = sysctl_sched_latency;
--
-+	if (!initial)
-+		/* sleeps up to a single latency don't count. */
-+		vruntime -= get_sleep_latency(se_is_idle(se));
-+	else if (sched_feat(START_DEBIT))
- 		/*
--		 * Halve their sleep time's effect, to allow
--		 * for a gentler effect of sleepers:
-+		 * The 'current' period is already promised to the current tasks,
-+		 * however the extra weight of the new task will slow them down a
-+		 * little, place the new task so that it fits in the slot that
-+		 * stays open at the end.
- 		 */
--		if (sched_feat(GENTLE_FAIR_SLEEPERS))
--			thresh >>= 1;
--
--		vruntime -= thresh;
--	}
-+		vruntime += sched_vslice(cfs_rq, se);
- 
- 	/*
- 	 * Pull vruntime of the entity being placed to the base level of
-@@ -4792,8 +4854,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- 	check_schedstat_required();
- 	update_stats_enqueue_fair(cfs_rq, se, flags);
- 	check_spread(cfs_rq, se);
--	if (!curr)
-+	if (!curr) {
- 		__enqueue_entity(cfs_rq, se);
-+		__enqueue_latency(cfs_rq, se, flags);
-+	}
- 	se->on_rq = 1;
- 
- 	if (cfs_rq->nr_running == 1) {
-@@ -4879,8 +4943,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- 
- 	clear_buddies(cfs_rq, se);
- 
--	if (se != cfs_rq->curr)
-+	if (se != cfs_rq->curr) {
- 		__dequeue_entity(cfs_rq, se);
-+		__dequeue_latency(cfs_rq, se);
-+	}
- 	se->on_rq = 0;
- 	account_entity_dequeue(cfs_rq, se);
- 
-@@ -4911,6 +4977,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- 	update_idle_cfs_rq_clock_pelt(cfs_rq);
- }
- 
-+static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se);
-+
- /*
-  * Preempt the current task with a newly woken task if needed:
-  */
-@@ -4919,7 +4987,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
- {
- 	unsigned long ideal_runtime, delta_exec;
- 	struct sched_entity *se;
--	s64 delta;
-+	s64 delta, offset;
- 
- 	/*
- 	 * When many tasks blow up the sched_period; it is possible that
-@@ -4950,10 +5018,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
- 	se = __pick_first_entity(cfs_rq);
- 	delta = curr->vruntime - se->vruntime;
- 
--	if (delta < 0)
-+	offset = wakeup_latency_gran(curr, se);
-+	if (delta < offset)
- 		return;
- 
--	if (delta > ideal_runtime)
-+	if ((delta > ideal_runtime) ||
-+	    (delta > get_latency_max()))
- 		resched_curr(rq_of(cfs_rq));
- }
- 
-@@ -4971,6 +5041,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
- 		 */
- 		update_stats_wait_end_fair(cfs_rq, se);
- 		__dequeue_entity(cfs_rq, se);
-+		__dequeue_latency(cfs_rq, se);
- 		update_load_avg(cfs_rq, se, UPDATE_TG);
- 	}
- 
-@@ -5009,7 +5080,7 @@ static struct sched_entity *
- pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
- {
- 	struct sched_entity *left = __pick_first_entity(cfs_rq);
--	struct sched_entity *se;
-+	struct sched_entity *latency, *se;
- 
- 	/*
- 	 * If curr is set we have to see if its left of the leftmost entity
-@@ -5051,6 +5122,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
- 		se = cfs_rq->last;
- 	}
- 
-+	/* Check for a latency sensitive entity waiting to run */
-+	latency = __pick_first_latency(cfs_rq);
-+	if (latency && (latency != se) &&
-+	    wakeup_preempt_entity(latency, se) < 1)
-+		se = latency;
-+
- 	return se;
- }
- 
-@@ -5074,6 +5151,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
- 		update_stats_wait_start_fair(cfs_rq, prev);
- 		/* Put 'current' back into the tree. */
- 		__enqueue_entity(cfs_rq, prev);
-+		__enqueue_latency(cfs_rq, prev, 0);
- 		/* in !on_rq case, update occurred at dequeue */
- 		update_load_avg(cfs_rq, prev, 0);
- 	}
-@@ -7735,6 +7813,23 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
- }
- #endif /* CONFIG_SMP */
- 
-+static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se)
-+{
-+	long latency_offset = se->latency_offset;
-+
-+	/*
-+	 * A negative latency offset means that the sched_entity has a latency
-+	 * requirement that needs to be evaluated against other entities.
-+	 * Otherwise, use the latency weight to evaluate how much scheduling
-+	 * delay is acceptable to se.
-+	 */
-+	if ((latency_offset < 0) || (curr->latency_offset < 0))
-+		latency_offset -= curr->latency_offset;
-+	latency_offset = min_t(long, latency_offset, get_latency_max());
-+
-+	return latency_offset;
-+}
-+
- static unsigned long wakeup_gran(struct sched_entity *se)
- {
- 	unsigned long gran = sysctl_sched_wakeup_granularity;
-@@ -7773,11 +7868,24 @@ static int
- wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
- {
- 	s64 gran, vdiff = curr->vruntime - se->vruntime;
-+	s64 offset = wakeup_latency_gran(curr, se);
- 
--	if (vdiff <= 0)
-+	if (vdiff < offset)
- 		return -1;
- 
--	gran = wakeup_gran(se);
-+	gran = offset + wakeup_gran(se);
-+
-+	/*
-+	 * At wakeup, the vruntime of a task is capped to not be older than
-+	 * a sched_latency period compared to min_vruntime. This prevents a
-+	 * long sleeping task from getting unlimited credit at wakeup. Such a
-+	 * waking task has to preempt current in order to not lose its share
-+	 * of CPU bandwidth, but wakeup_gran() can become higher than the
-+	 * scheduling period for a low priority task. Make sure that a long
-+	 * sleeping task will get a chance to preempt current.
-+	 */
-+	gran = min_t(s64, gran, get_latency_max());
-+
- 	if (vdiff > gran)
- 		return 1;
- 
-@@ -11995,6 +12103,9 @@ bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
- 	delta = (s64)(sea->vruntime - seb->vruntime) +
- 		(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
- 
-+	/* Take the latency offset into account */
-+	delta -= wakeup_latency_gran(sea, seb);
-+
- 	return delta > 0;
- }
- #else
-@@ -12265,6 +12376,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
- void init_cfs_rq(struct cfs_rq *cfs_rq)
- {
- 	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
-+	cfs_rq->latency_timeline = RB_ROOT_CACHED;
- 	u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
- #ifdef CONFIG_SMP
- 	raw_spin_lock_init(&cfs_rq->removed.lock);
-@@ -12320,6 +12432,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
- 		goto err;
- 
- 	tg->shares = NICE_0_LOAD;
-+	tg->latency_prio = DEFAULT_LATENCY_PRIO;
- 
- 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
- 
-@@ -12418,6 +12531,10 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
- 	}
- 
- 	se->my_q = cfs_rq;
-+
-+	se->latency_offset = calc_latency_offset(tg->latency_prio);
-+	RB_CLEAR_NODE(&se->latency_node);
-+
- 	/* guarantee group entities always have weight */
- 	update_load_set(&se->load, NICE_0_LOAD);
- 	se->parent = parent;
-@@ -12548,6 +12665,45 @@ int sched_group_set_idle(struct task_group *tg, long idle)
- 	return 0;
- }
- 
-+int sched_group_set_latency(struct task_group *tg, int prio)
-+{
-+	long latency_offset;
-+	int i;
-+
-+	if (tg == &root_task_group)
-+		return -EINVAL;
-+
-+	mutex_lock(&shares_mutex);
-+
-+	if (tg->latency_prio == prio) {
-+		mutex_unlock(&shares_mutex);
-+		return 0;
-+	}
-+
-+	tg->latency_prio = prio;
-+	latency_offset = calc_latency_offset(prio);
-+
-+	for_each_possible_cpu(i) {
-+		struct sched_entity *se = tg->se[i];
-+		struct rq *rq = cpu_rq(i);
-+		struct rq_flags rf;
-+		bool queued;
-+
-+		rq_lock_irqsave(rq, &rf);
-+
-+		queued = __dequeue_latency(se->cfs_rq, se);
-+		WRITE_ONCE(se->latency_offset, latency_offset);
-+		if (queued)
-+			__enqueue_latency(se->cfs_rq, se, ENQUEUE_WAKEUP);
-+
-+		rq_unlock_irqrestore(rq, &rf);
-+	}
-+
-+	mutex_unlock(&shares_mutex);
-+	return 0;
-+}
-+
- #else /* CONFIG_FAIR_GROUP_SCHED */
- 
- void free_fair_sched_group(struct task_group *tg) { }
-diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
-index 9e8bb6278604..a9fedf20c869 100644
---- a/kernel/sched/sched.h
-+++ b/kernel/sched/sched.h
-@@ -378,6 +378,8 @@ struct task_group {
- 
- 	/* A positive value indicates that this is a SCHED_IDLE group. */
- 	int			idle;
-+	/* latency priority of the group. */
-+	int			latency_prio;
- 
- #ifdef CONFIG_SMP
- 	/*
-@@ -488,6 +490,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
- 
- extern int sched_group_set_idle(struct task_group *tg, long idle);
- 
-+extern int sched_group_set_latency(struct task_group *tg, int prio);
-+
- #ifdef CONFIG_SMP
- extern void set_task_rq_fair(struct sched_entity *se,
- 			     struct cfs_rq *prev, struct cfs_rq *next);
-@@ -566,6 +570,7 @@ struct cfs_rq {
- #endif
- 
- 	struct rb_root_cached	tasks_timeline;
-+	struct rb_root_cached	latency_timeline;
- 
- 	/*
- 	 * 'curr' points to currently running entity on this cfs_rq.
-@@ -2461,9 +2466,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
- extern const_debug unsigned int sysctl_sched_nr_migrate;
- extern const_debug unsigned int sysctl_sched_migration_cost;
- 
--#ifdef CONFIG_SCHED_DEBUG
- extern unsigned int sysctl_sched_latency;
- extern unsigned int sysctl_sched_min_granularity;
-+#ifdef CONFIG_SCHED_DEBUG
- extern unsigned int sysctl_sched_idle_min_granularity;
- extern unsigned int sysctl_sched_wakeup_granularity;
- extern int sysctl_resched_latency_warn_ms;
-@@ -2478,6 +2483,49 @@ extern unsigned int sysctl_numa_balancing_scan_size;
- extern unsigned int sysctl_numa_balancing_hot_threshold;
- #endif
- 
-+static inline unsigned long get_sleep_latency(bool idle)
-+{
-+	unsigned long thresh;
-+
-+	if (idle)
-+		thresh = sysctl_sched_min_granularity;
-+	else
-+		thresh = sysctl_sched_latency;
-+
-+	/*
-+	 * Halve their sleep time's effect, to allow
-+	 * for a gentler effect of sleepers:
-+	 */
-+	if (sched_feat(GENTLE_FAIR_SLEEPERS))
-+		thresh >>= 1;
-+
-+	return thresh;
-+}
-+
-+/*
-+ * Calculate the latency offset for a priority level.
-+ * We use a linear mapping of the priority into the range:
-+ * [-sysctl_sched_latency:sysctl_sched_latency]
-+ */
-+static inline long calc_latency_offset(int prio)
-+{
-+	return (long)get_sleep_latency(false) * LATENCY_TO_NICE(prio) /
-+		(LATENCY_NICE_WIDTH/2);
-+}
-+
-+static inline unsigned long get_latency_max(void)
-+{
-+	unsigned long thresh = get_sleep_latency(false);
-+
-+	/*
-+	 * If the waking task failed to preempt current, it could have to wait
-+	 * up to sysctl_sched_min_granularity before preempting it during the
-+	 * next tick.
-+	 */
-+	thresh -= sysctl_sched_min_granularity;
-+
-+	return thresh;
-+}
-+
- #ifdef CONFIG_SCHED_HRTICK
- 
- /*
-diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
-index 3bac0a8ceab2..b2e932c25be6 100644
---- a/tools/include/uapi/linux/sched.h
-+++ b/tools/include/uapi/linux/sched.h
-@@ -132,6 +132,7 @@ struct clone_args {
- #define SCHED_FLAG_KEEP_PARAMS		0x10
- #define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
- #define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
-+#define SCHED_FLAG_LATENCY_NICE		0x80
- 
- #define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
- 				 SCHED_FLAG_KEEP_PARAMS)
-@@ -143,6 +144,7 @@ struct clone_args {
- 			 SCHED_FLAG_RECLAIM		| \
- 			 SCHED_FLAG_DL_OVERRUN		| \
- 			 SCHED_FLAG_KEEP_ALL		| \
--			 SCHED_FLAG_UTIL_CLAMP)
-+			 SCHED_FLAG_UTIL_CLAMP		| \
-+			 SCHED_FLAG_LATENCY_NICE)
- 
- #endif /* _UAPI_LINUX_SCHED_H */
--- 
-2.39.2
diff --git a/patches/0003-eevdf.patch b/patches/0002-eevdf.patch
similarity index 100%
rename from patches/0003-eevdf.patch
rename to patches/0002-eevdf.patch
diff --git a/patches/0004-bore.patch b/patches/0003-bore.patch
similarity index 100%
rename from patches/0004-bore.patch
rename to patches/0003-bore.patch
diff --git a/patches/0005-hdr.patch b/patches/0004-hdr.patch
similarity index 100%
rename from patches/0005-hdr.patch
rename to patches/0004-hdr.patch
diff --git a/scripts/patch.sh b/scripts/patch.sh
index fb03347..9fdf539 100755
--- a/scripts/patch.sh
+++ b/scripts/patch.sh
@@ -5,14 +5,12 @@ echo "Pika Kernel - Applying patches"
 # Cachy patches are here: https://github.com/CachyOS/kernel-patches/
 # orig patch from cachy - 0001-cachyos-base-all.patch
 patch -Np1 < "../patches/0001-cachy-all.patch"
-# orig patch from cachy - 0001-Add-latency-priority-for-CFS-class.patch
-patch -Np1 < "../patches/0002-cfs-nice.patch"
 # orig patch from cachy
-patch -Np1 < "../patches/0003-eevdf.patch"
+patch -Np1 < "../patches/0002-eevdf.patch"
# orig patch from cachy - 0001-bore-eevdf.patch
-patch -Np1 < "../patches/0004-bore.patch"
+patch -Np1 < "../patches/0003-bore.patch"
 # HDR patch - from cachy (but they deleted it)
-patch -Np1 < "../patches/0005-hdr.patch"
+patch -Np1 < "../patches/0004-hdr.patch"
 # Nobara patches are here: https://github.com/sammilucia/nobara-kernel-fork
 # Extra Leigon laptop goodies
 patch -Np1 < "../patches/0001-Add-legion-laptop-v0.1.patch"
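Editor's note: the latency_nice -> latency_offset mapping done by calc_latency_offset() in the
removed patch is easy to check outside the kernel. Below is a minimal userspace sketch of that
linear mapping, not code from this repository: the 24 ms value stands in for a typical
sysctl_sched_latency default and the halving assumes GENTLE_FAIR_SLEEPERS is enabled, so both
are assumptions here.

/*
 * Illustrative userspace sketch of the latency_nice -> preemption-offset
 * mapping from the deleted 0002-cfs-nice.patch. The macros mirror the
 * patch's sched/prio.h defines; sleep_latency_ns is an assumed default
 * (sysctl_sched_latency = 24 ms, halved for GENTLE_FAIR_SLEEPERS).
 */
#include <stdio.h>

#define MAX_LATENCY_NICE	19
#define MIN_LATENCY_NICE	-20
#define LATENCY_NICE_WIDTH	(MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1)
#define DEFAULT_LATENCY_NICE	0
#define DEFAULT_LATENCY_PRIO	(DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
#define NICE_TO_LATENCY(nice)	((nice) + DEFAULT_LATENCY_PRIO)
#define LATENCY_TO_NICE(prio)	((prio) - DEFAULT_LATENCY_PRIO)

/* Assumed: 24 ms sched latency, halved by GENTLE_FAIR_SLEEPERS. */
static const long sleep_latency_ns = 24000000L / 2;

/* Mirrors calc_latency_offset(): linear map into +/-sleep_latency_ns. */
static long calc_latency_offset(int prio)
{
	return sleep_latency_ns * LATENCY_TO_NICE(prio) / (LATENCY_NICE_WIDTH / 2);
}

int main(void)
{
	int nice;

	/* Negative offsets mark latency sensitive tasks (rb-tree candidates). */
	for (nice = MIN_LATENCY_NICE; nice <= MAX_LATENCY_NICE; nice += 13)
		printf("latency_nice %3d -> offset %9ld ns\n",
		       nice, calc_latency_offset(NICE_TO_LATENCY(nice)));
	return 0;
}

With these assumed defaults, latency_nice -20 yields an offset of -12000000 ns and +19 yields
+11400000 ns, which is why only negative-nice entities ever enter the latency rb-tree
(__enqueue_latency() bails out when se->latency_offset >= 0).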