From 78440b24f24a021daf660c0bd212c936e50e5f0a Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Fri, 17 Feb 2023 15:38:09 +0100
Subject: [PATCH] Add latency priority for CFS class

This patchset restarts the work on adding a latency priority to describe
the latency tolerance of cfs tasks.

Patch [1] is a new one that has been added with v6. It fixes an
unfairness for low prio tasks because wakeup_gran() can be bigger
than the maximum vruntime credit that a waking task can keep after
sleeping.

The patches [2-4] have been done by Parth:
https://lore.kernel.org/lkml/20200228090755.22829-1-parth@linux.ibm.com/

I have just rebased them and moved setting the latency priority outside
the priority update. I have removed the reviewed tags because the patches
are 2 years old.

This aims to be a generic interface, and the following patches are one use
of it to improve the scheduling latency of cfs tasks.

Patch [5] uses the latency nice priority to define a latency offset
and then to decide whether a cfs task can or should preempt the currently
running task. The patch gives some test results with cyclictest and
hackbench to highlight the benefit of latency priority for short
interactive tasks or long intensive tasks.
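
Not part of the patchset, but as a minimal sketch of the task interface:
latency nice is set from userspace through sched_setattr() with the new
SCHED_FLAG_LATENCY_NICE flag. There is no glibc wrapper, so a raw syscall
and a local copy of struct sched_attr (the SCHED_ATTR_SIZE_VER2 layout
added below) are assumed:

	#define _GNU_SOURCE
	#include <stdint.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#define SCHED_FLAG_LATENCY_NICE	0x80

	struct sched_attr {
		uint32_t size;
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;
		uint32_t sched_priority;
		uint64_t sched_runtime;
		uint64_t sched_deadline;
		uint64_t sched_period;
		uint32_t sched_util_min;
		uint32_t sched_util_max;
		int32_t  sched_latency_nice;	/* new in SCHED_ATTR_SIZE_VER2 */
	};

	int main(void)
	{
		struct sched_attr attr;

		memset(&attr, 0, sizeof(attr));
		/* anything >= SCHED_ATTR_SIZE_VER2 (60) covers sched_latency_nice */
		attr.size = sizeof(attr);
		attr.sched_policy = 0;			/* SCHED_NORMAL */
		attr.sched_flags = SCHED_FLAG_LATENCY_NICE;
		attr.sched_latency_nice = -20;		/* most latency sensitive */

		/* pid 0 == calling task */
		return syscall(SYS_sched_setattr, 0, &attr, 0);
	}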

Patch [6] adds support for latency nice priority to task groups by
adding a cpu.latency.nice field. The range is [-20:19], as when setting the
task latency priority.
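
For instance (a sketch, not from the patchset; the cgroup path is made up
for illustration), a group's latency nice can be set by writing the new
cpu.latency.nice file:

	#include <fcntl.h>
	#include <unistd.h>

	int set_group_latency_nice(void)
	{
		/* hypothetical cgroup v2 path for an "app" group */
		int fd = open("/sys/fs/cgroup/app/cpu.latency.nice", O_WRONLY);

		if (fd < 0)
			return -1;
		/* accepted range is [-20, 19], default is "0" */
		if (write(fd, "-20", 3) != 3) {
			close(fd);
			return -1;
		}
		return close(fd);
	}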

Patch [7] makes sched_core take the latency offset into account.

Patch [8] adds a rb tree to cover some corner cases where a latency
sensitive task (priority < 0) is preempted by a high priority task (RT/DL)
or fails to preempt one. This patch ensures that such tasks keep their
wakeup priority for at least a slice of sched_min_granularity.
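
For reference, the ordering key of that rb tree is the entity's vruntime
shifted by its latency offset; the tree comparator boils down to the
latency_before() helper from the fair.c hunk below:

	/* entity a runs before entity b if its shifted vruntime is smaller */
	static inline bool latency_before(struct sched_entity *a,
					  struct sched_entity *b)
	{
		return (s64)(a->vruntime + a->latency_offset -
			     b->vruntime - b->latency_offset) < 0;
	}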

Patch [9] removes a check made useless by the latency rb tree.

I have also backported the patchset on a dragonboard RB3 with an android
mainline kernel based on v5.18 for a quick test. I have used the
TouchLatency app, which is part of AOSP and described as a very good
test to highlight jitter and jank frame sources of a system [1].
In addition to the app, I have added some short running tasks waking up
regularly (using the 8 cpus for 4 ms every 37777 us) to stress the system
without overloading it (and with EAS disabled). The first results show that
the patchset helps to reduce the missed deadline frames from 5% to less than
0.1% when the cpu.latency.nice of the task groups is set. I haven't rerun
the test with the latest version.

I have also tested the patchset with the modified version of the alsa
latency test that has been shared by Tim. The test quickly xruns with the
default latency nice priority of 0, but is able to run without underruns
with a latency nice of -20 while hackbench runs simultaneously.

While preparing version 8, I evaluated the benefit of using an augmented
rbtree instead of adding a separate rbtree for latency sensitive entities,
which was a relevant suggestion from PeterZ. Although the augmented rbtree
makes it possible to sort additional information in the tree with limited
overhead, it has more impact on legacy use cases (latency_nice >= 0)
because the augmented callbacks are always called to maintain this
additional information, even when there are no latency sensitive tasks. In
such cases, the dedicated rbtree remains empty and the overhead is reduced
to loading a cached null node pointer. Nevertheless, we might want to
reconsider the augmented rbtree once negative latency_nice is more widely
deployed. For now, the different tests that I have done have not shown
improvements with the augmented rbtree.

Below are some hackbench results:

                       2 rbtrees    augmented rbtree     augmented rbtree
                                    sorted by vruntime   sorted by wakeup_vruntime
sched pipe
  avg                  26311,000    25976,667            25839,556
  stdev                0,15 %       0,28 %               0,24 %
  vs tip               0,50 %       -0,78 %              -1,31 %
hackbench 1 group
  avg                  1,315        1,344                1,359
  stdev                0,88 %       1,55 %               1,82 %
  vs tip               -0,47 %      -2,68 %              -3,87 %
hackbench 4 groups
  avg                  1,339        1,365                1,367
  stdev                2,39 %       2,26 %               3,58 %
  vs tip               -0,08 %      -2,01 %              -2,22 %
hackbench 8 groups
  avg                  1,233        1,286                1,301
  stdev                0,74 %       1,09 %               1,52 %
  vs tip               0,29 %       -4,05 %              -5,27 %
hackbench 16 groups
  avg                  1,268        1,313                1,319
  stdev                0,85 %       1,60 %               0,68 %
  vs tip               -0,02 %      -3,56 %              -4,01 %

[1] https://source.android.com/docs/core/debug/eval_perf#touchlatency

Changes since v9:
- Rebase
- Add tags

Changes since v8:
- Rename get_sched_latency() to get_sleep_latency()
- Move the latency nice defines into sched/prio.h and fix the latency_prio
  init value
- Fix typos and comments

Changes since v7:
- Replaced se->on_latency by using RB_CLEAR_NODE() and RB_EMPTY_NODE()
- Clarify the limit behavior of the cgroup cpu.latency.nice

Changes since v6:
- Fix compilation error for !CONFIG_SCHED_DEBUG

Changes since v5:
- Add patch 1 to fix unfairness for low prio tasks. This has been
  discovered while studying Youssef's test results with latency nice,
  which were hitting the same problem.
- Fixed the latency_offset computation to take into account
  GENTLE_FAIR_SLEEPERS. This had disappeared with v2 and has been raised
  by Youssef's tests.
- Reworked and optimized how latency_offset is used to check for
  preempting the current task at wakeup and tick. This covers more cases
  too.
- Add patch 9 to remove check_preempt_from_others(), which is not needed
  anymore with the rb tree.

Changes since v4:
- Removed permission checks to set latency priority. This enables users
  without elevated privileges, like audio applications, to set their
  latency priority, as requested by Tim.
- Removed cpu.latency and replaced it by cpu.latency.nice so we keep a
  generic interface not tied to latency_offset, which can be used to
  implement other latency features.
- Added an entry in Documentation/admin-guide/cgroup-v2.rst to describe
  cpu.latency.nice.
- Fix some typos.

Changes since v3:
- Fix 2 compilation warnings raised by kernel test robot <lkp@intel.com>

Changes since v2:
- Set a latency_offset field instead of saving a weight and computing it
  on the fly.
- Make latency_offset available for task groups: cpu.latency
- Fix some corner cases to make latency sensitive tasks schedule first and
  add a rb tree for latency sensitive tasks.

Changes since v1:
- Fix typos
- Move some code into the right patch to make bisect happy
- Simplify and fix how the weight is computed
- Added support of sched core in patch 7

Parth Shah (3):
  sched: Introduce latency-nice as a per-task attribute
  sched/core: Propagate parent task's latency requirements to the child
    task
  sched: Allow sched_{get,set}attr to change latency_nice of the task

Vincent Guittot (6):
  sched/fair: fix unfairness at wakeup
  sched/fair: Take into account latency priority at wakeup
  sched/fair: Add sched group latency support
  sched/core: Support latency priority with sched core
  sched/fair: Add latency list
  sched/fair: remove check_preempt_from_others

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 Documentation/admin-guide/cgroup-v2.rst |  10 ++
 include/linux/sched.h                   |   4 +
 include/linux/sched/prio.h              |  27 +++
 include/uapi/linux/sched.h              |   4 +-
 include/uapi/linux/sched/types.h        |  19 +++
 init/init_task.c                        |   1 +
 kernel/sched/core.c                     | 106 ++++++++++++
 kernel/sched/debug.c                    |   1 +
 kernel/sched/fair.c                     | 209 ++++++++++++++++++++----
 kernel/sched/sched.h                    |  45 ++++-
 tools/include/uapi/linux/sched.h        |   4 +-
 11 files changed, 394 insertions(+), 36 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 74cec76be9f2..2e511d4a4c6a 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1118,6 +1118,16 @@ All time durations are in microseconds.
 	values similar to the sched_setattr(2). This maximum utilization
 	value is used to clamp the task specific maximum utilization clamp.
 
+  cpu.latency.nice
+	A read-write single value file which exists on non-root
+	cgroups. The default is "0".
+
+	The nice value is in the range [-20, 19].
+
+	This interface file allows reading and setting latency using the
+	same values used by sched_setattr(2). The latency_nice of a group is
+	used to limit the impact of the latency_nice of a task outside the
+	group.
 
 
 Memory
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 28ce1be0ba47..df219c7cd6aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -548,6 +548,7 @@ struct sched_entity {
 	/* For load-balancing: */
 	struct load_weight		load;
 	struct rb_node			run_node;
+	struct rb_node			latency_node;
 	struct list_head		group_node;
 	unsigned int			on_rq;
 
@@ -571,6 +572,8 @@ struct sched_entity {
 	/* cached value of my_q->h_nr_running */
 	unsigned long			runnable_weight;
 #endif
+	/* preemption offset in ns */
+	long				latency_offset;
 
 #ifdef CONFIG_SMP
 	/*
@@ -787,6 +790,7 @@ struct task_struct {
 	int				static_prio;
 	int				normal_prio;
 	unsigned int			rt_priority;
+	int				latency_prio;
 
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index ab83d85e1183..be79503d86af 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -42,4 +42,31 @@ static inline long rlimit_to_nice(long prio)
 	return (MAX_NICE - prio + 1);
 }
 
+/*
+ * Latency nice is meant to provide scheduler hints about the relative
+ * latency requirements of a task with respect to other tasks.
+ * Thus a task with latency_nice == 19 can be hinted as the task with no
+ * latency requirements, in contrast to the task with latency_nice == -20
+ * which should be given priority in terms of lower latency.
+ */
+#define MAX_LATENCY_NICE	19
+#define MIN_LATENCY_NICE	-20
+
+#define LATENCY_NICE_WIDTH	\
+	(MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1)
+
+/*
+ * Default tasks should be treated as a task with latency_nice = 0.
+ */
+#define DEFAULT_LATENCY_NICE	0
+#define DEFAULT_LATENCY_PRIO	(DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static latency [ 0..39 ],
+ * and back.
+ */
+#define NICE_TO_LATENCY(nice)	((nice) + DEFAULT_LATENCY_PRIO)
+#define LATENCY_TO_NICE(prio)	((prio) - DEFAULT_LATENCY_PRIO)
+
 #endif /* _LINUX_SCHED_PRIO_H */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..b2e932c25be6 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
 #define SCHED_FLAG_KEEP_PARAMS		0x10
 #define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
 #define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
+#define SCHED_FLAG_LATENCY_NICE		0x80
 
 #define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
 				 SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
 			 SCHED_FLAG_RECLAIM		| \
 			 SCHED_FLAG_DL_OVERRUN		| \
 			 SCHED_FLAG_KEEP_ALL		| \
-			 SCHED_FLAG_UTIL_CLAMP)
+			 SCHED_FLAG_UTIL_CLAMP		| \
+			 SCHED_FLAG_LATENCY_NICE)
 
 #endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbf..db1e8199e8c8 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -10,6 +10,7 @@ struct sched_param {
 
 #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
 #define SCHED_ATTR_SIZE_VER1	56	/* add: util_{min,max} */
+#define SCHED_ATTR_SIZE_VER2	60	/* add: latency_nice */
 
 /*
  * Extended scheduling parameters data structure.
@@ -98,6 +99,22 @@ struct sched_param {
  * scheduled on a CPU with no more capacity than the specified value.
  *
  * A task utilization boundary can be reset by setting the attribute to -1.
+ *
+ * Latency Tolerance Attributes
+ * ===========================
+ *
+ * A subset of sched_attr attributes allows to specify the relative latency
+ * requirements of a task with respect to the other tasks running/queued in the
+ * system.
+ *
+ * @ sched_latency_nice	task's latency_nice value
+ *
+ * The latency_nice of a task can have any value in a range of
+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
+ *
+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be
+ * taken for a task requiring a lower latency as opposed to the task with
+ * higher latency_nice.
  */
 struct sched_attr {
 	__u32 size;
@@ -120,6 +137,8 @@ struct sched_attr {
 	__u32 sched_util_min;
 	__u32 sched_util_max;
 
+	/* latency requirement hints */
+	__s32 sched_latency_nice;
 };
 
 #endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b..071deff8dbd1 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -78,6 +78,7 @@ struct task_struct init_task
 	.prio		= MAX_PRIO - 20,
 	.static_prio	= MAX_PRIO - 20,
 	.normal_prio	= MAX_PRIO - 20,
+	.latency_prio	= DEFAULT_LATENCY_PRIO,
 	.policy		= SCHED_NORMAL,
 	.cpus_ptr	= &init_task.cpus_mask,
 	.user_cpus_ptr	= NULL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5237639786b7..5d6a283a4da9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1283,6 +1283,16 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 	}
 }
 
+static void set_latency_offset(struct task_struct *p)
+{
+	long weight = sched_latency_to_weight[p->latency_prio];
+	s64 offset;
+
+	offset = weight * get_sleep_latency(false);
+	offset = div_s64(offset, NICE_LATENCY_WEIGHT_MAX);
+	p->se.latency_offset = (long)offset;
+}
+
 #ifdef CONFIG_UCLAMP_TASK
 /*
  * Serializes updates of utilization clamp values
@@ -4432,6 +4442,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.dur_avg			= 0;
 	p->se.prev_sleep_sum_runtime	= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
+	RB_CLEAR_NODE(&p->se.latency_node);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	p->se.cfs_rq			= NULL;
@@ -4684,6 +4695,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->prio = p->normal_prio = p->static_prio;
 	set_load_weight(p, false);
 
+	p->latency_prio = NICE_TO_LATENCY(0);
+	set_latency_offset(p);
+
 	/*
 	 * We don't need the reset flag anymore after the fork. It has
 	 * fulfilled its duty:
@@ -7444,6 +7458,16 @@ static void __setscheduler_params(struct task_struct *p,
 	p->rt_priority = attr->sched_priority;
 	p->normal_prio = normal_prio(p);
 	set_load_weight(p, true);
+
+}
+
+static void __setscheduler_latency(struct task_struct *p,
+				   const struct sched_attr *attr)
+{
+	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
+		p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice);
+		set_latency_offset(p);
+	}
 }
 
 /*
@@ -7586,6 +7610,13 @@ static int __sched_setscheduler(struct task_struct *p,
 			return retval;
 	}
 
+	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
+		if (attr->sched_latency_nice > MAX_LATENCY_NICE)
+			return -EINVAL;
+		if (attr->sched_latency_nice < MIN_LATENCY_NICE)
+			return -EINVAL;
+	}
+
 	if (pi)
 		cpuset_read_lock();
 
@@ -7620,6 +7651,9 @@ static int __sched_setscheduler(struct task_struct *p,
 			goto change;
 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
 			goto change;
+		if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
+		    attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio))
+			goto change;
 
 		p->sched_reset_on_fork = reset_on_fork;
 		retval = 0;
@@ -7708,6 +7742,7 @@ static int __sched_setscheduler(struct task_struct *p,
 		__setscheduler_params(p, attr);
 		__setscheduler_prio(p, newprio);
 	}
+	__setscheduler_latency(p, attr);
 	__setscheduler_uclamp(p, attr);
 
 	if (queued) {
@@ -7918,6 +7953,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
 	    size < SCHED_ATTR_SIZE_VER1)
 		return -EINVAL;
 
+	if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
+	    size < SCHED_ATTR_SIZE_VER2)
+		return -EINVAL;
 	/*
 	 * XXX: Do we want to be lenient like existing syscalls; or do we want
 	 * to be strict and return an error on out-of-bounds values?
@@ -8155,6 +8193,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	get_params(p, &kattr);
 	kattr.sched_flags &= SCHED_FLAG_ALL;
 
+	kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio);
+
 #ifdef CONFIG_UCLAMP_TASK
 	/*
 	 * This could race with another potential updater, but this is fine
@@ -11027,6 +11067,47 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
 {
 	return sched_group_set_idle(css_tg(css), idle);
 }
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+				     struct cftype *cft)
+{
+	int prio, delta, last_delta = INT_MAX;
+	s64 weight;
+
+	weight = css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX;
+	weight = div_s64(weight, get_sleep_latency(false));
+
+	/* Find the closest nice value to the current weight */
+	for (prio = 0; prio < ARRAY_SIZE(sched_latency_to_weight); prio++) {
+		delta = abs(sched_latency_to_weight[prio] - weight);
+		if (delta >= last_delta)
+			break;
+		last_delta = delta;
+	}
+
+	return LATENCY_TO_NICE(prio-1);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+				      struct cftype *cft, s64 nice)
+{
+	s64 latency_offset;
+	long weight;
+	int idx;
+
+	if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
+		return -ERANGE;
+
+	idx = NICE_TO_LATENCY(nice);
+	idx = array_index_nospec(idx, LATENCY_NICE_WIDTH);
+	weight = sched_latency_to_weight[idx];
+
+	latency_offset = weight * get_sleep_latency(false);
+	latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX);
+
+	return sched_group_set_latency(css_tg(css), latency_offset);
+}
+
 #endif
 
 static struct cftype cpu_legacy_files[] = {
@@ -11041,6 +11122,11 @@ static struct cftype cpu_legacy_files[] = {
 		.read_s64 = cpu_idle_read_s64,
 		.write_s64 = cpu_idle_write_s64,
 	},
+	{
+		.name = "latency.nice",
+		.read_s64 = cpu_latency_nice_read_s64,
+		.write_s64 = cpu_latency_nice_write_s64,
+	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
@@ -11258,6 +11344,12 @@ static struct cftype cpu_files[] = {
 		.read_s64 = cpu_idle_read_s64,
 		.write_s64 = cpu_idle_write_s64,
 	},
+	{
+		.name = "latency.nice",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_latency_nice_read_s64,
+		.write_s64 = cpu_latency_nice_write_s64,
+	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
@@ -11368,6 +11460,20 @@ const u32 sched_prio_to_wmult[40] = {
 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
+/*
+ * latency weight for wakeup preemption
+ */
+const int sched_latency_to_weight[40] = {
+ /* -20 */     -1024,     -973,     -922,     -870,     -819,
+ /* -15 */      -768,     -717,     -666,     -614,     -563,
+ /* -10 */      -512,     -461,     -410,     -358,     -307,
+ /*  -5 */      -256,     -205,     -154,     -102,      -51,
+ /*   0 */         0,       51,      102,      154,      205,
+ /*   5 */       256,      307,      358,      410,      461,
+ /*  10 */       512,      563,      614,      666,      717,
+ /*  15 */       768,      819,      870,      922,      973,
+};
+
 void call_trace_sched_update_nr_running(struct rq *rq, int count)
 {
 	trace_sched_update_nr_running_tp(rq, count);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8d64fba16cfe..177934290ec4 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1044,6 +1044,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 #endif
 	P(policy);
 	P(prio);
+	P(latency_prio);
 	if (task_has_dl_policy(p)) {
 		P(dl.runtime);
 		P(dl.deadline);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b38a1ce1be49..5ef893ce5734 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -698,7 +698,76 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 
 	return __node_2_se(last);
 }
+#endif
+
+/**************************************************************
+ * Scheduling class tree data structure manipulation methods:
+ * for latency
+ */
+
+static inline bool latency_before(struct sched_entity *a,
+				  struct sched_entity *b)
+{
+	return (s64)(a->vruntime + a->latency_offset - b->vruntime - b->latency_offset) < 0;
+}
+
+#define __latency_node_2_se(node) \
+	rb_entry((node), struct sched_entity, latency_node)
+
+static inline bool __latency_less(struct rb_node *a, const struct rb_node *b)
+{
+	return latency_before(__latency_node_2_se(a), __latency_node_2_se(b));
+}
+
+/*
+ * Enqueue an entity into the latency rb-tree:
+ */
+static void __enqueue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+
+	/* Only latency sensitive entity can be added to the list */
+	if (se->latency_offset >= 0)
+		return;
+
+	if (!RB_EMPTY_NODE(&se->latency_node))
+		return;
+
+	/*
+	 * An execution time less than sysctl_sched_min_granularity means that
+	 * the entity has been preempted by a higher sched class or an entity
+	 * with higher latency constraint.
+	 * Put it back in the list so it gets a chance to run 1st during the
+	 * next slice.
+	 */
+	if (!(flags & ENQUEUE_WAKEUP)) {
+		u64 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+
+		if (delta_exec >= sysctl_sched_min_granularity)
+			return;
+	}
+
+	rb_add_cached(&se->latency_node, &cfs_rq->latency_timeline, __latency_less);
+}
+
+static void __dequeue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (!RB_EMPTY_NODE(&se->latency_node)) {
+		rb_erase_cached(&se->latency_node, &cfs_rq->latency_timeline);
+		RB_CLEAR_NODE(&se->latency_node);
+	}
+}
+
+static struct sched_entity *__pick_first_latency(struct cfs_rq *cfs_rq)
+{
+	struct rb_node *left = rb_first_cached(&cfs_rq->latency_timeline);
+
+	if (!left)
+		return NULL;
+
+	return __latency_node_2_se(left);
+}
 
+#ifdef CONFIG_SCHED_DEBUG
 /**************************************************************
  * Scheduling class statistics methods:
  */
@@ -4672,33 +4741,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	u64 vruntime = cfs_rq->min_vruntime;
 	u64 sleep_time;
 
-	/*
-	 * The 'current' period is already promised to the current tasks,
-	 * however the extra weight of the new task will slow them down a
-	 * little, place the new task so that it fits in the slot that
-	 * stays open at the end.
-	 */
-	if (initial && sched_feat(START_DEBIT))
-		vruntime += sched_vslice(cfs_rq, se);
-
-	/* sleeps up to a single latency don't count. */
-	if (!initial) {
-		unsigned long thresh;
-
-		if (se_is_idle(se))
-			thresh = sysctl_sched_min_granularity;
-		else
-			thresh = sysctl_sched_latency;
-
+	if (!initial)
+		/* sleeps up to a single latency don't count. */
+		vruntime -= get_sleep_latency(se_is_idle(se));
+	else if (sched_feat(START_DEBIT))
 		/*
-		 * Halve their sleep time's effect, to allow
-		 * for a gentler effect of sleepers:
+		 * The 'current' period is already promised to the current tasks,
+		 * however the extra weight of the new task will slow them down a
+		 * little, place the new task so that it fits in the slot that
+		 * stays open at the end.
 		 */
-		if (sched_feat(GENTLE_FAIR_SLEEPERS))
-			thresh >>= 1;
-
-		vruntime -= thresh;
-	}
+		vruntime += sched_vslice(cfs_rq, se);
 
 	/*
 	 * Pull vruntime of the entity being placed to the base level of
@@ -4792,8 +4845,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	check_schedstat_required();
 	update_stats_enqueue_fair(cfs_rq, se, flags);
 	check_spread(cfs_rq, se);
-	if (!curr)
+	if (!curr) {
 		__enqueue_entity(cfs_rq, se);
+		__enqueue_latency(cfs_rq, se, flags);
+	}
 	se->on_rq = 1;
 
 	if (cfs_rq->nr_running == 1) {
@@ -4879,8 +4934,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	clear_buddies(cfs_rq, se);
 
-	if (se != cfs_rq->curr)
+	if (se != cfs_rq->curr) {
 		__dequeue_entity(cfs_rq, se);
+		__dequeue_latency(cfs_rq, se);
+	}
 	se->on_rq = 0;
 	account_entity_dequeue(cfs_rq, se);
 
@@ -4911,6 +4968,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_idle_cfs_rq_clock_pelt(cfs_rq);
 }
 
+static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se);
+
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -4919,7 +4978,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
 	unsigned long ideal_runtime, delta_exec;
 	struct sched_entity *se;
-	s64 delta;
+	s64 delta, offset;
 
 	/*
 	 * When many tasks blow up the sched_period; it is possible that
@@ -4950,10 +5009,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	se = __pick_first_entity(cfs_rq);
 	delta = curr->vruntime - se->vruntime;
 
-	if (delta < 0)
+	offset = wakeup_latency_gran(curr, se);
+	if (delta < offset)
 		return;
 
-	if (delta > ideal_runtime)
+	if ((delta > ideal_runtime) ||
+	    (delta > get_latency_max()))
 		resched_curr(rq_of(cfs_rq));
 }
 
@@ -4971,6 +5032,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 */
 	update_stats_wait_end_fair(cfs_rq, se);
 	__dequeue_entity(cfs_rq, se);
+	__dequeue_latency(cfs_rq, se);
 	update_load_avg(cfs_rq, se, UPDATE_TG);
 }
 
@@ -5009,7 +5071,7 @@ static struct sched_entity *
 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
 	struct sched_entity *left = __pick_first_entity(cfs_rq);
-	struct sched_entity *se;
+	struct sched_entity *latency, *se;
 
 	/*
 	 * If curr is set we have to see if its left of the leftmost entity
@@ -5051,6 +5113,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		se = cfs_rq->last;
 	}
 
+	/* Check for latency sensitive entity waiting for running */
+	latency = __pick_first_latency(cfs_rq);
+	if (latency && (latency != se) &&
+	    wakeup_preempt_entity(latency, se) < 1)
+		se = latency;
+
 	return se;
 }
 
@@ -5074,6 +5142,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		update_stats_wait_start_fair(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
+		__enqueue_latency(cfs_rq, prev, 0);
 		/* in !on_rq case, update occurred at dequeue */
 		update_load_avg(cfs_rq, prev, 0);
 	}
@@ -7735,6 +7804,23 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 }
 #endif /* CONFIG_SMP */
 
+static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se)
+{
+	long latency_offset = se->latency_offset;
+
+	/*
+	 * A negative latency offset means that the sched_entity has latency
+	 * requirement that needs to be evaluated versus other entity.
+	 * Otherwise, use the latency weight to evaluate how much scheduling
+	 * delay is acceptable by se.
+	 */
+	if ((latency_offset < 0) || (curr->latency_offset < 0))
+		latency_offset -= curr->latency_offset;
+	latency_offset = min_t(long, latency_offset, get_latency_max());
+
+	return latency_offset;
+}
+
 static unsigned long wakeup_gran(struct sched_entity *se)
 {
 	unsigned long gran = sysctl_sched_wakeup_granularity;
@@ -7773,11 +7859,24 @@ static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 {
 	s64 gran, vdiff = curr->vruntime - se->vruntime;
+	s64 offset = wakeup_latency_gran(curr, se);
 
-	if (vdiff <= 0)
+	if (vdiff < offset)
 		return -1;
 
-	gran = wakeup_gran(se);
+	gran = offset + wakeup_gran(se);
+
+	/*
+	 * At wake up, the vruntime of a task is capped to not be older than
+	 * a sched_latency period compared to min_vruntime. This prevents long
+	 * sleeping task to get unlimited credit at wakeup. Such waking up task
+	 * has to preempt current in order to not lose its share of CPU
+	 * bandwidth but wakeup_gran() can become higher than scheduling period
+	 * for low priority task. Make sure that long sleeping task will get a
+	 * chance to preempt current.
+	 */
+	gran = min_t(s64, gran, get_latency_max());
+
 	if (vdiff > gran)
 		return 1;
 
@@ -11995,6 +12094,9 @@ bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
 		delta = (s64)(sea->vruntime - seb->vruntime) +
 			(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
 
+		/* Take into account latency prio */
+		delta -= wakeup_latency_gran(sea, seb);
+
 		return delta > 0;
 	}
 #else
@@ -12265,6 +12367,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
+	cfs_rq->latency_timeline = RB_ROOT_CACHED;
 	u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
 #ifdef CONFIG_SMP
 	raw_spin_lock_init(&cfs_rq->removed.lock);
@@ -12320,6 +12423,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		goto err;
 
 	tg->shares = NICE_0_LOAD;
+	tg->latency_offset = 0;
 
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
@@ -12418,6 +12522,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	}
 
 	se->my_q = cfs_rq;
+
+	se->latency_offset = tg->latency_offset;
+
 	/* guarantee group entities always have weight */
 	update_load_set(&se->load, NICE_0_LOAD);
 	se->parent = parent;
@@ -12548,6 +12655,42 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 	return 0;
 }
 
+int sched_group_set_latency(struct task_group *tg, s64 latency)
+{
+	int i;
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	if (abs(latency) > sysctl_sched_latency)
+		return -EINVAL;
+
+	mutex_lock(&shares_mutex);
+
+	if (tg->latency_offset == latency) {
+		mutex_unlock(&shares_mutex);
+		return 0;
+	}
+
+	tg->latency_offset = latency;
+
+	for_each_possible_cpu(i) {
+		struct sched_entity *se = tg->se[i];
+		struct rq *rq = cpu_rq(i);
+		struct rq_flags rf;
+
+		rq_lock_irqsave(rq, &rf);
+
+		__dequeue_latency(se->cfs_rq, se);
+		WRITE_ONCE(se->latency_offset, latency);
+
+		rq_unlock_irqrestore(rq, &rf);
+	}
+
+	mutex_unlock(&shares_mutex);
+	return 0;
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9e8bb6278604..c47198dbf740 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -125,6 +125,11 @@ extern int sched_rr_timeslice;
  */
 #define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
 
+/* Maximum nice latency weight used to scale the latency_offset */
+
+#define NICE_LATENCY_SHIFT	(SCHED_FIXEDPOINT_SHIFT)
+#define NICE_LATENCY_WEIGHT_MAX	(1L << NICE_LATENCY_SHIFT)
+
 /*
  * Increase resolution of nice-level calculations for 64-bit architectures.
  * The extra resolution improves shares distribution and load balancing of
@@ -378,6 +383,8 @@ struct task_group {
 
 	/* A positive value indicates that this is a SCHED_IDLE group. */
 	int			idle;
+	/* latency constraint of the group. */
+	int			latency_offset;
 
 #ifdef CONFIG_SMP
 	/*
@@ -488,6 +495,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern int sched_group_set_idle(struct task_group *tg, long idle);
 
+extern int sched_group_set_latency(struct task_group *tg, s64 latency);
+
 #ifdef CONFIG_SMP
 extern void set_task_rq_fair(struct sched_entity *se,
 			     struct cfs_rq *prev, struct cfs_rq *next);
@@ -566,6 +575,7 @@ struct cfs_rq {
 #endif
 
 	struct rb_root_cached	tasks_timeline;
+	struct rb_root_cached	latency_timeline;
 
 	/*
 	 * 'curr' points to currently running entity on this cfs_rq.
@@ -2123,6 +2133,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
 
 extern const int		sched_prio_to_weight[40];
 extern const u32		sched_prio_to_wmult[40];
+extern const int		sched_latency_to_weight[40];
 
 /*
 * {de,en}queue flags:
@@ -2461,9 +2472,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
 extern const_debug unsigned int sysctl_sched_nr_migrate;
 extern const_debug unsigned int sysctl_sched_migration_cost;
 
-#ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
+#ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_idle_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern int sysctl_resched_latency_warn_ms;
@@ -2478,6 +2489,38 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_numa_balancing_hot_threshold;
 #endif
 
+static inline unsigned long get_sleep_latency(bool idle)
+{
+	unsigned long thresh;
+
+	if (idle)
+		thresh = sysctl_sched_min_granularity;
+	else
+		thresh = sysctl_sched_latency;
+
+	/*
+	 * Halve their sleep time's effect, to allow
+	 * for a gentler effect of sleepers:
+	 */
+	if (sched_feat(GENTLE_FAIR_SLEEPERS))
+		thresh >>= 1;
+
+	return thresh;
+}
+
+static inline unsigned long get_latency_max(void)
+{
+	unsigned long thresh = get_sleep_latency(false);
+
+	/*
+	 * If the waking task failed to preempt current it could to wait up to
+	 * sysctl_sched_min_granularity before preempting it during next tick.
+	 */
+	thresh -= sysctl_sched_min_granularity;
+
+	return thresh;
+}
+
 #ifdef CONFIG_SCHED_HRTICK
 
 /*
diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
index 3bac0a8ceab2..b2e932c25be6 100644
--- a/tools/include/uapi/linux/sched.h
+++ b/tools/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
 #define SCHED_FLAG_KEEP_PARAMS		0x10
 #define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
 #define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
+#define SCHED_FLAG_LATENCY_NICE		0x80
 
 #define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
 				 SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
 			 SCHED_FLAG_RECLAIM		| \
 			 SCHED_FLAG_DL_OVERRUN		| \
 			 SCHED_FLAG_KEEP_ALL		| \
-			 SCHED_FLAG_UTIL_CLAMP)
+			 SCHED_FLAG_UTIL_CLAMP		| \
+			 SCHED_FLAG_LATENCY_NICE)
 
 #endif /* _UAPI_LINUX_SCHED_H */
--
2.39.2