2302 lines
68 KiB
Diff
2302 lines
68 KiB
Diff
From 40a2f9f3e7e56936385c5a97957cd43fbb85fd32 Mon Sep 17 00:00:00 2001
|
|
From: Peter Jung <admin@ptr1337.dev>
|
|
Date: Sun, 9 Apr 2023 21:35:07 +0200
|
|
Subject: [PATCH] EEVDF
|
|
|
|
Signed-off-by: Peter Jung <admin@ptr1337.dev>
|
|
---
|
|
Documentation/admin-guide/cgroup-v2.rst | 10 +
|
|
include/linux/rbtree_augmented.h | 26 +
|
|
include/linux/sched.h | 9 +-
|
|
include/uapi/linux/sched.h | 4 +-
|
|
include/uapi/linux/sched/types.h | 19 +
|
|
init/init_task.c | 3 +-
|
|
kernel/sched/core.c | 67 +-
|
|
kernel/sched/debug.c | 50 +-
|
|
kernel/sched/fair.c | 1171 ++++++++++-------------
|
|
kernel/sched/features.h | 28 +-
|
|
kernel/sched/sched.h | 23 +-
|
|
tools/include/uapi/linux/sched.h | 4 +-
|
|
12 files changed, 697 insertions(+), 717 deletions(-)
|
|
|
|
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
|
|
index f67c0829350b..a39dfda3d032 100644
|
|
--- a/Documentation/admin-guide/cgroup-v2.rst
|
|
+++ b/Documentation/admin-guide/cgroup-v2.rst
|
|
@@ -1121,6 +1121,16 @@ All time durations are in microseconds.
|
|
values similar to the sched_setattr(2). This maximum utilization
|
|
value is used to clamp the task specific maximum utilization clamp.
|
|
|
|
+ cpu.latency.nice
|
|
+ A read-write single value file which exists on non-root
|
|
+ cgroups. The default is "0".
|
|
+
|
|
+ The nice value is in the range [-20, 19].
|
|
+
|
|
+ This interface file allows reading and setting latency using the
|
|
+ same values used by sched_setattr(2). The latency_nice of a group is
|
|
+ used to limit the impact of the latency_nice of a task outside the
|
|
+ group.
|
|
|
|
|
|
Memory
|
|
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
|
|
index d1c53e9d8c75..a78e692a9ff5 100644
|
|
--- a/include/linux/rbtree_augmented.h
|
|
+++ b/include/linux/rbtree_augmented.h
|
|
@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node,
|
|
rb_insert_augmented(node, &root->rb_root, augment);
|
|
}
|
|
|
|
+static __always_inline struct rb_node *
|
|
+rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
|
|
+ bool (*less)(struct rb_node *, const struct rb_node *),
|
|
+ const struct rb_augment_callbacks *augment)
|
|
+{
|
|
+ struct rb_node **link = &tree->rb_root.rb_node;
|
|
+ struct rb_node *parent = NULL;
|
|
+ bool leftmost = true;
|
|
+
|
|
+ while (*link) {
|
|
+ parent = *link;
|
|
+ if (less(node, parent)) {
|
|
+ link = &parent->rb_left;
|
|
+ } else {
|
|
+ link = &parent->rb_right;
|
|
+ leftmost = false;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ rb_link_node(node, parent, link);
|
|
+ augment->propagate(parent, NULL); /* suboptimal */
|
|
+ rb_insert_augmented_cached(node, tree, leftmost, augment);
|
|
+
|
|
+ return leftmost ? node : NULL;
|
|
+}
|
|
+
|
|
/*
|
|
* Template for declaring augmented rbtree callbacks (generic case)
|
|
*
|
|
diff --git a/include/linux/sched.h b/include/linux/sched.h
|
|
index 6d398b337b0d..6a719374f688 100644
|
|
--- a/include/linux/sched.h
|
|
+++ b/include/linux/sched.h
|
|
@@ -548,6 +548,9 @@ struct sched_entity {
|
|
/* For load-balancing: */
|
|
struct load_weight load;
|
|
struct rb_node run_node;
|
|
+ u64 deadline;
|
|
+ u64 min_deadline;
|
|
+
|
|
struct list_head group_node;
|
|
unsigned int on_rq;
|
|
|
|
@@ -555,11 +558,10 @@ struct sched_entity {
|
|
u64 sum_exec_runtime;
|
|
u64 vruntime;
|
|
u64 prev_sum_exec_runtime;
|
|
+ s64 vlag;
|
|
+ u64 slice;
|
|
|
|
u64 nr_migrations;
|
|
- u64 prev_sleep_sum_runtime;
|
|
- /* average duration of a task */
|
|
- u64 dur_avg;
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
int depth;
|
|
@@ -787,6 +789,7 @@ struct task_struct {
|
|
int static_prio;
|
|
int normal_prio;
|
|
unsigned int rt_priority;
|
|
+ int latency_prio;
|
|
|
|
struct sched_entity se;
|
|
struct sched_rt_entity rt;
|
|
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
|
|
index 3bac0a8ceab2..b2e932c25be6 100644
|
|
--- a/include/uapi/linux/sched.h
|
|
+++ b/include/uapi/linux/sched.h
|
|
@@ -132,6 +132,7 @@ struct clone_args {
|
|
#define SCHED_FLAG_KEEP_PARAMS 0x10
|
|
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
|
|
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
|
|
+#define SCHED_FLAG_LATENCY_NICE 0x80
|
|
|
|
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
|
|
SCHED_FLAG_KEEP_PARAMS)
|
|
@@ -143,6 +144,7 @@ struct clone_args {
|
|
SCHED_FLAG_RECLAIM | \
|
|
SCHED_FLAG_DL_OVERRUN | \
|
|
SCHED_FLAG_KEEP_ALL | \
|
|
- SCHED_FLAG_UTIL_CLAMP)
|
|
+ SCHED_FLAG_UTIL_CLAMP | \
|
|
+ SCHED_FLAG_LATENCY_NICE)
|
|
|
|
#endif /* _UAPI_LINUX_SCHED_H */
|
|
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
|
|
index f2c4589d4dbf..db1e8199e8c8 100644
|
|
--- a/include/uapi/linux/sched/types.h
|
|
+++ b/include/uapi/linux/sched/types.h
|
|
@@ -10,6 +10,7 @@ struct sched_param {
|
|
|
|
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
|
|
#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
|
|
+#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */
|
|
|
|
/*
|
|
* Extended scheduling parameters data structure.
|
|
@@ -98,6 +99,22 @@ struct sched_param {
|
|
* scheduled on a CPU with no more capacity than the specified value.
|
|
*
|
|
* A task utilization boundary can be reset by setting the attribute to -1.
|
|
+ *
|
|
+ * Latency Tolerance Attributes
|
|
+ * ===========================
|
|
+ *
|
|
+ * A subset of sched_attr attributes allows to specify the relative latency
|
|
+ * requirements of a task with respect to the other tasks running/queued in the
|
|
+ * system.
|
|
+ *
|
|
+ * @ sched_latency_nice task's latency_nice value
|
|
+ *
|
|
+ * The latency_nice of a task can have any value in a range of
|
|
+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
|
|
+ *
|
|
+ * A task with latency_nice with the value of MIN_LATENCY_NICE can be
|
|
+ * taken for a task requiring a lower latency as opposed to the task with
|
|
+ * higher latency_nice.
|
|
*/
|
|
struct sched_attr {
|
|
__u32 size;
|
|
@@ -120,6 +137,8 @@ struct sched_attr {
|
|
__u32 sched_util_min;
|
|
__u32 sched_util_max;
|
|
|
|
+ /* latency requirement hints */
|
|
+ __s32 sched_latency_nice;
|
|
};
|
|
|
|
#endif /* _UAPI_LINUX_SCHED_TYPES_H */
|
|
diff --git a/init/init_task.c b/init/init_task.c
|
|
index ff6c4b9bfe6b..511cbcf3510d 100644
|
|
--- a/init/init_task.c
|
|
+++ b/init/init_task.c
|
|
@@ -78,6 +78,7 @@ struct task_struct init_task
|
|
.prio = MAX_PRIO - 20,
|
|
.static_prio = MAX_PRIO - 20,
|
|
.normal_prio = MAX_PRIO - 20,
|
|
+ .latency_prio = DEFAULT_PRIO,
|
|
.policy = SCHED_NORMAL,
|
|
.cpus_ptr = &init_task.cpus_mask,
|
|
.user_cpus_ptr = NULL,
|
|
@@ -89,7 +90,7 @@ struct task_struct init_task
|
|
.fn = do_no_restart_syscall,
|
|
},
|
|
.se = {
|
|
- .group_node = LIST_HEAD_INIT(init_task.se.group_node),
|
|
+ .group_node = LIST_HEAD_INIT(init_task.se.group_node),
|
|
},
|
|
.rt = {
|
|
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
|
|
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
|
index 17bb9637f314..fbc08605b068 100644
|
|
--- a/kernel/sched/core.c
|
|
+++ b/kernel/sched/core.c
|
|
@@ -1285,6 +1285,12 @@ static void set_load_weight(struct task_struct *p, bool update_load)
|
|
}
|
|
}
|
|
|
|
+static inline void set_latency_prio(struct task_struct *p, int prio)
|
|
+{
|
|
+ p->latency_prio = prio;
|
|
+ set_latency_fair(&p->se, prio - MAX_RT_PRIO);
|
|
+}
|
|
+
|
|
#ifdef CONFIG_UCLAMP_TASK
|
|
/*
|
|
* Serializes updates of utilization clamp values
|
|
@@ -4434,10 +4440,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|
p->se.prev_sum_exec_runtime = 0;
|
|
p->se.nr_migrations = 0;
|
|
p->se.vruntime = 0;
|
|
- p->se.dur_avg = 0;
|
|
- p->se.prev_sleep_sum_runtime = 0;
|
|
+ p->se.vlag = 0;
|
|
INIT_LIST_HEAD(&p->se.group_node);
|
|
|
|
+ set_latency_prio(p, p->latency_prio);
|
|
+
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
p->se.cfs_rq = NULL;
|
|
#endif
|
|
@@ -4688,6 +4695,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|
|
|
p->prio = p->normal_prio = p->static_prio;
|
|
set_load_weight(p, false);
|
|
+ set_latency_prio(p, NICE_TO_PRIO(0));
|
|
|
|
/*
|
|
* We don't need the reset flag anymore after the fork. It has
|
|
@@ -7433,7 +7441,7 @@ static struct task_struct *find_process_by_pid(pid_t pid)
|
|
#define SETPARAM_POLICY -1
|
|
|
|
static void __setscheduler_params(struct task_struct *p,
|
|
- const struct sched_attr *attr)
|
|
+ const struct sched_attr *attr)
|
|
{
|
|
int policy = attr->sched_policy;
|
|
|
|
@@ -7457,6 +7465,13 @@ static void __setscheduler_params(struct task_struct *p,
|
|
set_load_weight(p, true);
|
|
}
|
|
|
|
+static void __setscheduler_latency(struct task_struct *p,
|
|
+ const struct sched_attr *attr)
|
|
+{
|
|
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
|
|
+ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice));
|
|
+}
|
|
+
|
|
/*
|
|
* Check the target process has a UID that matches the current process's:
|
|
*/
|
|
@@ -7597,6 +7612,13 @@ static int __sched_setscheduler(struct task_struct *p,
|
|
return retval;
|
|
}
|
|
|
|
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
|
|
+ if (attr->sched_latency_nice > MAX_NICE)
|
|
+ return -EINVAL;
|
|
+ if (attr->sched_latency_nice < MIN_NICE)
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
if (pi)
|
|
cpuset_read_lock();
|
|
|
|
@@ -7631,6 +7653,9 @@ static int __sched_setscheduler(struct task_struct *p,
|
|
goto change;
|
|
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
|
|
goto change;
|
|
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
|
|
+ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio))
|
|
+ goto change;
|
|
|
|
p->sched_reset_on_fork = reset_on_fork;
|
|
retval = 0;
|
|
@@ -7719,6 +7744,7 @@ static int __sched_setscheduler(struct task_struct *p,
|
|
__setscheduler_params(p, attr);
|
|
__setscheduler_prio(p, newprio);
|
|
}
|
|
+ __setscheduler_latency(p, attr);
|
|
__setscheduler_uclamp(p, attr);
|
|
|
|
if (queued) {
|
|
@@ -7929,6 +7955,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
|
|
size < SCHED_ATTR_SIZE_VER1)
|
|
return -EINVAL;
|
|
|
|
+ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
|
|
+ size < SCHED_ATTR_SIZE_VER2)
|
|
+ return -EINVAL;
|
|
/*
|
|
* XXX: Do we want to be lenient like existing syscalls; or do we want
|
|
* to be strict and return an error on out-of-bounds values?
|
|
@@ -8166,6 +8195,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
|
|
get_params(p, &kattr);
|
|
kattr.sched_flags &= SCHED_FLAG_ALL;
|
|
|
|
+ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio);
|
|
+
|
|
#ifdef CONFIG_UCLAMP_TASK
|
|
/*
|
|
* This could race with another potential updater, but this is fine
|
|
@@ -11038,6 +11069,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
|
|
{
|
|
return sched_group_set_idle(css_tg(css), idle);
|
|
}
|
|
+
|
|
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
|
|
+ struct cftype *cft)
|
|
+{
|
|
+ return PRIO_TO_NICE(css_tg(css)->latency_prio);
|
|
+}
|
|
+
|
|
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
|
|
+ struct cftype *cft, s64 nice)
|
|
+{
|
|
+ int prio;
|
|
+
|
|
+ if (nice < MIN_NICE || nice > MAX_NICE)
|
|
+ return -ERANGE;
|
|
+
|
|
+ prio = NICE_TO_PRIO(nice);
|
|
+
|
|
+ return sched_group_set_latency(css_tg(css), prio);
|
|
+}
|
|
#endif
|
|
|
|
static struct cftype cpu_legacy_files[] = {
|
|
@@ -11052,6 +11102,11 @@ static struct cftype cpu_legacy_files[] = {
|
|
.read_s64 = cpu_idle_read_s64,
|
|
.write_s64 = cpu_idle_write_s64,
|
|
},
|
|
+ {
|
|
+ .name = "latency.nice",
|
|
+ .read_s64 = cpu_latency_nice_read_s64,
|
|
+ .write_s64 = cpu_latency_nice_write_s64,
|
|
+ },
|
|
#endif
|
|
#ifdef CONFIG_CFS_BANDWIDTH
|
|
{
|
|
@@ -11269,6 +11324,12 @@ static struct cftype cpu_files[] = {
|
|
.read_s64 = cpu_idle_read_s64,
|
|
.write_s64 = cpu_idle_write_s64,
|
|
},
|
|
+ {
|
|
+ .name = "latency.nice",
|
|
+ .flags = CFTYPE_NOT_ON_ROOT,
|
|
+ .read_s64 = cpu_latency_nice_read_s64,
|
|
+ .write_s64 = cpu_latency_nice_write_s64,
|
|
+ },
|
|
#endif
|
|
#ifdef CONFIG_CFS_BANDWIDTH
|
|
{
|
|
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
|
|
index 8d64fba16cfe..e0d10ac21016 100644
|
|
--- a/kernel/sched/debug.c
|
|
+++ b/kernel/sched/debug.c
|
|
@@ -308,10 +308,7 @@ static __init int sched_init_debug(void)
|
|
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
|
|
#endif
|
|
|
|
- debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
|
|
- debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
|
|
- debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
|
|
- debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
|
|
+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
|
|
|
|
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
|
|
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
|
|
@@ -535,9 +532,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
|
|
else
|
|
SEQ_printf(m, " %c", task_state_to_char(p));
|
|
|
|
- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
|
|
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
|
|
p->comm, task_pid_nr(p),
|
|
SPLIT_NS(p->se.vruntime),
|
|
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
|
|
+ SPLIT_NS(p->se.deadline),
|
|
+ SPLIT_NS(p->se.slice),
|
|
+ SPLIT_NS(p->se.sum_exec_runtime),
|
|
(long long)(p->nvcsw + p->nivcsw),
|
|
p->prio);
|
|
|
|
@@ -580,10 +581,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
|
|
|
|
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
|
{
|
|
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
|
|
- spread, rq0_min_vruntime, spread0;
|
|
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
|
|
+ struct sched_entity *last, *first;
|
|
struct rq *rq = cpu_rq(cpu);
|
|
- struct sched_entity *last;
|
|
unsigned long flags;
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
@@ -597,26 +597,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
|
SPLIT_NS(cfs_rq->exec_clock));
|
|
|
|
raw_spin_rq_lock_irqsave(rq, flags);
|
|
- if (rb_first_cached(&cfs_rq->tasks_timeline))
|
|
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
|
|
+ first = __pick_first_entity(cfs_rq);
|
|
+ if (first)
|
|
+ left_vruntime = first->vruntime;
|
|
last = __pick_last_entity(cfs_rq);
|
|
if (last)
|
|
- max_vruntime = last->vruntime;
|
|
+ right_vruntime = last->vruntime;
|
|
min_vruntime = cfs_rq->min_vruntime;
|
|
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
|
|
raw_spin_rq_unlock_irqrestore(rq, flags);
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
|
|
- SPLIT_NS(MIN_vruntime));
|
|
+
|
|
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
|
|
+ SPLIT_NS(left_vruntime));
|
|
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
|
|
SPLIT_NS(min_vruntime));
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
|
|
- SPLIT_NS(max_vruntime));
|
|
- spread = max_vruntime - MIN_vruntime;
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
|
|
- SPLIT_NS(spread));
|
|
- spread0 = min_vruntime - rq0_min_vruntime;
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
|
|
- SPLIT_NS(spread0));
|
|
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
|
|
+ SPLIT_NS(avg_vruntime(cfs_rq)));
|
|
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
|
|
+ SPLIT_NS(right_vruntime));
|
|
+ spread = right_vruntime - left_vruntime;
|
|
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
|
|
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
|
|
cfs_rq->nr_spread_over);
|
|
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
|
|
@@ -817,10 +816,7 @@ static void sched_debug_header(struct seq_file *m)
|
|
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
|
|
#define PN(x) \
|
|
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
|
|
- PN(sysctl_sched_latency);
|
|
- PN(sysctl_sched_min_granularity);
|
|
- PN(sysctl_sched_idle_min_granularity);
|
|
- PN(sysctl_sched_wakeup_granularity);
|
|
+ PN(sysctl_sched_base_slice);
|
|
P(sysctl_sched_child_runs_first);
|
|
P(sysctl_sched_features);
|
|
#undef PN
|
|
@@ -1024,7 +1020,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
|
|
__PS("nr_involuntary_switches", p->nivcsw);
|
|
|
|
P(se.load.weight);
|
|
- P(se.dur_avg);
|
|
#ifdef CONFIG_SMP
|
|
P(se.avg.load_sum);
|
|
P(se.avg.runnable_sum);
|
|
@@ -1044,6 +1039,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
|
|
#endif
|
|
P(policy);
|
|
P(prio);
|
|
+ P(latency_prio);
|
|
if (task_has_dl_policy(p)) {
|
|
P(dl.runtime);
|
|
P(dl.deadline);
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 115be8a965f2..76bd212ee5bd 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -47,6 +47,7 @@
|
|
#include <linux/psi.h>
|
|
#include <linux/ratelimit.h>
|
|
#include <linux/task_work.h>
|
|
+#include <linux/rbtree_augmented.h>
|
|
|
|
#include <asm/switch_to.h>
|
|
|
|
@@ -56,26 +57,6 @@
|
|
#include "stats.h"
|
|
#include "autogroup.h"
|
|
|
|
-/*
|
|
- * Targeted preemption latency for CPU-bound tasks:
|
|
- *
|
|
- * NOTE: this latency value is not the same as the concept of
|
|
- * 'timeslice length' - timeslices in CFS are of variable length
|
|
- * and have no persistent notion like in traditional, time-slice
|
|
- * based scheduling concepts.
|
|
- *
|
|
- * (to see the precise effective timeslice length of your workload,
|
|
- * run vmstat and monitor the context-switches (cs) field)
|
|
- *
|
|
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
|
|
- */
|
|
-#ifdef CONFIG_CACHY
|
|
-unsigned int sysctl_sched_latency = 3000000ULL;
|
|
-static unsigned int normalized_sysctl_sched_latency = 3000000ULL;
|
|
-#else
|
|
-unsigned int sysctl_sched_latency = 6000000ULL;
|
|
-static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
|
|
-#endif
|
|
/*
|
|
* The initial- and re-scaling of tunables is configurable
|
|
*
|
|
@@ -94,26 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
|
|
*
|
|
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
|
|
*/
|
|
-#ifdef CONFIG_CACHY
|
|
-unsigned int sysctl_sched_min_granularity = 400000ULL;
|
|
-static unsigned int normalized_sysctl_sched_min_granularity = 400000ULL;
|
|
-#else
|
|
-unsigned int sysctl_sched_min_granularity = 750000ULL;
|
|
-static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
|
|
-#endif
|
|
-
|
|
-/*
|
|
- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
|
|
- * Applies only when SCHED_IDLE tasks compete with normal tasks.
|
|
- *
|
|
- * (default: 0.75 msec)
|
|
- */
|
|
-unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
|
|
-
|
|
-/*
|
|
- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
|
|
- */
|
|
-static unsigned int sched_nr_latency = 8;
|
|
+unsigned int sysctl_sched_base_slice = 750000ULL;
|
|
+static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
|
|
|
|
/*
|
|
* After fork, child runs first. If set to 0 (default) then
|
|
@@ -121,23 +84,6 @@ static unsigned int sched_nr_latency = 8;
|
|
*/
|
|
unsigned int sysctl_sched_child_runs_first __read_mostly;
|
|
|
|
-/*
|
|
- * SCHED_OTHER wake-up granularity.
|
|
- *
|
|
- * This option delays the preemption effects of decoupled workloads
|
|
- * and reduces their over-scheduling. Synchronous workloads will still
|
|
- * have immediate wakeup/sleep latencies.
|
|
- *
|
|
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
|
|
- */
|
|
-#ifdef CONFIG_CACHY
|
|
-unsigned int sysctl_sched_wakeup_granularity = 500000UL;
|
|
-static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL;
|
|
-#else
|
|
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
|
|
-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
|
|
-#endif
|
|
-
|
|
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
|
|
|
|
int sched_thermal_decay_shift;
|
|
@@ -189,12 +135,8 @@ int __weak arch_asym_cpu_priority(int cpu)
|
|
*
|
|
* (default: 5 msec, units: microseconds)
|
|
*/
|
|
-#ifdef CONFIG_CACHY
|
|
-static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL;
|
|
-#else
|
|
static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
|
|
#endif
|
|
-#endif
|
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
|
|
@@ -295,9 +237,7 @@ static void update_sysctl(void)
|
|
|
|
#define SET_SYSCTL(name) \
|
|
(sysctl_##name = (factor) * normalized_sysctl_##name)
|
|
- SET_SYSCTL(sched_min_granularity);
|
|
- SET_SYSCTL(sched_latency);
|
|
- SET_SYSCTL(sched_wakeup_granularity);
|
|
+ SET_SYSCTL(sched_base_slice);
|
|
#undef SET_SYSCTL
|
|
}
|
|
|
|
@@ -365,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
|
|
return mul_u64_u32_shr(delta_exec, fact, shift);
|
|
}
|
|
|
|
+/*
|
|
+ * delta /= w
|
|
+ */
|
|
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
|
|
+{
|
|
+ if (unlikely(se->load.weight != NICE_0_LOAD))
|
|
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
|
|
+
|
|
+ return delta;
|
|
+}
|
|
|
|
const struct sched_class fair_sched_class;
|
|
|
|
@@ -619,35 +569,203 @@ static inline bool entity_before(const struct sched_entity *a,
|
|
return (s64)(a->vruntime - b->vruntime) < 0;
|
|
}
|
|
|
|
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ return (s64)(se->vruntime - cfs_rq->min_vruntime);
|
|
+}
|
|
+
|
|
#define __node_2_se(node) \
|
|
rb_entry((node), struct sched_entity, run_node)
|
|
|
|
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
|
|
+/*
|
|
+ * Compute virtual time from the per-task service numbers:
|
|
+ *
|
|
+ * Fair schedulers conserve lag: \Sum lag_i = 0
|
|
+ *
|
|
+ * lag_i = S - s_i = w_i * (V - v_i)
|
|
+ *
|
|
+ * \Sum lag_i = 0 -> \Sum w_i * (V - v_i) = V * \Sum w_i - \Sum w_i * v_i = 0
|
|
+ *
|
|
+ * From which we solve V:
|
|
+ *
|
|
+ * \Sum v_i * w_i
|
|
+ * V = --------------
|
|
+ * \Sum w_i
|
|
+ *
|
|
+ * However, since v_i is u64, and the multiplication could easily overflow
|
|
+ * transform it into a relative form that uses smaller quantities:
|
|
+ *
|
|
+ * Substitute: v_i == (v_i - v) + v
|
|
+ *
|
|
+ * \Sum ((v_i - v) + v) * w_i \Sum (v_i - v) * w_i
|
|
+ * V = -------------------------- = -------------------- + v
|
|
+ * \Sum w_i \Sum w_i
|
|
+ *
|
|
+ * min_vruntime = v
|
|
+ * avg_vruntime = \Sum (v_i - v) * w_i
|
|
+ * cfs_rq->load = \Sum w_i
|
|
+ *
|
|
+ * Since min_vruntime is a monotonic increasing variable that closely tracks
|
|
+ * the per-task service, these deltas: (v_i - v), will be in the order of the
|
|
+ * maximal (virtual) lag induced in the system due to quantisation.
|
|
+ */
|
|
+static void
|
|
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ unsigned long weight = scale_load_down(se->load.weight);
|
|
+ s64 key = entity_key(cfs_rq, se);
|
|
+
|
|
+ cfs_rq->avg_vruntime += key * weight;
|
|
+ cfs_rq->avg_slice += se->slice * weight;
|
|
+ cfs_rq->avg_load += weight;
|
|
+}
|
|
+
|
|
+static void
|
|
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ unsigned long weight = scale_load_down(se->load.weight);
|
|
+ s64 key = entity_key(cfs_rq, se);
|
|
+
|
|
+ cfs_rq->avg_vruntime -= key * weight;
|
|
+ cfs_rq->avg_slice -= se->slice * weight;
|
|
+ cfs_rq->avg_load -= weight;
|
|
+}
|
|
+
|
|
+static inline
|
|
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
|
|
+{
|
|
+ /*
|
|
+ * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
|
|
+ */
|
|
+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
|
|
+}
|
|
+
|
|
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
|
|
{
|
|
struct sched_entity *curr = cfs_rq->curr;
|
|
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
|
|
+ s64 avg = cfs_rq->avg_vruntime;
|
|
+ long load = cfs_rq->avg_load;
|
|
|
|
- u64 vruntime = cfs_rq->min_vruntime;
|
|
+ if (curr && curr->on_rq) {
|
|
+ unsigned long weight = scale_load_down(curr->load.weight);
|
|
|
|
- if (curr) {
|
|
- if (curr->on_rq)
|
|
- vruntime = curr->vruntime;
|
|
- else
|
|
- curr = NULL;
|
|
+ avg += entity_key(cfs_rq, curr) * weight;
|
|
+ load += weight;
|
|
}
|
|
|
|
- if (leftmost) { /* non-empty tree */
|
|
- struct sched_entity *se = __node_2_se(leftmost);
|
|
+ if (load)
|
|
+ avg = div_s64(avg, load);
|
|
|
|
- if (!curr)
|
|
- vruntime = se->vruntime;
|
|
- else
|
|
- vruntime = min_vruntime(vruntime, se->vruntime);
|
|
+ return cfs_rq->min_vruntime + avg;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * lag_i = S - s_i = w_i * (V - v_i)
|
|
+ *
|
|
+ * However, since V is approximated by the weighted average of all entities it
|
|
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
|
|
+ * and end up with a larger lag than we started with.
|
|
+ *
|
|
+ * Limit this to double the slice length, with a minimum of TICK_NSEC
|
|
+ * since that is the timing granularity.
|
|
+ *
|
|
+ * EEVDF gives the following limit for a steady state system:
|
|
+ *
|
|
+ * -r_max < lag < max(r_max, q)
|
|
+ *
|
|
+ * XXX could add max_slice to the augmented data to track this.
|
|
+ */
|
|
+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ s64 lag, limit;
|
|
+
|
|
+ SCHED_WARN_ON(!se->on_rq);
|
|
+ lag = avg_vruntime(cfs_rq) - se->vruntime;
|
|
+
|
|
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
|
|
+ se->vlag = clamp(lag, -limit, limit);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Entity is eligible once it received less service than it ought to have,
|
|
+ * i.e. lag >= 0.
|
|
+ *
|
|
+ * lag_i = S - s_i = w_i*(V - v_i)
|
|
+ *
|
|
+ * lag_i >= 0 -> V >= v_i
|
|
+ *
|
|
+ * \Sum (v_i - v)*w_i
|
|
+ * V = ------------------ + v
|
|
+ * \Sum w_i
|
|
+ *
|
|
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
|
|
+ *
|
|
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
|
|
+ * to the loss in precision caused by the division.
|
|
+ */
|
|
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ struct sched_entity *curr = cfs_rq->curr;
|
|
+ s64 avg = cfs_rq->avg_vruntime;
|
|
+ long load = cfs_rq->avg_load;
|
|
+
|
|
+ if (curr && curr->on_rq) {
|
|
+ unsigned long weight = scale_load_down(curr->load.weight);
|
|
+
|
|
+ avg += entity_key(cfs_rq, curr) * weight;
|
|
+ load += weight;
|
|
}
|
|
|
|
- /* ensure we never gain time by being placed backwards. */
|
|
- u64_u32_store(cfs_rq->min_vruntime,
|
|
- max_vruntime(cfs_rq->min_vruntime, vruntime));
|
|
+ return avg >= entity_key(cfs_rq, se) * load;
|
|
+}
|
|
+
|
|
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
|
|
+{
|
|
+ u64 min_vruntime = cfs_rq->min_vruntime;
|
|
+ /*
|
|
+ * open coded max_vruntime() to allow updating avg_vruntime
|
|
+ */
|
|
+ s64 delta = (s64)(vruntime - min_vruntime);
|
|
+ if (delta > 0) {
|
|
+ avg_vruntime_update(cfs_rq, delta);
|
|
+ min_vruntime = vruntime;
|
|
+ }
|
|
+ return min_vruntime;
|
|
+}
|
|
+
|
|
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
|
|
+{
|
|
+ if (sched_feat(MINIMAL_VA)) {
|
|
+ u64 vruntime = avg_vruntime(cfs_rq);
|
|
+ s64 delta = (s64)(vruntime - cfs_rq->min_vruntime);
|
|
+
|
|
+ avg_vruntime_update(cfs_rq, delta);
|
|
+
|
|
+ u64_u32_store(cfs_rq->min_vruntime, vruntime);
|
|
+ } else {
|
|
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
|
|
+ struct sched_entity *curr = cfs_rq->curr;
|
|
+
|
|
+ u64 vruntime = cfs_rq->min_vruntime;
|
|
+
|
|
+ if (curr) {
|
|
+ if (curr->on_rq)
|
|
+ vruntime = curr->vruntime;
|
|
+ else
|
|
+ curr = NULL;
|
|
+ }
|
|
+
|
|
+ if (se) {
|
|
+ if (!curr)
|
|
+ vruntime = se->vruntime;
|
|
+ else
|
|
+ vruntime = min_vruntime(vruntime, se->vruntime);
|
|
+ }
|
|
+
|
|
+ /* ensure we never gain time by being placed backwards. */
|
|
+ u64_u32_store(cfs_rq->min_vruntime,
|
|
+ __update_min_vruntime(cfs_rq, vruntime));
|
|
+ }
|
|
}
|
|
|
|
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
|
|
@@ -655,17 +773,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
|
|
return entity_before(__node_2_se(a), __node_2_se(b));
|
|
}
|
|
|
|
+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
|
|
+
|
|
+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
|
|
+{
|
|
+ if (node) {
|
|
+ struct sched_entity *rse = __node_2_se(node);
|
|
+ if (deadline_gt(min_deadline, se, rse))
|
|
+ se->min_deadline = rse->min_deadline;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
|
|
+ */
|
|
+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
|
|
+{
|
|
+ u64 old_min_deadline = se->min_deadline;
|
|
+ struct rb_node *node = &se->run_node;
|
|
+
|
|
+ se->min_deadline = se->deadline;
|
|
+ __update_min_deadline(se, node->rb_right);
|
|
+ __update_min_deadline(se, node->rb_left);
|
|
+
|
|
+ return se->min_deadline == old_min_deadline;
|
|
+}
|
|
+
|
|
+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
|
|
+ run_node, min_deadline, min_deadline_update);
|
|
+
|
|
/*
|
|
* Enqueue an entity into the rb-tree:
|
|
*/
|
|
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
|
|
+ avg_vruntime_add(cfs_rq, se);
|
|
+ se->min_deadline = se->deadline;
|
|
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
|
|
+ __entity_less, &min_deadline_cb);
|
|
}
|
|
|
|
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
|
|
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
|
|
+ &min_deadline_cb);
|
|
+ avg_vruntime_sub(cfs_rq, se);
|
|
}
|
|
|
|
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
|
|
@@ -678,14 +830,81 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
|
|
return __node_2_se(left);
|
|
}
|
|
|
|
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
|
|
+/*
|
|
+ * Earliest Eligible Virtual Deadline First
|
|
+ *
|
|
+ * In order to provide latency guarantees for different request sizes
|
|
+ * EEVDF selects the best runnable task from two criteria:
|
|
+ *
|
|
+ * 1) the task must be eligible (must be owed service)
|
|
+ *
|
|
+ * 2) from those tasks that meet 1), we select the one
|
|
+ * with the earliest virtual deadline.
|
|
+ *
|
|
+ * We can do this in O(log n) time due to an augmented RB-tree. The
|
|
+ * tree keeps the entries sorted on service, but also functions as a
|
|
+ * heap based on the deadline by keeping:
|
|
+ *
|
|
+ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
|
|
+ *
|
|
+ * Which allows an EDF like search on (sub)trees.
|
|
+ */
|
|
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
|
|
{
|
|
- struct rb_node *next = rb_next(&se->run_node);
|
|
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
|
|
+ struct sched_entity *curr = cfs_rq->curr;
|
|
+ struct sched_entity *best = NULL;
|
|
|
|
- if (!next)
|
|
- return NULL;
|
|
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
|
|
+ curr = NULL;
|
|
+
|
|
+ while (node) {
|
|
+ struct sched_entity *se = __node_2_se(node);
|
|
+
|
|
+ /*
|
|
+ * If this entity is not eligible, try the left subtree.
|
|
+ */
|
|
+ if (!entity_eligible(cfs_rq, se)) {
|
|
+ node = node->rb_left;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If this entity has an earlier deadline than the previous
|
|
+ * best, take this one. If it also has the earliest deadline
|
|
+ * of its subtree, we're done.
|
|
+ */
|
|
+ if (!best || deadline_gt(deadline, best, se)) {
|
|
+ best = se;
|
|
+ if (best->deadline == best->min_deadline)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+		 * If the earliest deadline in this subtree is in the fully
|
|
+ * eligible left half of our space, go there.
|
|
+ */
|
|
+ if (node->rb_left &&
|
|
+ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
|
|
+ node = node->rb_left;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ node = node->rb_right;
|
|
+ }
|
|
+
|
|
+ if (!best || (curr && deadline_gt(deadline, best, curr)))
|
|
+ best = curr;
|
|
+
|
|
+ if (unlikely(!best)) {
|
|
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
|
|
+ if (left) {
|
|
+ pr_err("EEVDF scheduling fail, picking leftmost\n");
|
|
+ return left;
|
|
+ }
|
|
+ }
|
|
|
|
- return __node_2_se(next);
|
|
+ return best;
|
|
}
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
@@ -707,104 +926,43 @@ int sched_update_scaling(void)
|
|
{
|
|
unsigned int factor = get_update_sysctl_factor();
|
|
|
|
- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
|
|
- sysctl_sched_min_granularity);
|
|
-
|
|
#define WRT_SYSCTL(name) \
|
|
(normalized_sysctl_##name = sysctl_##name / (factor))
|
|
- WRT_SYSCTL(sched_min_granularity);
|
|
- WRT_SYSCTL(sched_latency);
|
|
- WRT_SYSCTL(sched_wakeup_granularity);
|
|
+ WRT_SYSCTL(sched_base_slice);
|
|
#undef WRT_SYSCTL
|
|
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
-/*
|
|
- * delta /= w
|
|
- */
|
|
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
|
|
+void set_latency_fair(struct sched_entity *se, int prio)
|
|
{
|
|
- if (unlikely(se->load.weight != NICE_0_LOAD))
|
|
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
|
|
+ u32 weight = sched_prio_to_weight[prio];
|
|
+ u64 base = sysctl_sched_base_slice;
|
|
|
|
- return delta;
|
|
-}
|
|
-
|
|
-/*
|
|
- * The idea is to set a period in which each task runs once.
|
|
- *
|
|
- * When there are too many tasks (sched_nr_latency) we have to stretch
|
|
- * this period because otherwise the slices get too small.
|
|
- *
|
|
- * p = (nr <= nl) ? l : l*nr/nl
|
|
- */
|
|
-static u64 __sched_period(unsigned long nr_running)
|
|
-{
|
|
- if (unlikely(nr_running > sched_nr_latency))
|
|
- return nr_running * sysctl_sched_min_granularity;
|
|
- else
|
|
- return sysctl_sched_latency;
|
|
+ /*
|
|
+ * For EEVDF the virtual time slope is determined by w_i (iow.
|
|
+ * nice) while the request time r_i is determined by
|
|
+ * latency-nice.
|
|
+ *
|
|
+ * Smaller request gets better latency.
|
|
+ */
|
|
+ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
|
|
}
|
|
|
|
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
|
|
-
|
|
/*
|
|
- * We calculate the wall-time slice from the period by taking a part
|
|
- * proportional to the weight.
|
|
- *
|
|
- * s = p*P[w/rw]
|
|
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
|
|
+ * this is probably good enough.
|
|
*/
|
|
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
- unsigned int nr_running = cfs_rq->nr_running;
|
|
- struct sched_entity *init_se = se;
|
|
- unsigned int min_gran;
|
|
- u64 slice;
|
|
-
|
|
- if (sched_feat(ALT_PERIOD))
|
|
- nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
|
|
-
|
|
- slice = __sched_period(nr_running + !se->on_rq);
|
|
-
|
|
- for_each_sched_entity(se) {
|
|
- struct load_weight *load;
|
|
- struct load_weight lw;
|
|
- struct cfs_rq *qcfs_rq;
|
|
-
|
|
- qcfs_rq = cfs_rq_of(se);
|
|
- load = &qcfs_rq->load;
|
|
-
|
|
- if (unlikely(!se->on_rq)) {
|
|
- lw = qcfs_rq->load;
|
|
-
|
|
- update_load_add(&lw, se->load.weight);
|
|
- load = &lw;
|
|
- }
|
|
- slice = __calc_delta(slice, se->load.weight, load);
|
|
- }
|
|
-
|
|
- if (sched_feat(BASE_SLICE)) {
|
|
- if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
|
|
- min_gran = sysctl_sched_idle_min_granularity;
|
|
- else
|
|
- min_gran = sysctl_sched_min_granularity;
|
|
-
|
|
- slice = max_t(u64, slice, min_gran);
|
|
- }
|
|
-
|
|
- return slice;
|
|
-}
|
|
+ if ((s64)(se->vruntime - se->deadline) < 0)
|
|
+ return;
|
|
|
|
-/*
|
|
- * We calculate the vruntime slice of a to-be-inserted task.
|
|
- *
|
|
- * vs = s/w
|
|
- */
|
|
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
-{
|
|
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
|
|
+ /*
|
|
+ * EEVDF: vd_i = ve_i + r_i / w_i
|
|
+ */
|
|
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
|
|
}
|
|
|
|
#include "pelt.h"
|
|
@@ -939,6 +1097,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
|
|
schedstat_add(cfs_rq->exec_clock, delta_exec);
|
|
|
|
curr->vruntime += calc_delta_fair(delta_exec, curr);
|
|
+ update_deadline(cfs_rq, curr);
|
|
update_min_vruntime(cfs_rq);
|
|
|
|
if (entity_is_task(curr)) {
|
|
@@ -3336,16 +3495,28 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
|
|
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
unsigned long weight)
|
|
{
|
|
+ unsigned long old_weight = se->load.weight;
|
|
+
|
|
if (se->on_rq) {
|
|
/* commit outstanding execution time */
|
|
if (cfs_rq->curr == se)
|
|
update_curr(cfs_rq);
|
|
+ else
|
|
+ avg_vruntime_sub(cfs_rq, se);
|
|
update_load_sub(&cfs_rq->load, se->load.weight);
|
|
}
|
|
dequeue_load_avg(cfs_rq, se);
|
|
|
|
update_load_set(&se->load, weight);
|
|
|
|
+ if (!se->on_rq) {
|
|
+ /*
|
|
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v),
|
|
+ * we need to scale se->vlag when w_i changes.
|
|
+ */
|
|
+ se->vlag = div_s64(se->vlag * old_weight, weight);
|
|
+ }
|
|
+
|
|
#ifdef CONFIG_SMP
|
|
do {
|
|
u32 divider = get_pelt_divider(&se->avg);
|
|
@@ -3355,9 +3526,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
#endif
|
|
|
|
enqueue_load_avg(cfs_rq, se);
|
|
- if (se->on_rq)
|
|
+ if (se->on_rq) {
|
|
update_load_add(&cfs_rq->load, se->load.weight);
|
|
-
|
|
+ if (cfs_rq->curr != se)
|
|
+ avg_vruntime_add(cfs_rq, se);
|
|
+ }
|
|
}
|
|
|
|
void reweight_task(struct task_struct *p, int prio)
|
|
@@ -4653,158 +4826,135 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
-{
|
|
-#ifdef CONFIG_SCHED_DEBUG
|
|
- s64 d = se->vruntime - cfs_rq->min_vruntime;
|
|
-
|
|
- if (d < 0)
|
|
- d = -d;
|
|
-
|
|
- if (d > 3*sysctl_sched_latency)
|
|
- schedstat_inc(cfs_rq->nr_spread_over);
|
|
-#endif
|
|
-}
|
|
-
|
|
-static inline bool entity_is_long_sleeper(struct sched_entity *se)
|
|
+static void
|
|
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
{
|
|
- struct cfs_rq *cfs_rq;
|
|
- u64 sleep_time;
|
|
+ u64 vslice = calc_delta_fair(se->slice, se);
|
|
+ u64 vruntime = avg_vruntime(cfs_rq);
|
|
+ s64 lag = 0;
|
|
|
|
- if (se->exec_start == 0)
|
|
- return false;
|
|
+ /*
|
|
+ * Due to how V is constructed as the weighted average of entities,
|
|
+ * adding tasks with positive lag, or removing tasks with negative lag
|
|
+ * will move 'time' backwards, this can screw around with the lag of
|
|
+ * other tasks.
|
|
+ *
|
|
+ * EEVDF: placement strategy #1 / #2
|
|
+ */
|
|
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
|
|
+ struct sched_entity *curr = cfs_rq->curr;
|
|
+ unsigned long load;
|
|
|
|
- cfs_rq = cfs_rq_of(se);
|
|
+ lag = se->vlag;
|
|
|
|
- sleep_time = rq_clock_task(rq_of(cfs_rq));
|
|
+ /*
|
|
+ * For latency sensitive tasks; those that have a shorter than
|
|
+ * average slice and do not fully consume the slice, transition
|
|
+ * to EEVDF placement strategy #2.
|
|
+ */
|
|
+ if (sched_feat(PLACE_FUDGE) &&
|
|
+ cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) {
|
|
+ lag += vslice;
|
|
+ if (lag > 0)
|
|
+ lag = 0;
|
|
+ }
|
|
|
|
- /* Happen while migrating because of clock task divergence */
|
|
- if (sleep_time <= se->exec_start)
|
|
- return false;
|
|
+ /*
|
|
+ * If we want to place a task and preserve lag, we have to
|
|
+ * consider the effect of the new entity on the weighted
|
|
+ * average and compensate for this, otherwise lag can quickly
|
|
+ * evaporate:
|
|
+ *
|
|
+ * l_i = V - v_i <=> v_i = V - l_i
|
|
+ *
|
|
+ * V = v_avg = W*v_avg / W
|
|
+ *
|
|
+ * V' = (W*v_avg + w_i*v_i) / (W + w_i)
|
|
+ * = (W*v_avg + w_i(v_avg - l_i)) / (W + w_i)
|
|
+		 *    = v_avg - w_i*l_i/(W + w_i)
|
|
+		 *
|
|
+		 * l_i' = V' - v_i = v_avg - w_i*l_i/(W + w_i) - (v_avg - l_i)
|
|
+ * = l_i - w_i*l_i/(W + w_i)
|
|
+ *
|
|
+ * l_i = (W + w_i) * l_i' / W
|
|
+ */
|
|
+ load = cfs_rq->avg_load;
|
|
+ if (curr && curr->on_rq)
|
|
+ load += scale_load_down(curr->load.weight);
|
|
|
|
- sleep_time -= se->exec_start;
|
|
- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
|
|
- return true;
|
|
+ lag *= load + scale_load_down(se->load.weight);
|
|
+ if (WARN_ON_ONCE(!load))
|
|
+ load = 1;
|
|
+ lag = div_s64(lag, load);
|
|
|
|
- return false;
|
|
-}
|
|
+ vruntime -= lag;
|
|
+ }
|
|
|
|
-static void
|
|
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
-{
|
|
- u64 vruntime = cfs_rq->min_vruntime;
|
|
+ /*
|
|
+ * Base the deadline on the 'normal' EEVDF placement policy in an
|
|
+ * attempt to not let the bonus crud below wreck things completely.
|
|
+ */
|
|
+ se->deadline = vruntime;
|
|
|
|
/*
|
|
- * The 'current' period is already promised to the current tasks,
|
|
- * however the extra weight of the new task will slow them down a
|
|
- * little, place the new task so that it fits in the slot that
|
|
- * stays open at the end.
|
|
+ * The whole 'sleeper' bonus hack... :-/ This is strictly unfair.
|
|
+ *
|
|
+ * By giving a sleeping task a little boost, it becomes possible for a
|
|
+ * 50% task to compete equally with a 100% task. That is, strictly fair
|
|
+ * that setup would result in a 67% / 33% split. Sleeper bonus will
|
|
+ * change that to 50% / 50%.
|
|
+ *
|
|
+ * This thing hurts my brain, because tasks leaving with negative lag
|
|
+ * will move 'time' backward, so comparing against a historical
|
|
+ * se->vruntime is dodgy as heck.
|
|
*/
|
|
- if (initial && sched_feat(START_DEBIT))
|
|
- vruntime += sched_vslice(cfs_rq, se);
|
|
+ if (sched_feat(PLACE_BONUS) &&
|
|
+ (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)) {
|
|
+ /*
|
|
+ * If se->vruntime is ahead of vruntime, something dodgy
|
|
+ * happened and we cannot give bonus due to not having valid
|
|
+ * history.
|
|
+ */
|
|
+ if ((s64)(se->vruntime - vruntime) < 0) {
|
|
+ vruntime -= se->slice/2;
|
|
+ vruntime = max_vruntime(se->vruntime, vruntime);
|
|
+ }
|
|
+ }
|
|
|
|
- /* sleeps up to a single latency don't count. */
|
|
- if (!initial) {
|
|
- unsigned long thresh;
|
|
+ se->vruntime = vruntime;
|
|
|
|
- if (se_is_idle(se))
|
|
- thresh = sysctl_sched_min_granularity;
|
|
- else
|
|
- thresh = sysctl_sched_latency;
|
|
+ /*
|
|
+	 * When joining the competition; the existing tasks will be,
|
|
+ * on average, halfway through their slice, as such start tasks
|
|
+ * off with half a slice to ease into the competition.
|
|
+ */
|
|
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
|
|
+ vslice /= 2;
|
|
|
|
- /*
|
|
- * Halve their sleep time's effect, to allow
|
|
- * for a gentler effect of sleepers:
|
|
- */
|
|
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
|
|
- thresh >>= 1;
|
|
-
|
|
- vruntime -= thresh;
|
|
- }
|
|
-
|
|
- /*
|
|
- * Pull vruntime of the entity being placed to the base level of
|
|
- * cfs_rq, to prevent boosting it if placed backwards.
|
|
- * However, min_vruntime can advance much faster than real time, with
|
|
- * the extreme being when an entity with the minimal weight always runs
|
|
- * on the cfs_rq. If the waking entity slept for a long time, its
|
|
- * vruntime difference from min_vruntime may overflow s64 and their
|
|
- * comparison may get inversed, so ignore the entity's original
|
|
- * vruntime in that case.
|
|
- * The maximal vruntime speedup is given by the ratio of normal to
|
|
- * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
|
|
- * When placing a migrated waking entity, its exec_start has been set
|
|
- * from a different rq. In order to take into account a possible
|
|
- * divergence between new and prev rq's clocks task because of irq and
|
|
- * stolen time, we take an additional margin.
|
|
- * So, cutting off on the sleep time of
|
|
- * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
|
|
- * should be safe.
|
|
- */
|
|
- if (entity_is_long_sleeper(se))
|
|
- se->vruntime = vruntime;
|
|
- else
|
|
- se->vruntime = max_vruntime(se->vruntime, vruntime);
|
|
+ /*
|
|
+ * EEVDF: vd_i = ve_i + r_i/w_i
|
|
+ */
|
|
+ se->deadline += vslice;
|
|
}
|
|
|
|
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
|
|
|
|
static inline bool cfs_bandwidth_used(void);
|
|
|
|
-/*
|
|
- * MIGRATION
|
|
- *
|
|
- * dequeue
|
|
- * update_curr()
|
|
- * update_min_vruntime()
|
|
- * vruntime -= min_vruntime
|
|
- *
|
|
- * enqueue
|
|
- * update_curr()
|
|
- * update_min_vruntime()
|
|
- * vruntime += min_vruntime
|
|
- *
|
|
- * this way the vruntime transition between RQs is done when both
|
|
- * min_vruntime are up-to-date.
|
|
- *
|
|
- * WAKEUP (remote)
|
|
- *
|
|
- * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
|
|
- * vruntime -= min_vruntime
|
|
- *
|
|
- * enqueue
|
|
- * update_curr()
|
|
- * update_min_vruntime()
|
|
- * vruntime += min_vruntime
|
|
- *
|
|
- * this way we don't have the most up-to-date min_vruntime on the originating
|
|
- * CPU and an up-to-date min_vruntime on the destination CPU.
|
|
- */
|
|
-
|
|
static void
|
|
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
{
|
|
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
|
|
bool curr = cfs_rq->curr == se;
|
|
|
|
/*
|
|
* If we're the current task, we must renormalise before calling
|
|
* update_curr().
|
|
*/
|
|
- if (renorm && curr)
|
|
- se->vruntime += cfs_rq->min_vruntime;
|
|
+ if (curr)
|
|
+ place_entity(cfs_rq, se, flags);
|
|
|
|
update_curr(cfs_rq);
|
|
|
|
- /*
|
|
- * Otherwise, renormalise after, such that we're placed at the current
|
|
- * moment in time, instead of some random moment in the past. Being
|
|
- * placed in the past could significantly boost this task to the
|
|
- * fairness detriment of existing tasks.
|
|
- */
|
|
- if (renorm && !curr)
|
|
- se->vruntime += cfs_rq->min_vruntime;
|
|
-
|
|
/*
|
|
* When enqueuing a sched_entity, we must:
|
|
* - Update loads to have both entity and cfs_rq synced with now.
|
|
@@ -4816,18 +4966,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
*/
|
|
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
|
|
se_update_runnable(se);
|
|
+ /*
|
|
+ * XXX update_load_avg() above will have attached us to the pelt sum;
|
|
+ * but update_cfs_group() here will re-adjust the weight and have to
|
|
+ * undo/redo all that. Seems wasteful.
|
|
+ */
|
|
update_cfs_group(se);
|
|
+
|
|
+ /*
|
|
+	 * XXX now that the entity has been re-weighted, and its lag adjusted,
|
|
+ * we can place the entity.
|
|
+ */
|
|
+ if (!curr)
|
|
+ place_entity(cfs_rq, se, flags);
|
|
+
|
|
account_entity_enqueue(cfs_rq, se);
|
|
|
|
- if (flags & ENQUEUE_WAKEUP)
|
|
- place_entity(cfs_rq, se, 0);
|
|
/* Entity has migrated, no longer consider this task hot */
|
|
if (flags & ENQUEUE_MIGRATED)
|
|
se->exec_start = 0;
|
|
|
|
check_schedstat_required();
|
|
update_stats_enqueue_fair(cfs_rq, se, flags);
|
|
- check_spread(cfs_rq, se);
|
|
if (!curr)
|
|
__enqueue_entity(cfs_rq, se);
|
|
se->on_rq = 1;
|
|
@@ -4839,17 +4999,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
}
|
|
}
|
|
|
|
-static void __clear_buddies_last(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se) {
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- if (cfs_rq->last != se)
|
|
- break;
|
|
-
|
|
- cfs_rq->last = NULL;
|
|
- }
|
|
-}
|
|
-
|
|
static void __clear_buddies_next(struct sched_entity *se)
|
|
{
|
|
for_each_sched_entity(se) {
|
|
@@ -4861,27 +5010,10 @@ static void __clear_buddies_next(struct sched_entity *se)
|
|
}
|
|
}
|
|
|
|
-static void __clear_buddies_skip(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se) {
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- if (cfs_rq->skip != se)
|
|
- break;
|
|
-
|
|
- cfs_rq->skip = NULL;
|
|
- }
|
|
-}
|
|
-
|
|
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
- if (cfs_rq->last == se)
|
|
- __clear_buddies_last(se);
|
|
-
|
|
if (cfs_rq->next == se)
|
|
__clear_buddies_next(se);
|
|
-
|
|
- if (cfs_rq->skip == se)
|
|
- __clear_buddies_skip(se);
|
|
}
|
|
|
|
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
|
|
@@ -4915,20 +5047,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
|
clear_buddies(cfs_rq, se);
|
|
|
|
+ update_entity_lag(cfs_rq, se);
|
|
if (se != cfs_rq->curr)
|
|
__dequeue_entity(cfs_rq, se);
|
|
se->on_rq = 0;
|
|
account_entity_dequeue(cfs_rq, se);
|
|
|
|
- /*
|
|
- * Normalize after update_curr(); which will also have moved
|
|
- * min_vruntime if @se is the one holding it back. But before doing
|
|
- * update_min_vruntime() again, which will discount @se's position and
|
|
- * can move min_vruntime forward still more.
|
|
- */
|
|
- if (!(flags & DEQUEUE_SLEEP))
|
|
- se->vruntime -= cfs_rq->min_vruntime;
|
|
-
|
|
/* return excess runtime on last dequeue */
|
|
return_cfs_rq_runtime(cfs_rq);
|
|
|
|
@@ -4953,44 +5077,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
static void
|
|
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
{
|
|
- unsigned long ideal_runtime, delta_exec;
|
|
- struct sched_entity *se;
|
|
- s64 delta;
|
|
-
|
|
- /*
|
|
- * When many tasks blow up the sched_period; it is possible that
|
|
- * sched_slice() reports unusually large results (when many tasks are
|
|
- * very light for example). Therefore impose a maximum.
|
|
- */
|
|
- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
|
|
-
|
|
- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
|
|
- if (delta_exec > ideal_runtime) {
|
|
+ if (pick_eevdf(cfs_rq) != curr) {
|
|
resched_curr(rq_of(cfs_rq));
|
|
/*
|
|
* The current task ran long enough, ensure it doesn't get
|
|
* re-elected due to buddy favours.
|
|
*/
|
|
clear_buddies(cfs_rq, curr);
|
|
- return;
|
|
}
|
|
-
|
|
- /*
|
|
- * Ensure that a task that missed wakeup preemption by a
|
|
- * narrow margin doesn't have to wait for a full slice.
|
|
- * This also mitigates buddy induced latencies under load.
|
|
- */
|
|
- if (delta_exec < sysctl_sched_min_granularity)
|
|
- return;
|
|
-
|
|
- se = __pick_first_entity(cfs_rq);
|
|
- delta = curr->vruntime - se->vruntime;
|
|
-
|
|
- if (delta < 0)
|
|
- return;
|
|
-
|
|
- if (delta > ideal_runtime)
|
|
- resched_curr(rq_of(cfs_rq));
|
|
}
|
|
|
|
static void
|
|
@@ -5031,9 +5125,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
se->prev_sum_exec_runtime = se->sum_exec_runtime;
|
|
}
|
|
|
|
-static int
|
|
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
|
|
-
|
|
/*
|
|
* Pick the next process, keeping these things in mind, in this order:
|
|
* 1) keep things fair between processes/task groups
|
|
@@ -5044,50 +5135,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
|
|
static struct sched_entity *
|
|
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
{
|
|
- struct sched_entity *left = __pick_first_entity(cfs_rq);
|
|
- struct sched_entity *se;
|
|
-
|
|
/*
|
|
- * If curr is set we have to see if its left of the leftmost entity
|
|
- * still in the tree, provided there was anything in the tree at all.
|
|
+ * Enabling NEXT_BUDDY will affect latency but not fairness.
|
|
*/
|
|
- if (!left || (curr && entity_before(curr, left)))
|
|
- left = curr;
|
|
-
|
|
- se = left; /* ideally we run the leftmost entity */
|
|
+ if (sched_feat(NEXT_BUDDY) &&
|
|
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
|
|
+ return cfs_rq->next;
|
|
|
|
- /*
|
|
- * Avoid running the skip buddy, if running something else can
|
|
- * be done without getting too unfair.
|
|
- */
|
|
- if (cfs_rq->skip && cfs_rq->skip == se) {
|
|
- struct sched_entity *second;
|
|
-
|
|
- if (se == curr) {
|
|
- second = __pick_first_entity(cfs_rq);
|
|
- } else {
|
|
- second = __pick_next_entity(se);
|
|
- if (!second || (curr && entity_before(curr, second)))
|
|
- second = curr;
|
|
- }
|
|
-
|
|
- if (second && wakeup_preempt_entity(second, left) < 1)
|
|
- se = second;
|
|
- }
|
|
-
|
|
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
|
|
- /*
|
|
- * Someone really wants this to run. If it's not unfair, run it.
|
|
- */
|
|
- se = cfs_rq->next;
|
|
- } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
|
|
- /*
|
|
- * Prefer last buddy, try to return the CPU to a preempted task.
|
|
- */
|
|
- se = cfs_rq->last;
|
|
- }
|
|
-
|
|
- return se;
|
|
+ return pick_eevdf(cfs_rq);
|
|
}
|
|
|
|
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
|
|
@@ -5104,8 +5159,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
|
|
/* throttle cfs_rqs exceeding runtime */
|
|
check_cfs_rq_runtime(cfs_rq);
|
|
|
|
- check_spread(cfs_rq, prev);
|
|
-
|
|
if (prev->on_rq) {
|
|
update_stats_wait_start_fair(cfs_rq, prev);
|
|
/* Put 'current' back into the tree. */
|
|
@@ -6149,13 +6202,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
|
|
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
|
|
{
|
|
struct sched_entity *se = &p->se;
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
|
SCHED_WARN_ON(task_rq(p) != rq);
|
|
|
|
if (rq->cfs.h_nr_running > 1) {
|
|
- u64 slice = sched_slice(cfs_rq, se);
|
|
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
|
|
+ u64 slice = se->slice;
|
|
s64 delta = slice - ran;
|
|
|
|
if (delta < 0) {
|
|
@@ -6179,8 +6231,7 @@ static void hrtick_update(struct rq *rq)
|
|
if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
|
|
return;
|
|
|
|
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
|
|
- hrtick_start_fair(rq, curr);
|
|
+ hrtick_start_fair(rq, curr);
|
|
}
|
|
#else /* !CONFIG_SCHED_HRTICK */
|
|
static inline void
|
|
@@ -6221,17 +6272,6 @@ static int sched_idle_rq(struct rq *rq)
|
|
rq->nr_running);
|
|
}
|
|
|
|
-/*
|
|
- * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
|
|
- * of idle_nr_running, which does not consider idle descendants of normal
|
|
- * entities.
|
|
- */
|
|
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
|
|
-{
|
|
- return cfs_rq->nr_running &&
|
|
- cfs_rq->nr_running == cfs_rq->idle_nr_running;
|
|
-}
|
|
-
|
|
#ifdef CONFIG_SMP
|
|
static int sched_idle_cpu(int cpu)
|
|
{
|
|
@@ -6333,18 +6373,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
|
|
|
static void set_next_buddy(struct sched_entity *se);
|
|
|
|
-static inline void dur_avg_update(struct task_struct *p, bool task_sleep)
|
|
-{
|
|
- u64 dur;
|
|
-
|
|
- if (!task_sleep)
|
|
- return;
|
|
-
|
|
- dur = p->se.sum_exec_runtime - p->se.prev_sleep_sum_runtime;
|
|
- p->se.prev_sleep_sum_runtime = p->se.sum_exec_runtime;
|
|
- update_avg(&p->se.dur_avg, dur);
|
|
-}
|
|
-
|
|
/*
|
|
* The dequeue_task method is called before nr_running is
|
|
* decreased. We remove the task from the rbtree and
|
|
@@ -6417,7 +6445,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
|
|
|
dequeue_throttle:
|
|
util_est_update(&rq->cfs, p, task_sleep);
|
|
- dur_avg_update(p, task_sleep);
|
|
hrtick_update(rq);
|
|
}
|
|
|
|
@@ -6551,23 +6578,6 @@ static int wake_wide(struct task_struct *p)
|
|
return 1;
|
|
}
|
|
|
|
-/*
|
|
- * If a task switches in and then voluntarily relinquishes the
|
|
- * CPU quickly, it is regarded as a short duration task.
|
|
- *
|
|
- * SIS_SHORT tries to wake up the short wakee on current CPU. This
|
|
- * aims to avoid race condition among CPUs due to frequent context
|
|
- * switch. Besides, the candidate short task should not be the one
|
|
- * that wakes up more than one tasks, otherwise SIS_SHORT might
|
|
- * stack too many tasks on current CPU.
|
|
- */
|
|
-static inline int is_short_task(struct task_struct *p)
|
|
-{
|
|
- return sched_feat(SIS_SHORT) && !p->wakee_flips &&
|
|
- p->se.dur_avg &&
|
|
- ((p->se.dur_avg * 8) < sysctl_sched_min_granularity);
|
|
-}
|
|
-
|
|
/*
|
|
* The purpose of wake_affine() is to quickly determine on which CPU we can run
|
|
* soonest. For the purpose of speed we only consider the waking and previous
|
|
@@ -6604,11 +6614,6 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
|
|
if (available_idle_cpu(prev_cpu))
|
|
return prev_cpu;
|
|
|
|
- /* The only running task is a short duration one. */
|
|
- if (cpu_rq(this_cpu)->nr_running == 1 &&
|
|
- is_short_task(rcu_dereference(cpu_curr(this_cpu))))
|
|
- return this_cpu;
|
|
-
|
|
return nr_cpumask_bits;
|
|
}
|
|
|
|
@@ -6983,20 +6988,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
|
|
/* overloaded LLC is unlikely to have idle cpu/core */
|
|
if (nr == 1)
|
|
return -1;
|
|
-
|
|
- /*
|
|
- * If the scan number suggested by SIS_UTIL is smaller
|
|
- * than 60% of llc_weight, it indicates a util_avg% higher
|
|
- * than 50%. System busier than this could lower its bar to
|
|
- * choose a compromised "idle" CPU. This co-exists with
|
|
- * !has_idle_core to not stack too many tasks on one CPU.
|
|
- */
|
|
- if (!has_idle_core && this == target &&
|
|
- (5 * nr < 3 * sd->span_weight) &&
|
|
- cpu_rq(target)->nr_running <= 1 &&
|
|
- is_short_task(p) &&
|
|
- is_short_task(rcu_dereference(cpu_curr(target))))
|
|
- return target;
|
|
}
|
|
}
|
|
|
|
@@ -7729,18 +7720,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
|
|
{
|
|
struct sched_entity *se = &p->se;
|
|
|
|
- /*
|
|
- * As blocked tasks retain absolute vruntime the migration needs to
|
|
- * deal with this by subtracting the old and adding the new
|
|
- * min_vruntime -- the latter is done by enqueue_entity() when placing
|
|
- * the task on the new runqueue.
|
|
- */
|
|
- if (READ_ONCE(p->__state) == TASK_WAKING) {
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
-
|
|
- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
|
|
- }
|
|
-
|
|
if (!task_on_rq_migrating(p)) {
|
|
remove_entity_load_avg(se);
|
|
|
|
@@ -7778,66 +7757,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
|
}
|
|
#endif /* CONFIG_SMP */
|
|
|
|
-static unsigned long wakeup_gran(struct sched_entity *se)
|
|
-{
|
|
- unsigned long gran = sysctl_sched_wakeup_granularity;
|
|
-
|
|
- /*
|
|
- * Since its curr running now, convert the gran from real-time
|
|
- * to virtual-time in his units.
|
|
- *
|
|
- * By using 'se' instead of 'curr' we penalize light tasks, so
|
|
- * they get preempted easier. That is, if 'se' < 'curr' then
|
|
- * the resulting gran will be larger, therefore penalizing the
|
|
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
|
|
- * be smaller, again penalizing the lighter task.
|
|
- *
|
|
- * This is especially important for buddies when the leftmost
|
|
- * task is higher priority than the buddy.
|
|
- */
|
|
- return calc_delta_fair(gran, se);
|
|
-}
|
|
-
|
|
-/*
|
|
- * Should 'se' preempt 'curr'.
|
|
- *
|
|
- * |s1
|
|
- * |s2
|
|
- * |s3
|
|
- * g
|
|
- * |<--->|c
|
|
- *
|
|
- * w(c, s1) = -1
|
|
- * w(c, s2) = 0
|
|
- * w(c, s3) = 1
|
|
- *
|
|
- */
|
|
-static int
|
|
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
|
|
-{
|
|
- s64 gran, vdiff = curr->vruntime - se->vruntime;
|
|
-
|
|
- if (vdiff <= 0)
|
|
- return -1;
|
|
-
|
|
- gran = wakeup_gran(se);
|
|
- if (vdiff > gran)
|
|
- return 1;
|
|
-
|
|
- return 0;
|
|
-}
|
|
-
|
|
-static void set_last_buddy(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se) {
|
|
- if (SCHED_WARN_ON(!se->on_rq))
|
|
- return;
|
|
- if (se_is_idle(se))
|
|
- return;
|
|
- cfs_rq_of(se)->last = se;
|
|
- }
|
|
-}
|
|
-
|
|
static void set_next_buddy(struct sched_entity *se)
|
|
{
|
|
for_each_sched_entity(se) {
|
|
@@ -7849,12 +7768,6 @@ static void set_next_buddy(struct sched_entity *se)
|
|
}
|
|
}
|
|
|
|
-static void set_skip_buddy(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se)
|
|
- cfs_rq_of(se)->skip = se;
|
|
-}
|
|
-
|
|
/*
|
|
* Preempt the current task with a newly woken task if needed:
|
|
*/
|
|
@@ -7863,7 +7776,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
|
struct task_struct *curr = rq->curr;
|
|
struct sched_entity *se = &curr->se, *pse = &p->se;
|
|
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
|
|
- int scale = cfs_rq->nr_running >= sched_nr_latency;
|
|
int next_buddy_marked = 0;
|
|
int cse_is_idle, pse_is_idle;
|
|
|
|
@@ -7879,7 +7791,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
|
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
|
|
return;
|
|
|
|
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
|
|
+ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
|
|
set_next_buddy(pse);
|
|
next_buddy_marked = 1;
|
|
}
|
|
@@ -7924,35 +7836,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
|
if (cse_is_idle != pse_is_idle)
|
|
return;
|
|
|
|
- update_curr(cfs_rq_of(se));
|
|
- if (wakeup_preempt_entity(se, pse) == 1) {
|
|
- /*
|
|
- * Bias pick_next to pick the sched entity that is
|
|
- * triggering this preemption.
|
|
- */
|
|
- if (!next_buddy_marked)
|
|
- set_next_buddy(pse);
|
|
+ cfs_rq = cfs_rq_of(se);
|
|
+ update_curr(cfs_rq);
|
|
+
|
|
+ /*
|
|
+ * XXX: should this instead preempt whenever pick_eevdf(cfs_rq) != se ?
|
|
+ */
|
|
+ if (pick_eevdf(cfs_rq) == pse)
|
|
goto preempt;
|
|
- }
|
|
|
|
return;
|
|
|
|
preempt:
|
|
resched_curr(rq);
|
|
- /*
|
|
- * Only set the backward buddy when the current task is still
|
|
- * on the rq. This can happen when a wakeup gets interleaved
|
|
- * with schedule on the ->pre_schedule() or idle_balance()
|
|
- * point, either of which can * drop the rq lock.
|
|
- *
|
|
- * Also, during early boot the idle thread is in the fair class,
|
|
- * for obvious reasons its a bad idea to schedule back to it.
|
|
- */
|
|
- if (unlikely(!se->on_rq || curr == rq->idle))
|
|
- return;
|
|
-
|
|
- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
|
|
- set_last_buddy(se);
|
|
}
|
|
|
|
#ifdef CONFIG_SMP
|
|
@@ -8153,8 +8049,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
|
|
|
|
/*
|
|
* sched_yield() is very simple
|
|
- *
|
|
- * The magic of dealing with the ->skip buddy is in pick_next_entity.
|
|
*/
|
|
static void yield_task_fair(struct rq *rq)
|
|
{
|
|
@@ -8170,21 +8064,19 @@ static void yield_task_fair(struct rq *rq)
|
|
|
|
clear_buddies(cfs_rq, se);
|
|
|
|
- if (curr->policy != SCHED_BATCH) {
|
|
- update_rq_clock(rq);
|
|
- /*
|
|
- * Update run-time statistics of the 'current'.
|
|
- */
|
|
- update_curr(cfs_rq);
|
|
- /*
|
|
- * Tell update_rq_clock() that we've just updated,
|
|
- * so we don't do microscopic update in schedule()
|
|
- * and double the fastpath cost.
|
|
- */
|
|
- rq_clock_skip_update(rq);
|
|
- }
|
|
+ update_rq_clock(rq);
|
|
+ /*
|
|
+ * Update run-time statistics of the 'current'.
|
|
+ */
|
|
+ update_curr(cfs_rq);
|
|
+ /*
|
|
+ * Tell update_rq_clock() that we've just updated,
|
|
+ * so we don't do microscopic update in schedule()
|
|
+ * and double the fastpath cost.
|
|
+ */
|
|
+ rq_clock_skip_update(rq);
|
|
|
|
- set_skip_buddy(se);
|
|
+ se->deadline += calc_delta_fair(se->slice, se);
|
|
}
|
|
|
|
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
|
|
@@ -8427,8 +8319,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
|
|
* Buddy candidates are cache hot:
|
|
*/
|
|
if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
|
|
- (&p->se == cfs_rq_of(&p->se)->next ||
|
|
- &p->se == cfs_rq_of(&p->se)->last))
|
|
+ (&p->se == cfs_rq_of(&p->se)->next))
|
|
return 1;
|
|
|
|
if (sysctl_sched_migration_cost == -1)
|
|
@@ -11932,8 +11823,8 @@ static void rq_offline_fair(struct rq *rq)
|
|
static inline bool
|
|
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
|
|
{
|
|
- u64 slice = sched_slice(cfs_rq_of(se), se);
|
|
u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
|
|
+ u64 slice = se->slice;
|
|
|
|
return (rtime * min_nr_tasks > slice);
|
|
}
|
|
@@ -12077,8 +11968,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
|
|
*/
|
|
static void task_fork_fair(struct task_struct *p)
|
|
{
|
|
- struct cfs_rq *cfs_rq;
|
|
struct sched_entity *se = &p->se, *curr;
|
|
+ struct cfs_rq *cfs_rq;
|
|
struct rq *rq = this_rq();
|
|
struct rq_flags rf;
|
|
|
|
@@ -12087,22 +11978,9 @@ static void task_fork_fair(struct task_struct *p)
|
|
|
|
cfs_rq = task_cfs_rq(current);
|
|
curr = cfs_rq->curr;
|
|
- if (curr) {
|
|
+ if (curr)
|
|
update_curr(cfs_rq);
|
|
- se->vruntime = curr->vruntime;
|
|
- }
|
|
- place_entity(cfs_rq, se, 1);
|
|
-
|
|
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
|
|
- /*
|
|
- * Upon rescheduling, sched_class::put_prev_task() will place
|
|
- * 'current' within the tree based on its new key value.
|
|
- */
|
|
- swap(curr->vruntime, se->vruntime);
|
|
- resched_curr(rq);
|
|
- }
|
|
-
|
|
- se->vruntime -= cfs_rq->min_vruntime;
|
|
+ place_entity(cfs_rq, se, ENQUEUE_INITIAL);
|
|
rq_unlock(rq, &rf);
|
|
}
|
|
|
|
@@ -12131,34 +12009,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
|
|
check_preempt_curr(rq, p, 0);
|
|
}
|
|
|
|
-static inline bool vruntime_normalized(struct task_struct *p)
|
|
-{
|
|
- struct sched_entity *se = &p->se;
|
|
-
|
|
- /*
|
|
- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
|
|
- * the dequeue_entity(.flags=0) will already have normalized the
|
|
- * vruntime.
|
|
- */
|
|
- if (p->on_rq)
|
|
- return true;
|
|
-
|
|
- /*
|
|
- * When !on_rq, vruntime of the task has usually NOT been normalized.
|
|
- * But there are some cases where it has already been normalized:
|
|
- *
|
|
- * - A forked child which is waiting for being woken up by
|
|
- * wake_up_new_task().
|
|
- * - A task which has been woken up by try_to_wake_up() and
|
|
- * waiting for actually being woken up by sched_ttwu_pending().
|
|
- */
|
|
- if (!se->sum_exec_runtime ||
|
|
- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
|
|
- return true;
|
|
-
|
|
- return false;
|
|
-}
|
|
-
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
/*
|
|
* Propagate the changes of the sched_entity across the tg tree to make it
|
|
@@ -12229,16 +12079,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
|
|
static void detach_task_cfs_rq(struct task_struct *p)
|
|
{
|
|
struct sched_entity *se = &p->se;
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
-
|
|
- if (!vruntime_normalized(p)) {
|
|
- /*
|
|
- * Fix up our vruntime so that the current sleep doesn't
|
|
- * cause 'unlimited' sleep bonus.
|
|
- */
|
|
- place_entity(cfs_rq, se, 0);
|
|
- se->vruntime -= cfs_rq->min_vruntime;
|
|
- }
|
|
|
|
detach_entity_cfs_rq(se);
|
|
}
|
|
@@ -12246,12 +12086,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
|
|
static void attach_task_cfs_rq(struct task_struct *p)
|
|
{
|
|
struct sched_entity *se = &p->se;
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
|
attach_entity_cfs_rq(se);
|
|
-
|
|
- if (!vruntime_normalized(p))
|
|
- se->vruntime += cfs_rq->min_vruntime;
|
|
}
|
|
|
|
static void switched_from_fair(struct rq *rq, struct task_struct *p)
|
|
@@ -12362,6 +12198,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
|
|
goto err;
|
|
|
|
tg->shares = NICE_0_LOAD;
|
|
+ tg->latency_prio = DEFAULT_PRIO;
|
|
|
|
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
|
|
|
|
@@ -12460,6 +12297,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
|
|
}
|
|
|
|
se->my_q = cfs_rq;
|
|
+
|
|
+ set_latency_fair(se, tg->latency_prio - MAX_RT_PRIO);
|
|
+
|
|
/* guarantee group entities always have weight */
|
|
update_load_set(&se->load, NICE_0_LOAD);
|
|
se->parent = parent;
|
|
@@ -12590,6 +12430,29 @@ int sched_group_set_idle(struct task_group *tg, long idle)
|
|
return 0;
|
|
}
|
|
|
|
+int sched_group_set_latency(struct task_group *tg, int prio)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ if (tg == &root_task_group)
|
|
+ return -EINVAL;
|
|
+
|
|
+ mutex_lock(&shares_mutex);
|
|
+
|
|
+ if (tg->latency_prio == prio) {
|
|
+ mutex_unlock(&shares_mutex);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ tg->latency_prio = prio;
|
|
+
|
|
+ for_each_possible_cpu(i)
|
|
+ set_latency_fair(tg->se[i], prio - MAX_RT_PRIO);
|
|
+
|
|
+ mutex_unlock(&shares_mutex);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
#else /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
|
void free_fair_sched_group(struct task_group *tg) { }
|
|
@@ -12616,7 +12479,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
|
|
* idle runqueue:
|
|
*/
|
|
if (rq->cfs.load.weight)
|
|
- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
|
|
+ rr_interval = NS_TO_JIFFIES(se->slice);
|
|
|
|
return rr_interval;
|
|
}
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index efdc29c42161..d4b7d3f7c044 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -1,16 +1,15 @@
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
-/*
|
|
- * Only give sleepers 50% of their service deficit. This allows
|
|
- * them to run sooner, but does not allow tons of sleepers to
|
|
- * rip the spread apart.
|
|
- */
|
|
-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
|
|
|
|
/*
|
|
- * Place new tasks ahead so that they do not starve already running
|
|
- * tasks
|
|
+ * Using the avg_vruntime, do the right thing and preserve lag across
|
|
+ * sleep+wake cycles. EEVDF placement strategy #1 when enabled, #2 if disabled.
|
|
*/
|
|
-SCHED_FEAT(START_DEBIT, true)
|
|
+SCHED_FEAT(PLACE_LAG, true)
|
|
+SCHED_FEAT(PLACE_FUDGE, true)
|
|
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
|
|
+SCHED_FEAT(PLACE_BONUS, false)
|
|
+
|
|
+SCHED_FEAT(MINIMAL_VA, false)
|
|
|
|
/*
|
|
* Prefer to schedule the task we woke last (assuming it failed
|
|
@@ -19,13 +18,6 @@ SCHED_FEAT(START_DEBIT, true)
|
|
*/
|
|
SCHED_FEAT(NEXT_BUDDY, false)
|
|
|
|
-/*
|
|
- * Prefer to schedule the task that ran last (when we did
|
|
- * wake-preempt) as that likely will touch the same data, increases
|
|
- * cache locality.
|
|
- */
|
|
-SCHED_FEAT(LAST_BUDDY, true)
|
|
-
|
|
/*
|
|
* Consider buddies to be cache hot, decreases the likeliness of a
|
|
* cache buddy being migrated away, increases cache locality.
|
|
@@ -62,7 +54,6 @@ SCHED_FEAT(TTWU_QUEUE, true)
|
|
*/
|
|
SCHED_FEAT(SIS_PROP, false)
|
|
SCHED_FEAT(SIS_UTIL, true)
|
|
-SCHED_FEAT(SIS_SHORT, true)
|
|
|
|
/*
|
|
* Issue a WARN when we do multiple update_rq_clock() calls
|
|
@@ -99,6 +90,3 @@ SCHED_FEAT(UTIL_EST, true)
|
|
SCHED_FEAT(UTIL_EST_FASTUP, true)
|
|
|
|
SCHED_FEAT(LATENCY_WARN, false)
|
|
-
|
|
-SCHED_FEAT(ALT_PERIOD, true)
|
|
-SCHED_FEAT(BASE_SLICE, true)
|
|
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index 7331d436ebc4..bfce45b21441 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -378,6 +378,8 @@ struct task_group {
|
|
|
|
/* A positive value indicates that this is a SCHED_IDLE group. */
|
|
int idle;
|
|
+ /* latency priority of the group. */
|
|
+ int latency_prio;
|
|
|
|
#ifdef CONFIG_SMP
|
|
/*
|
|
@@ -488,6 +490,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
|
|
|
|
extern int sched_group_set_idle(struct task_group *tg, long idle);
|
|
|
|
+extern int sched_group_set_latency(struct task_group *tg, int prio);
|
|
+
|
|
#ifdef CONFIG_SMP
|
|
extern void set_task_rq_fair(struct sched_entity *se,
|
|
struct cfs_rq *prev, struct cfs_rq *next);
|
|
@@ -554,6 +558,10 @@ struct cfs_rq {
|
|
unsigned int idle_nr_running; /* SCHED_IDLE */
|
|
unsigned int idle_h_nr_running; /* SCHED_IDLE */
|
|
|
|
+ s64 avg_vruntime;
|
|
+ u64 avg_slice;
|
|
+ u64 avg_load;
|
|
+
|
|
u64 exec_clock;
|
|
u64 min_vruntime;
|
|
#ifdef CONFIG_SCHED_CORE
|
|
@@ -573,8 +581,6 @@ struct cfs_rq {
|
|
*/
|
|
struct sched_entity *curr;
|
|
struct sched_entity *next;
|
|
- struct sched_entity *last;
|
|
- struct sched_entity *skip;
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
unsigned int nr_spread_over;
|
|
@@ -2154,7 +2160,7 @@ extern const u32 sched_prio_to_wmult[40];
|
|
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
|
|
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
|
|
* ENQUEUE_MIGRATED - the task was migrated during wakeup
|
|
- *
|
|
+ * ENQUEUE_INITIAL - place a new task (fork/clone)
|
|
*/
|
|
|
|
#define DEQUEUE_SLEEP 0x01
|
|
@@ -2174,6 +2180,7 @@ extern const u32 sched_prio_to_wmult[40];
|
|
#else
|
|
#define ENQUEUE_MIGRATED 0x00
|
|
#endif
|
|
+#define ENQUEUE_INITIAL 0x80
|
|
|
|
#define RETRY_TASK ((void *)-1UL)
|
|
|
|
@@ -2476,10 +2483,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate;
|
|
extern const_debug unsigned int sysctl_sched_migration_cost;
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
-extern unsigned int sysctl_sched_latency;
|
|
-extern unsigned int sysctl_sched_min_granularity;
|
|
-extern unsigned int sysctl_sched_idle_min_granularity;
|
|
-extern unsigned int sysctl_sched_wakeup_granularity;
|
|
+extern unsigned int sysctl_sched_base_slice;
|
|
extern int sysctl_resched_latency_warn_ms;
|
|
extern int sysctl_resched_latency_warn_once;
|
|
|
|
@@ -2492,6 +2496,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
|
|
extern unsigned int sysctl_numa_balancing_hot_threshold;
|
|
#endif
|
|
|
|
+extern void set_latency_fair(struct sched_entity *se, int prio);
|
|
+
|
|
#ifdef CONFIG_SCHED_HRTICK
|
|
|
|
/*
|
|
@@ -3323,4 +3329,7 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n
|
|
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
|
|
#endif
|
|
|
|
+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
|
|
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
|
|
+
|
|
#endif /* _KERNEL_SCHED_SCHED_H */
|
|
diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
|
|
index 3bac0a8ceab2..b2e932c25be6 100644
|
|
--- a/tools/include/uapi/linux/sched.h
|
|
+++ b/tools/include/uapi/linux/sched.h
|
|
@@ -132,6 +132,7 @@ struct clone_args {
|
|
#define SCHED_FLAG_KEEP_PARAMS 0x10
|
|
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
|
|
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
|
|
+#define SCHED_FLAG_LATENCY_NICE 0x80
|
|
|
|
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
|
|
SCHED_FLAG_KEEP_PARAMS)
|
|
@@ -143,6 +144,7 @@ struct clone_args {
|
|
SCHED_FLAG_RECLAIM | \
|
|
SCHED_FLAG_DL_OVERRUN | \
|
|
SCHED_FLAG_KEEP_ALL | \
|
|
- SCHED_FLAG_UTIL_CLAMP)
|
|
+ SCHED_FLAG_UTIL_CLAMP | \
|
|
+ SCHED_FLAG_LATENCY_NICE)
|
|
|
|
#endif /* _UAPI_LINUX_SCHED_H */
|
|
--
|
|
2.40.0
|