ferrreo 2023-08-10 18:30:38 +01:00
parent 9ec4cbb0e4
commit 60dc452fb5
7 changed files with 2915 additions and 5588 deletions

config

@ -594,7 +594,9 @@ CONFIG_CALL_DEPTH_TRACKING=y
# CONFIG_CALL_THUNKS_DEBUG is not set
CONFIG_CPU_IBPB_ENTRY=y
CONFIG_CPU_IBRS_ENTRY=y
CONFIG_CPU_SRSO=y
CONFIG_SLS=y
# CONFIG_GDS_FORCE_MITIGATION is not set
CONFIG_ARCH_HAS_ADD_PAGES=y
CONFIG_ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE=y
@ -1333,11 +1335,10 @@ CONFIG_TCP_CONG_YEAH=m
CONFIG_TCP_CONG_ILLINOIS=m
CONFIG_TCP_CONG_DCTCP=m
CONFIG_TCP_CONG_CDG=m
CONFIG_TCP_CONG_BBR=m
CONFIG_TCP_CONG_BBR2=y
CONFIG_DEFAULT_BBR2=y
CONFIG_TCP_CONG_BBR=y
CONFIG_DEFAULT_BBR=y
# CONFIG_DEFAULT_RENO is not set
CONFIG_DEFAULT_TCP_CONG="bbr2"
CONFIG_DEFAULT_TCP_CONG="bbr"
CONFIG_TCP_MD5SIG=y
CONFIG_IPV6=y
CONFIG_IPV6_ROUTER_PREF=y
@ -2613,7 +2614,7 @@ CONFIG_ZRAM_DEF_COMP_ZSTD=y
# CONFIG_ZRAM_DEF_COMP_842 is not set
CONFIG_ZRAM_DEF_COMP="zstd"
CONFIG_ZRAM_WRITEBACK=y
# CONFIG_ZRAM_MEMORY_TRACKING is not set
CONFIG_ZRAM_MEMORY_TRACKING=y
CONFIG_ZRAM_MULTI_COMP=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_LOOP_MIN_COUNT=0
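
With this hunk the out-of-tree BBR2 module is dropped and mainline BBR becomes the built-in default congestion control, and zram memory tracking is now enabled. A minimal userspace sketch (not part of the commit) to confirm which algorithm new TCP sockets inherit after the change:

/* Illustrative sketch: query the congestion control a fresh socket gets.
 * With this config it should print "bbr", inherited from
 * net.ipv4.tcp_congestion_control. */
#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	char cc[16] = "";
	socklen_t len = sizeof(cc);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cc, &len) == 0)
		printf("default congestion control: %s\n", cc);
	close(fd);
	return 0;
}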

File diff suppressed because it is too large


@ -1,7 +1,7 @@
From 0af97bb369de3bfe15d724e9bb0e3c971c6f9f20 Mon Sep 17 00:00:00 2001
From 218c51e49185b75b4e36c8f11b5c77686f955a0a Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Mon, 10 Jul 2023 17:12:45 +0200
Subject: [PATCH] EEVDF-cachy
Date: Sun, 30 Jul 2023 09:38:51 +0200
Subject: [PATCH] EEVDF
Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
@ -13,11 +13,11 @@ Signed-off-by: Peter Jung <admin@ptr1337.dev>
init/init_task.c | 3 +-
kernel/sched/core.c | 65 +-
kernel/sched/debug.c | 49 +-
kernel/sched/fair.c | 1157 +++++++++++------------
kernel/sched/features.h | 24 +-
kernel/sched/sched.h | 22 +-
kernel/sched/fair.c | 1138 +++++++++++------------
kernel/sched/features.h | 23 +-
kernel/sched/sched.h | 21 +-
tools/include/uapi/linux/sched.h | 4 +-
12 files changed, 733 insertions(+), 658 deletions(-)
12 files changed, 702 insertions(+), 668 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 4ef890191196..3a8d3e1e5591 100644
@ -78,7 +78,7 @@ index 7ee7ed5de722..6dbc5a1bf6a8 100644
* Template for declaring augmented rbtree callbacks (generic case)
*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index efc9f4bdc4ca..e99a9aa6a972 100644
index 609bde814cb0..c940c4dc8304 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -549,13 +549,18 @@ struct sched_entity {
@ -196,7 +196,7 @@ index ff6c4b9bfe6b..511cbcf3510d 100644
.rt = {
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 83e36547af17..8a541fe2d462 100644
index c52c2eba7c73..aff81e12460e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load)
@ -232,7 +232,7 @@ index 83e36547af17..8a541fe2d462 100644
/*
* We don't need the reset flag anymore after the fork. It has
@@ -7529,7 +7539,7 @@ static struct task_struct *find_process_by_pid(pid_t pid)
@@ -7516,7 +7526,7 @@ static struct task_struct *find_process_by_pid(pid_t pid)
#define SETPARAM_POLICY -1
static void __setscheduler_params(struct task_struct *p,
@ -241,7 +241,7 @@ index 83e36547af17..8a541fe2d462 100644
{
int policy = attr->sched_policy;
@@ -7553,6 +7563,13 @@ static void __setscheduler_params(struct task_struct *p,
@@ -7540,6 +7550,13 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}
@ -255,7 +255,7 @@ index 83e36547af17..8a541fe2d462 100644
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -7687,6 +7704,13 @@ static int __sched_setscheduler(struct task_struct *p,
@@ -7674,6 +7691,13 @@ static int __sched_setscheduler(struct task_struct *p,
return retval;
}
@ -269,7 +269,7 @@ index 83e36547af17..8a541fe2d462 100644
/* Update task specific "requested" clamps */
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
retval = uclamp_validate(p, attr);
@@ -7734,6 +7758,9 @@ static int __sched_setscheduler(struct task_struct *p,
@@ -7721,6 +7745,9 @@ static int __sched_setscheduler(struct task_struct *p,
goto change;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
goto change;
@ -279,7 +279,7 @@ index 83e36547af17..8a541fe2d462 100644
p->sched_reset_on_fork = reset_on_fork;
retval = 0;
@@ -7822,6 +7849,7 @@ static int __sched_setscheduler(struct task_struct *p,
@@ -7809,6 +7836,7 @@ static int __sched_setscheduler(struct task_struct *p,
__setscheduler_params(p, attr);
__setscheduler_prio(p, newprio);
}
@ -287,7 +287,7 @@ index 83e36547af17..8a541fe2d462 100644
__setscheduler_uclamp(p, attr);
if (queued) {
@@ -8033,6 +8061,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
@@ -8020,6 +8048,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
size < SCHED_ATTR_SIZE_VER1)
return -EINVAL;
@ -297,7 +297,7 @@ index 83e36547af17..8a541fe2d462 100644
/*
* XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
@@ -8270,6 +8301,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
@@ -8257,6 +8288,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
get_params(p, &kattr);
kattr.sched_flags &= SCHED_FLAG_ALL;
@ -306,7 +306,7 @@ index 83e36547af17..8a541fe2d462 100644
#ifdef CONFIG_UCLAMP_TASK
/*
* This could race with another potential updater, but this is fine
@@ -11214,6 +11247,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
@@ -11180,6 +11213,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
{
return sched_group_set_idle(css_tg(css), idle);
}
@ -332,7 +332,7 @@ index 83e36547af17..8a541fe2d462 100644
#endif
static struct cftype cpu_legacy_files[] = {
@@ -11228,6 +11280,11 @@ static struct cftype cpu_legacy_files[] = {
@@ -11194,6 +11246,11 @@ static struct cftype cpu_legacy_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
@ -344,7 +344,7 @@ index 83e36547af17..8a541fe2d462 100644
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -11467,6 +11524,12 @@ static struct cftype cpu_files[] = {
@@ -11411,6 +11468,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
@ -358,7 +358,7 @@ index 83e36547af17..8a541fe2d462 100644
#ifdef CONFIG_CFS_BANDWIDTH
{
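
The core.c changes above thread a const sched_attr through __setscheduler_params() and report the values back through get_params(). Assuming this EEVDF revision behaves like the upstream series it is based on, a SCHED_OTHER task can use sched_attr.sched_runtime as a slice request; a hedged sketch of that call from userspace (raw syscall, since glibc has no wrapper):

/* Sketch only: request a ~3 ms slice for the calling SCHED_OTHER task.
 * Assumes sched_attr.sched_runtime is honoured as a slice hint, as in the
 * upstream EEVDF series. */
#include <sched.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr {            /* UAPI layout, SCHED_ATTR_SIZE_VER0 */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_OTHER;
	attr.sched_runtime = 3000000;	/* nanoseconds */

	return syscall(SYS_sched_setattr, 0, &attr, 0) ? 1 : 0;
}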
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index aeeba46a096b..5c743bcb340d 100644
index 066ff1c8ae4e..e7e83181fbb6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -347,10 +347,7 @@ static __init int sched_init_debug(void)
@ -373,7 +373,7 @@ index aeeba46a096b..5c743bcb340d 100644
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
@@ -582,9 +579,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
@@ -581,9 +578,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
@ -388,7 +388,7 @@ index aeeba46a096b..5c743bcb340d 100644
(long long)(p->nvcsw + p->nivcsw),
p->prio);
@@ -627,10 +628,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
@@ -626,10 +627,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
@ -401,7 +401,7 @@ index aeeba46a096b..5c743bcb340d 100644
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -644,26 +644,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -643,26 +643,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_rq_lock_irqsave(rq, flags);
@ -441,7 +441,7 @@ index aeeba46a096b..5c743bcb340d 100644
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
@@ -864,10 +863,7 @@ static void sched_debug_header(struct seq_file *m)
@@ -863,10 +862,7 @@ static void sched_debug_header(struct seq_file *m)
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
@ -453,7 +453,7 @@ index aeeba46a096b..5c743bcb340d 100644
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#undef PN
@@ -1090,6 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
@@ -1089,6 +1085,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
#endif
P(policy);
P(prio);
@ -462,7 +462,7 @@ index aeeba46a096b..5c743bcb340d 100644
P(dl.runtime);
P(dl.deadline);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4039ff46fcb3..0fbb8fb24a50 100644
index 2c335df30171..461409c0eac7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
@ -594,7 +594,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
const struct sched_class fair_sched_class;
@@ -619,13 +569,200 @@ static inline bool entity_before(const struct sched_entity *a,
@@ -619,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a,
return (s64)(a->vruntime - b->vruntime) < 0;
}
@ -671,7 +671,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_slice += se->slice * weight;
+ cfs_rq->avg_load += weight;
+}
+
@ -682,7 +681,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_slice -= se->slice * weight;
+ cfs_rq->avg_load -= weight;
+}
+
@ -796,7 +794,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
u64 vruntime = cfs_rq->min_vruntime;
@@ -636,9 +773,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
@@ -636,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
@ -807,7 +805,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
if (!curr)
vruntime = se->vruntime;
else
@@ -647,7 +782,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
@@ -647,7 +780,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
/* ensure we never gain time by being placed backwards. */
u64_u32_store(cfs_rq->min_vruntime,
@ -816,7 +814,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -655,17 +790,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -655,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
@ -870,7 +868,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -678,14 +847,81 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -678,14 +845,81 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
return __node_2_se(left);
}
@ -927,8 +925,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
+ if (best->deadline == best->min_deadline)
+ break;
+ }
- return __node_2_se(next);
+
+ /*
+ * If the earliest deadline in this subtree is in the fully
+ * eligible left half of our space, go there.
@ -941,7 +938,8 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
+
+ node = node->rb_right;
+ }
+
- return __node_2_se(next);
+ if (!best || (curr && deadline_gt(deadline, best, curr)))
+ best = curr;
+
@ -957,7 +955,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
#ifdef CONFIG_SCHED_DEBUG
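
pick_eevdf() above walks the augmented rbtree for the entity with the earliest virtual deadline among those that are eligible, i.e. whose vruntime is not ahead of the load-weighted average V. A stand-alone sketch of that eligibility test (not the patch's exact code), using the same keyed-sum trick so no division by the total weight is needed:

/* Sketch of EEVDF eligibility: an entity may be picked only if it has not
 * yet received more than its fair share, i.e. v_i <= V where V is the
 * load-weighted average vruntime.  Keeping keys relative to min_vruntime
 * lets the test be a multiply instead of a division:
 *   (v_i - min_vruntime) * W  <=  sum_j w_j * (v_j - min_vruntime)      */
#include <stdbool.h>
#include <stdint.h>

struct cfs_avg {
	int64_t  avg_vruntime;   /* sum_j w_j * (v_j - min_vruntime) */
	uint64_t avg_load;       /* W = sum_j w_j */
	uint64_t min_vruntime;
};

static bool entity_eligible(const struct cfs_avg *cfs, uint64_t vruntime)
{
	int64_t key = (int64_t)(vruntime - cfs->min_vruntime);

	return key * (int64_t)cfs->avg_load <= cfs->avg_vruntime;
}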
@@ -707,104 +943,53 @@ int sched_update_scaling(void)
@@ -707,104 +941,53 @@ int sched_update_scaling(void)
{
unsigned int factor = get_update_sysctl_factor();
@ -1090,7 +1088,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
#include "pelt.h"
@@ -939,6 +1124,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
@@ -939,6 +1122,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
@ -1098,7 +1096,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
@@ -3393,16 +3579,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
@@ -3393,16 +3577,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
@ -1135,7 +1133,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
@@ -3412,9 +3618,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
@@ -3412,9 +3616,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
#endif
enqueue_load_avg(cfs_rq, se);
@ -1149,7 +1147,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
void reweight_task(struct task_struct *p, int prio)
@@ -4710,98 +4918,140 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
@@ -4710,158 +4916,123 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
@ -1167,94 +1165,42 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
-}
-
-static inline bool entity_is_long_sleeper(struct sched_entity *se)
+static inline bool
+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags)
+static void
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq;
- u64 sleep_time;
+ u64 now, vdelta;
+ s64 delta;
- if (se->exec_start == 0)
+ if (!(flags & ENQUEUE_WAKEUP))
return false;
- cfs_rq = cfs_rq_of(se);
-
- sleep_time = rq_clock_task(rq_of(cfs_rq));
+ if (flags & ENQUEUE_MIGRATED)
+ return true;
- /* Happen while migrating because of clock task divergence */
- if (sleep_time <= se->exec_start)
+ now = rq_clock_task(rq_of(cfs_rq));
+ delta = now - se->exec_start;
+ if (delta < 0)
return false;
- sleep_time -= se->exec_start;
- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
- return true;
+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load);
+ if (vdelta < vslice)
+ return false;
- return false;
+ return true;
}
static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 vruntime = cfs_rq->min_vruntime;
- if (se->exec_start == 0)
- return false;
-
- cfs_rq = cfs_rq_of(se);
+ u64 vslice = calc_delta_fair(se->slice, se);
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 lag = 0;
/*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
- sleep_time = rq_clock_task(rq_of(cfs_rq));
+ /*
+ * Due to how V is constructed as the weighted average of entities,
+ * adding tasks with positive lag, or removing tasks with negative lag
+ * will move 'time' backwards, this can screw around with the lag of
+ * other tasks.
+ *
+ * EEVDF: placement strategy #1 / #2
*/
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
+ */
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
+ struct sched_entity *curr = cfs_rq->curr;
+ unsigned long load;
- /* sleeps up to a single latency don't count. */
- if (!initial) {
- unsigned long thresh;
- /* Happen while migrating because of clock task divergence */
- if (sleep_time <= se->exec_start)
- return false;
+ lag = se->vlag;
- if (se_is_idle(se))
- thresh = sysctl_sched_min_granularity;
- else
- thresh = sysctl_sched_latency;
- sleep_time -= se->exec_start;
- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
- return true;
+ /*
+ * For latency sensitive tasks; those that have a shorter than
+ * average slice and do not fully consume the slice, transition
+ * to EEVDF placement strategy #2.
+ */
+ if (sched_feat(PLACE_FUDGE) &&
+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
+ entity_has_slept(cfs_rq, se, vslice, flags)) {
+ lag += vslice;
+ if (lag > 0)
+ lag = 0;
+ }
/*
- * Halve their sleep time's effect, to allow
- * for a gentler effect of sleepers:
+ * If we want to place a task and preserve lag, we have to
+ * consider the effect of the new entity on the weighted
+ * average and compensate for this, otherwise lag can quickly
@ -1305,7 +1251,52 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
+ * = W*vl_i
+ *
+ * vl_i = (W + w_i)*vl'_i / W
*/
+ */
+ load = cfs_rq->avg_load;
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
- return false;
-}
+ lag *= load + scale_load_down(se->load.weight);
+ if (WARN_ON_ONCE(!load))
+ load = 1;
+ lag = div_s64(lag, load);
+ }
-static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
-{
- u64 vruntime = cfs_rq->min_vruntime;
+ se->vruntime = vruntime - lag;
/*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
+ * When joining the competition; the existing tasks will be,
+ * on average, halfway through their slice, as such start tasks
+ * off with half a slice to ease into the competition.
*/
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
-
- /* sleeps up to a single latency don't count. */
- if (!initial) {
- unsigned long thresh;
-
- if (se_is_idle(se))
- thresh = sysctl_sched_min_granularity;
- else
- thresh = sysctl_sched_latency;
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
+ vslice /= 2;
- /*
- * Halve their sleep time's effect, to allow
- * for a gentler effect of sleepers:
- */
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
- thresh >>= 1;
-
@ -1335,26 +1326,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
- se->vruntime = vruntime;
- else
- se->vruntime = max_vruntime(se->vruntime, vruntime);
+ load = cfs_rq->avg_load;
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
+
+ lag *= load + scale_load_down(se->load.weight);
+ if (WARN_ON_ONCE(!load))
+ load = 1;
+ lag = div_s64(lag, load);
+ }
+
+ se->vruntime = vruntime - lag;
+
+ /*
+ * When joining the competition; the existing tasks will be,
+ * on average, halfway through their slice, as such start tasks
+ * off with half a slice to ease into the competition.
+ */
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
+ vslice /= 2;
+
+ /*
+ * EEVDF: vd_i = ve_i + r_i/w_i
+ */
@ -1362,7 +1333,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
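
The derivation that ends in vl_i = (W + w_i) * vl'_i / W is what lets lag survive a dequeue/enqueue cycle: the stored lag is inflated before the entity's own weight is folded back into the average, so the task wakes up with the lag it went to sleep with. A tiny stand-alone sketch of that compensation with made-up numbers (not the patch's code):

/* Sketch: scale a stored lag by (W + w_i) / W before placement, per the
 * comment above.  All numbers are invented for illustration. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t  vlag = -2000;  /* lag recorded when the entity was dequeued */
	uint64_t W    = 3072;   /* cfs_rq->avg_load: weight already queued   */
	uint64_t w_i  = 1024;   /* weight of the entity being placed         */

	/* vl_i = (W + w_i) * vl'_i / W */
	int64_t placement_lag = vlag * (int64_t)(W + w_i) / (int64_t)W;

	/* Placing at avg_vruntime - placement_lag leaves the entity with
	 * roughly vlag of lag once its own weight joins the average. */
	printf("stored lag %lld -> compensated lag %lld\n",
	       (long long)vlag, (long long)placement_lag);
	return 0;
}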
@@ -4809,60 +5059,20 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
static inline bool cfs_bandwidth_used(void);
@ -1425,7 +1395,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
/*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
@@ -4874,18 +5084,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -4873,18 +5044,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
se_update_runnable(se);
@ -1457,7 +1427,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -4907,17 +5127,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -4896,17 +5077,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
}
}
@ -1475,7 +1445,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
static void __clear_buddies_next(struct sched_entity *se)
{
for_each_sched_entity(se) {
@@ -4929,27 +5138,10 @@ static void __clear_buddies_next(struct sched_entity *se)
@@ -4918,27 +5088,10 @@ static void __clear_buddies_next(struct sched_entity *se)
}
}
@ -1503,7 +1473,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -4983,20 +5175,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -4972,20 +5125,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
clear_buddies(cfs_rq, se);
@ -1525,7 +1495,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
@@ -5015,52 +5199,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -5004,52 +5149,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_idle_cfs_rq_clock_pelt(cfs_rq);
}
@ -1578,7 +1548,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -5099,9 +5237,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5088,9 +5187,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
@ -1588,7 +1558,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
@@ -5112,50 +5247,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
@@ -5101,50 +5197,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
@ -1644,7 +1614,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -5172,8 +5271,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
@@ -5161,8 +5221,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
@ -1653,7 +1623,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
if (prev->on_rq) {
update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
@@ -5214,9 +5311,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
@@ -5203,9 +5261,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
@ -1663,7 +1633,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
@@ -6259,13 +6353,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
@@ -6228,13 +6283,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
@ -1678,7 +1648,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
s64 delta = slice - ran;
if (delta < 0) {
@@ -6289,8 +6382,7 @@ static void hrtick_update(struct rq *rq)
@@ -6258,8 +6312,7 @@ static void hrtick_update(struct rq *rq)
if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
return;
@ -1688,7 +1658,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
@@ -6331,17 +6423,6 @@ static int sched_idle_rq(struct rq *rq)
@@ -6300,17 +6353,6 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
@ -1706,7 +1676,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
@@ -7844,18 +7925,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
@@ -7816,18 +7858,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
struct sched_entity *se = &p->se;
@ -1725,7 +1695,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);
@@ -7893,66 +7962,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
@@ -7865,66 +7895,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
#endif /* CONFIG_SMP */
@ -1792,7 +1762,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
@@ -7964,12 +7973,6 @@ static void set_next_buddy(struct sched_entity *se)
@@ -7936,12 +7906,6 @@ static void set_next_buddy(struct sched_entity *se)
}
}
@ -1805,7 +1775,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
/*
* Preempt the current task with a newly woken task if needed:
*/
@@ -7978,7 +7981,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
@@ -7950,7 +7914,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
@ -1813,7 +1783,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
int next_buddy_marked = 0;
int cse_is_idle, pse_is_idle;
@@ -7994,7 +7996,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
@@ -7966,7 +7929,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
return;
@ -1822,7 +1792,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
set_next_buddy(pse);
next_buddy_marked = 1;
}
@@ -8039,35 +8041,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
@@ -8011,35 +7974,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (cse_is_idle != pse_is_idle)
return;
@ -1865,7 +1835,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
#ifdef CONFIG_SMP
@@ -8268,8 +8254,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
@@ -8240,8 +8187,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
/*
* sched_yield() is very simple
@ -1874,7 +1844,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
*/
static void yield_task_fair(struct rq *rq)
{
@@ -8285,21 +8269,19 @@ static void yield_task_fair(struct rq *rq)
@@ -8257,21 +8202,19 @@ static void yield_task_fair(struct rq *rq)
clear_buddies(cfs_rq, se);
@ -1908,7 +1878,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
@@ -8547,8 +8529,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
@@ -8514,8 +8457,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
* Buddy candidates are cache hot:
*/
if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
@ -1918,7 +1888,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
return 1;
if (sysctl_sched_migration_cost == -1)
@@ -12174,8 +12155,8 @@ static void rq_offline_fair(struct rq *rq)
@@ -12025,8 +11967,8 @@ static void rq_offline_fair(struct rq *rq)
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
{
@ -1928,7 +1898,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
return (rtime * min_nr_tasks > slice);
}
@@ -12331,8 +12312,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
@@ -12182,8 +12124,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
@ -1938,7 +1908,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
struct rq *rq = this_rq();
struct rq_flags rf;
@@ -12341,22 +12322,9 @@ static void task_fork_fair(struct task_struct *p)
@@ -12192,22 +12134,9 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
@ -1963,7 +1933,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
rq_unlock(rq, &rf);
}
@@ -12385,34 +12353,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
@@ -12236,34 +12165,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
check_preempt_curr(rq, p, 0);
}
@ -1998,7 +1968,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Propagate the changes of the sched_entity across the tg tree to make it
@@ -12483,16 +12423,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
@@ -12334,16 +12235,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
@ -2015,7 +1985,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
detach_entity_cfs_rq(se);
}
@@ -12500,12 +12430,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
@@ -12351,12 +12242,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
@ -2028,7 +1998,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
}
static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -12616,6 +12542,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
@@ -12467,6 +12354,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
goto err;
tg->shares = NICE_0_LOAD;
@ -2036,7 +2006,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
@@ -12714,6 +12641,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -12565,6 +12453,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
}
se->my_q = cfs_rq;
@ -2046,7 +2016,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
@@ -12844,6 +12774,29 @@ int sched_group_set_idle(struct task_group *tg, long idle)
@@ -12695,6 +12586,29 @@ int sched_group_set_idle(struct task_group *tg, long idle)
return 0;
}
@ -2076,7 +2046,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
@@ -12870,7 +12823,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
@@ -12721,7 +12635,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
@ -2086,10 +2056,10 @@ index 4039ff46fcb3..0fbb8fb24a50 100644
return rr_interval;
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd3..7d65b40299d9 100644
index ee7f23c76bd3..54334ca5c5c6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,16 +1,12 @@
@@ -1,16 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Only give sleepers 50% of their service deficit. This allows
@ -2106,12 +2076,11 @@ index ee7f23c76bd3..7d65b40299d9 100644
*/
-SCHED_FEAT(START_DEBIT, true)
+SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_FUDGE, true)
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
@@ -19,13 +15,6 @@ SCHED_FEAT(START_DEBIT, true)
@@ -19,13 +14,6 @@ SCHED_FEAT(START_DEBIT, true)
*/
SCHED_FEAT(NEXT_BUDDY, false)
@ -2125,7 +2094,7 @@ index ee7f23c76bd3..7d65b40299d9 100644
/*
* Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
@@ -98,6 +87,3 @@ SCHED_FEAT(UTIL_EST, true)
@@ -98,6 +86,3 @@ SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(UTIL_EST_FASTUP, true)
SCHED_FEAT(LATENCY_WARN, false)
@ -2133,7 +2102,7 @@ index ee7f23c76bd3..7d65b40299d9 100644
-SCHED_FEAT(ALT_PERIOD, true)
-SCHED_FEAT(BASE_SLICE, true)
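
The sleeper-credit features (GENTLE_FAIR_SLEEPERS, START_DEBIT, ALT_PERIOD, BASE_SLICE) go away and the EEVDF placement knobs (PLACE_LAG, PLACE_DEADLINE_INITIAL) take their place. With CONFIG_SCHED_DEBUG they stay runtime-toggleable through debugfs; a small sketch, assuming debugfs is mounted at /sys/kernel/debug:

/* Sketch: turn an EEVDF placement feature off at runtime by writing its
 * name with a "NO_" prefix to the sched features file. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/sched/features", "w");

	if (!f)
		return 1;               /* needs root and CONFIG_SCHED_DEBUG */
	fputs("NO_PLACE_DEADLINE_INITIAL", f);
	fclose(f);
	return 0;
}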
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9baeb1a2dfdd..4236c4c893aa 100644
index e93e006a942b..67cd7e1fd501 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -372,6 +372,8 @@ struct task_group {
@ -2154,18 +2123,17 @@ index 9baeb1a2dfdd..4236c4c893aa 100644
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
@@ -548,6 +552,10 @@ struct cfs_rq {
@@ -548,6 +552,9 @@ struct cfs_rq {
unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
+ s64 avg_vruntime;
+ u64 avg_slice;
+ u64 avg_load;
+
u64 exec_clock;
u64 min_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -567,8 +575,6 @@ struct cfs_rq {
@@ -567,8 +574,6 @@ struct cfs_rq {
*/
struct sched_entity *curr;
struct sched_entity *next;
@ -2174,7 +2142,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
@@ -2198,6 +2204,7 @@ extern const u32 sched_prio_to_wmult[40];
@@ -2195,6 +2200,7 @@ extern const u32 sched_prio_to_wmult[40];
#else
#define ENQUEUE_MIGRATED 0x00
#endif
@ -2182,7 +2150,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644
#define RETRY_TASK ((void *)-1UL)
@@ -2502,11 +2509,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
@@ -2499,11 +2505,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
@ -2196,7 +2164,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
@@ -2519,6 +2524,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
@@ -2516,6 +2520,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
#endif
@ -2205,7 +2173,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -3483,4 +3490,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
@@ -3480,4 +3486,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif


@ -1,49 +1,76 @@
From e6e251fb3f3927c18ac4f2a22a43c6c198133d19 Mon Sep 17 00:00:00 2001
From: Piotr Gorski <lucjan.lucjanov@gmail.com>
Date: Sun, 23 Jul 2023 09:46:42 +0200
From 377657f92d256b364813e3f8b2a58edfc9833815 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 30 Jul 2023 09:43:51 +0200
Subject: [PATCH] bore-eevdf
Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
include/linux/sched.h | 10 ++
include/linux/sched.h | 30 ++++++
init/Kconfig | 20 ++++
kernel/sched/core.c | 117 +++++++++++++++++++++++
kernel/sched/core.c | 118 +++++++++++++++++++++
kernel/sched/debug.c | 4 +
kernel/sched/fair.c | 203 ++++++++++++++++++++++++++++++++++++++--
kernel/sched/fair.c | 228 ++++++++++++++++++++++++++++++++++++++--
kernel/sched/features.h | 4 +
kernel/sched/sched.h | 1 +
7 files changed, 351 insertions(+), 8 deletions(-)
7 files changed, 397 insertions(+), 8 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e99a9aa6a..14a1ce058 100644
index c940c4dc8304..8663c0813f81 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -559,6 +559,12 @@ struct sched_entity {
@@ -545,6 +545,26 @@ struct sched_statistics {
#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;
+#ifdef CONFIG_SCHED_BORE
+union union16 {
+ u16 u16;
+ s16 s16;
+ u8 u8[2];
+ s8 s8[2];
+};
+typedef union union16 x16;
+
+union union32 {
+ u32 u32;
+ s32 s32;
+ u16 u16[2];
+ s16 s16[2];
+ u8 u8[4];
+ s8 s8[4];
+};
+typedef union union32 x32;
+#endif // CONFIG_SCHED_BORE
+
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
@@ -559,6 +579,12 @@ struct sched_entity {
u64 sum_exec_runtime;
u64 prev_sum_exec_runtime;
u64 vruntime;
+#ifdef CONFIG_SCHED_BORE
+ u64 prev_burst_time;
+ u64 burst_time;
+ u64 max_burst_time;
+ u8 penalty_score;
+ u16 prev_burst_penalty;
+ u16 curr_burst_penalty;
+ u16 burst_penalty;
+#endif // CONFIG_SCHED_BORE
s64 vlag;
u64 slice;
@@ -990,6 +996,10 @@ struct task_struct {
@@ -990,6 +1016,10 @@ struct task_struct {
struct list_head children;
struct list_head sibling;
struct task_struct *group_leader;
+#ifdef CONFIG_SCHED_BORE
+ u64 child_burst_cache;
+ u16 child_burst_cache;
+ u64 child_burst_last_cached;
+#endif // CONFIG_SCHED_BORE
/*
* 'ptraced' is the list of tasks this task is using ptrace() on.
diff --git a/init/Kconfig b/init/Kconfig
index 71755cc8e..c697be79e 100644
index 71755cc8ed3e..c697be79e594 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1277,6 +1277,26 @@ config CHECKPOINT_RESTORE
@ -74,30 +101,31 @@ index 71755cc8e..c697be79e 100644
bool "Automatic process group scheduling"
select CGROUPS
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a541fe2d..13969a3a3 100644
index aff81e12460e..839605620f63 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4491,6 +4491,112 @@ int wake_up_state(struct task_struct *p, unsigned int state)
@@ -4491,6 +4491,113 @@ int wake_up_state(struct task_struct *p, unsigned int state)
return try_to_wake_up(p, state, 0);
}
+#ifdef CONFIG_SCHED_BORE
+#define CHILD_BURST_CUTOFF_BITS 9
+extern unsigned int sched_burst_cache_lifetime;
+extern unsigned int sched_burst_fork_atavistic;
+
+void __init sched_init_bore(void) {
+ init_task.child_burst_cache = 0;
+ init_task.child_burst_last_cached = 0;
+ init_task.se.prev_burst_time = 0;
+ init_task.se.burst_time = 0;
+ init_task.se.max_burst_time = 0;
+ init_task.se.prev_burst_penalty = 0;
+ init_task.se.curr_burst_penalty = 0;
+ init_task.se.burst_penalty = 0;
+}
+
+void inline sched_fork_bore(struct task_struct *p) {
+ p->child_burst_cache = 0;
+ p->child_burst_last_cached = 0;
+ p->se.burst_time = 0;
+ p->se.curr_burst_penalty = 0;
+}
+
+static u32 count_child_tasks(struct task_struct *p) {
@ -112,31 +140,31 @@ index 8a541fe2d..13969a3a3 100644
+}
+
+static void __update_child_burst_cache(
+ struct task_struct *p, u32 cnt, u64 sum, u64 now) {
+ u64 avg = 0;
+ if (cnt) avg = div_u64(sum, cnt) << CHILD_BURST_CUTOFF_BITS;
+ p->child_burst_cache = max(avg, p->se.max_burst_time);
+ struct task_struct *p, u32 cnt, u32 sum, u64 now) {
+ u16 avg = 0;
+ if (cnt) avg = DIV_ROUND_CLOSEST(sum, cnt);
+ p->child_burst_cache = max(avg, p->se.burst_penalty);
+ p->child_burst_last_cached = now;
+}
+
+static void update_child_burst_cache(struct task_struct *p, u64 now) {
+ struct task_struct *child;
+ u32 cnt = 0;
+ u64 sum = 0;
+ u32 sum = 0;
+
+ list_for_each_entry(child, &p->children, sibling) {
+ cnt++;
+ sum += child->se.max_burst_time >> CHILD_BURST_CUTOFF_BITS;
+ sum += child->se.burst_penalty;
+ }
+
+ __update_child_burst_cache(p, cnt, sum, now);
+}
+
+static void update_child_burst_cache_atavistic(
+ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u64 *asum) {
+ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) {
+ struct task_struct *child, *dec;
+ u32 cnt = 0, dcnt = 0;
+ u64 sum = 0;
+ u32 sum = 0;
+
+ list_for_each_entry(child, &p->children, sibling) {
+ dec = child;
@ -145,13 +173,13 @@ index 8a541fe2d..13969a3a3 100644
+
+ if (!dcnt || !depth) {
+ cnt++;
+ sum += dec->se.max_burst_time >> CHILD_BURST_CUTOFF_BITS;
+ sum += dec->se.burst_penalty;
+ } else {
+ if (child_burst_cache_expired(dec, now))
+ update_child_burst_cache_atavistic(dec, now, depth - 1, &cnt, &sum);
+ else {
+ cnt += dcnt;
+ sum += (dec->child_burst_cache >> CHILD_BURST_CUTOFF_BITS) * dcnt;
+ sum += (dec->child_burst_cache) * dcnt;
+ }
+ }
+ }
@ -161,12 +189,12 @@ index 8a541fe2d..13969a3a3 100644
+ *asum += sum;
+}
+
+static void update_task_initial_burst_time(struct task_struct *p) {
+static void fork_burst_penalty(struct task_struct *p) {
+ struct sched_entity *se = &p->se;
+ struct task_struct *anc = p->real_parent;
+ u64 now = ktime_get_ns();
+ u32 cnt = 0;
+ u64 sum = 0;
+ u32 sum = 0;
+
+ read_lock(&tasklist_lock);
+
@ -182,15 +210,15 @@ index 8a541fe2d..13969a3a3 100644
+
+ read_unlock(&tasklist_lock);
+
+ se->max_burst_time = se->prev_burst_time =
+ max(se->prev_burst_time, anc->child_burst_cache);
+ se->burst_penalty = se->prev_burst_penalty =
+ max(se->prev_burst_penalty, anc->child_burst_cache);
+}
+#endif // CONFIG_SCHED_BORE
+
/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
@@ -4507,6 +4613,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
@@ -4507,6 +4614,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
@ -200,30 +228,30 @@ index 8a541fe2d..13969a3a3 100644
p->se.vlag = 0;
INIT_LIST_HEAD(&p->se.group_node);
@@ -4828,6 +4937,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
@@ -4828,6 +4938,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
void sched_post_fork(struct task_struct *p)
{
+#ifdef CONFIG_SCHED_BORE
+ update_task_initial_burst_time(p);
+ fork_burst_penalty(p);
+#endif // CONFIG_SCHED_BORE
uclamp_post_fork(p);
}
@@ -9967,6 +10079,11 @@ void __init sched_init(void)
@@ -9954,6 +10067,11 @@ void __init sched_init(void)
BUG_ON(&dl_sched_class != &stop_sched_class + 1);
#endif
+#ifdef CONFIG_SCHED_BORE
+ sched_init_bore();
+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 2.5.3 by Masahito Suzuki");
+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 3.0 Beta2 by Masahito Suzuki");
+#endif // CONFIG_SCHED_BORE
+
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5c743bcb3..755ef4c8d 100644
index e7e83181fbb6..ff41a524c1ee 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -348,6 +348,7 @@ static __init int sched_init_debug(void)
@ -234,18 +262,18 @@ index 5c743bcb3..755ef4c8d 100644
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
@@ -595,6 +596,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
@@ -594,6 +595,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
+#ifdef CONFIG_SCHED_BORE
+ SEQ_printf(m, " %2d", p->se.penalty_score);
+ SEQ_printf(m, " %2d", ((x16*)&p->se.burst_penalty)->u8[1]);
+#endif
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d6042543c..e52c14232 100644
index 461409c0eac7..90ce27fb0a3f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -19,6 +19,9 @@
@ -281,7 +309,7 @@ index d6042543c..e52c14232 100644
/*
* After fork, child runs first. If set to 0 (default) then
@@ -84,8 +87,76 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
@@ -84,8 +87,93 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
*/
unsigned int sysctl_sched_child_runs_first __read_mostly;
@ -292,61 +320,78 @@ index d6042543c..e52c14232 100644
+ * and reduces their over-scheduling. Synchronous workloads will still
+ * have immediate wakeup/sleep latencies.
+ *
+ * (default: 3.2 msec * 1, units: nanoseconds)
+ * (default: 1.6 msec * 1, units: nanoseconds)
+ */
+unsigned int sysctl_sched_wakeup_granularity = 3200000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 3200000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1600000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1600000UL;
+
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+#ifdef CONFIG_SCHED_BORE
+unsigned int __read_mostly sched_bore = 1;
+unsigned int __read_mostly sched_burst_cache_lifetime = 60000000;
+unsigned int __read_mostly sched_burst_penalty_offset = 12;
+unsigned int __read_mostly sched_burst_penalty_scale = 1292;
+unsigned int __read_mostly sched_burst_smoothness = 2;
+unsigned int __read_mostly sched_burst_fork_atavistic = 2;
+unsigned int __read_mostly sched_bore = 1;
+unsigned int __read_mostly sched_burst_cache_lifetime = 60000000;
+unsigned int __read_mostly sched_burst_penalty_offset = 18;
+unsigned int __read_mostly sched_burst_penalty_scale = 1292;
+unsigned int __read_mostly sched_burst_smoothness_up = 1;
+unsigned int __read_mostly sched_burst_smoothness_down = 0;
+unsigned int __read_mostly sched_burst_fork_atavistic = 2;
+static int three = 3;
+static int sixty_four = 64;
+static int maxval_12_bits = 4095;
+
+#define FIXED_SHIFT 10
+#define FIXED_ONE (1 << FIXED_SHIFT)
+typedef u32 fixed;
+#define MAX_BURST_PENALTY ((u32)(40UL << 8) - 1)
+
+static void update_burst_score(struct sched_entity *se) {
+ u64 burst_time = se->max_burst_time;
+static inline u32 log2plus1_u64_u32f8(u64 v) {
+ x32 result;
+ int msb = fls64(v);
+ result.u8[0] = v << (64 - msb) >> 55;
+ result.u8[1] = msb;
+ return result.u32;
+}
+
+ int msb = fls64(burst_time);
+ fixed integer_part = msb << FIXED_SHIFT;
+ fixed fractional_part = burst_time << (64 - msb) << 1 >> (64 - FIXED_SHIFT);
+ fixed greed = integer_part | fractional_part;
+static inline u32 u8h_u32(u8 v) {
+ x32 result;
+ result.u8[1] = v;
+ return result.u32;
+}
+
+ fixed tolerance = sched_burst_penalty_offset << FIXED_SHIFT;
+ fixed penalty = max(0, (s32)greed - (s32)tolerance);
+ fixed scaled_penalty = penalty * sched_burst_penalty_scale >> 10;
+static inline u32 calc_burst_penalty(struct sched_entity *se) {
+ u32 greed, tolerance, penalty, scaled_penalty;
+
+ greed = log2plus1_u64_u32f8(se->burst_time);
+ tolerance = u8h_u32(sched_burst_penalty_offset);
+ penalty = max(0, (s32)greed - (s32)tolerance);
+ scaled_penalty = penalty * sched_burst_penalty_scale >> 10;
+
+ u8 score = min(39U, scaled_penalty >> FIXED_SHIFT);
+ se->penalty_score = score;
+ return min(MAX_BURST_PENALTY, scaled_penalty);
+}
+
+static void update_burst_penalty(struct sched_entity *se) {
+ se->curr_burst_penalty = calc_burst_penalty(se);
+ se->burst_penalty = max(se->prev_burst_penalty, se->curr_burst_penalty);
+}
+
+static inline u64 penalty_scale(u64 delta, struct sched_entity *se) {
+ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->penalty_score], 22);
+ u8 score = ((x16*)&se->burst_penalty)->u8[1];
+ return mul_u64_u32_shr(delta, sched_prio_to_wmult[score], 22);
+}
+
+static inline u64 __binary_smooth(u64 new, u64 old, unsigned int smoothness) {
+ return (new <= old)? new: (new + old * ((1 << smoothness) - 1)) >> smoothness;
+static inline u32 binary_smooth(u32 new, u32 old) {
+ return (new >= old)?
+ old + ((new - old) >> sched_burst_smoothness_up):
+ old - ((old - new) >> sched_burst_smoothness_down);
+}
+
+void restart_burst(struct sched_entity *se) {
+ se->max_burst_time = se->prev_burst_time = __binary_smooth(
+ se->burst_time, se->prev_burst_time, sched_burst_smoothness);
+static void restart_burst(struct sched_entity *se) {
+ se->burst_penalty = se->prev_burst_penalty =
+ binary_smooth(se->curr_burst_penalty, se->prev_burst_penalty);
+ se->curr_burst_penalty = 0;
+ se->burst_time = 0;
+}
+
+#define calc_delta_fair(delta, se) __calc_delta_fair(delta, se, true)
+#define calc_delta_fair_unscaled(delta, se) __calc_delta_fair(delta, se, false)
+static inline u64
+static inline u64
+__calc_delta_fair(u64 delta, struct sched_entity *se, bool bscale);
+
+static s64 wakeup_preempt_backstep_delta(u64 rtime, struct sched_entity *se) {
@ -358,7 +403,7 @@ index d6042543c..e52c14232 100644
int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
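
The functions above are the heart of BORE 3.0: the burst time a task accumulates while it keeps the CPU is converted to an 8.8 fixed-point log2, the sched_burst_penalty_offset is subtracted, the result is scaled by sched_burst_penalty_scale, and the top byte becomes a 0-39 score that indexes sched_prio_to_wmult[] to stretch the task's vruntime. A stand-alone sketch of that arithmetic (userspace C, with the default tunables from the hunk above):

/* Sketch of the BORE burst-penalty math; not kernel code.
 * Defaults assumed: offset = 18 (whole log2 units), scale = 1292. */
#include <stdint.h>
#include <stdio.h>

static uint32_t log2plus1_u64_u32f8(uint64_t v)      /* 8.8 fixed point */
{
	if (!v)
		return 0;
	int msb = 64 - __builtin_clzll(v);            /* like fls64() */
	uint8_t frac = (uint8_t)(v << (64 - msb) >> 55);
	return ((uint32_t)msb << 8) | frac;
}

int main(void)
{
	uint64_t burst_ns = 100ull * 1000 * 1000;     /* 100 ms of burst time */
	uint32_t offset = 18, scale = 1292;
	uint32_t max_penalty = (40u << 8) - 1;        /* MAX_BURST_PENALTY */

	uint32_t greed = log2plus1_u64_u32f8(burst_ns);
	int32_t penalty = (int32_t)greed - (int32_t)(offset << 8);
	if (penalty < 0)
		penalty = 0;
	uint32_t scaled = ((uint32_t)penalty * scale) >> 10;
	if (scaled > max_penalty)
		scaled = max_penalty;

	/* The top byte of the 8.8 value is the 0..39 score that selects the
	 * sched_prio_to_wmult[] multiplier used to inflate vruntime. */
	printf("burst %llu ns -> score %u\n",
	       (unsigned long long)burst_ns, scaled >> 8);
	return 0;
}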
@@ -145,6 +216,60 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
@@ -145,6 +233,69 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#ifdef CONFIG_SYSCTL
static struct ctl_table sched_fair_sysctls[] = {
@ -407,8 +452,17 @@ index d6042543c..e52c14232 100644
+ .extra2 = &maxval_12_bits,
+ },
+ {
+ .procname = "sched_burst_smoothness",
+ .data = &sched_burst_smoothness,
+ .procname = "sched_burst_smoothness_down",
+ .data = &sched_burst_smoothness_down,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &three,
+ },
+ {
+ .procname = "sched_burst_smoothness_up",
+ .data = &sched_burst_smoothness_up,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
@ -419,7 +473,7 @@ index d6042543c..e52c14232 100644
{
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
@@ -238,6 +363,7 @@ static void update_sysctl(void)
@@ -238,6 +389,7 @@ static void update_sysctl(void)
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
SET_SYSCTL(sched_base_slice);
@ -427,12 +481,12 @@ index d6042543c..e52c14232 100644
#undef SET_SYSCTL
}
@@ -308,11 +434,19 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
@@ -308,11 +460,19 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
/*
* delta /= w
*/
+#ifdef CONFIG_SCHED_BORE
+static inline u64
+static inline u64
+__calc_delta_fair(u64 delta, struct sched_entity *se, bool bscale)
+#else // CONFIG_SCHED_BORE
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
@ -447,7 +501,7 @@ index d6042543c..e52c14232 100644
return delta;
}
@@ -708,7 +842,11 @@ void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -706,7 +866,11 @@ void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
SCHED_WARN_ON(!se->on_rq);
lag = avg_vruntime(cfs_rq) - se->vruntime;
@ -459,7 +513,7 @@ index d6042543c..e52c14232 100644
se->vlag = clamp(lag, -limit, limit);
}
@@ -946,6 +1084,7 @@ int sched_update_scaling(void)
@@ -944,6 +1108,7 @@ int sched_update_scaling(void)
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
WRT_SYSCTL(sched_base_slice);
@ -467,19 +521,18 @@ index d6042543c..e52c14232 100644
#undef WRT_SYSCTL
return 0;
@@ -1123,6 +1262,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
@@ -1121,6 +1286,10 @@ static void update_curr(struct cfs_rq *cfs_rq)
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq->exec_clock, delta_exec);
+#ifdef CONFIG_SCHED_BORE
+ curr->burst_time += delta_exec;
+ curr->max_burst_time = max(curr->max_burst_time, curr->burst_time);
+ update_burst_score(curr);
+ update_burst_penalty(curr);
+#endif // CONFIG_SCHED_BORE
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
@@ -5237,6 +5381,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5187,6 +5356,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
@ -489,7 +542,7 @@ index d6042543c..e52c14232 100644
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
@@ -5247,14 +5394,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5197,14 +5369,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
@ -499,7 +552,7 @@ index d6042543c..e52c14232 100644
*/
if (sched_feat(NEXT_BUDDY) &&
- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next) &&
+ wakeup_preempt_entity(cfs_rq->next, candidate) < 1)
return cfs_rq->next;
@ -508,7 +561,7 @@ index d6042543c..e52c14232 100644
}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -6522,6 +6671,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
@@ -6452,6 +6626,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
hrtick_update(rq);
}
@ -547,7 +600,7 @@ index d6042543c..e52c14232 100644
static void set_next_buddy(struct sched_entity *se);
/*
@@ -6540,6 +6721,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
@@ -6470,6 +6676,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
util_est_dequeue(&rq->cfs, p);
for_each_sched_entity(se) {
@ -557,7 +610,7 @@ index d6042543c..e52c14232 100644
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
@@ -8047,7 +8231,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
@@ -7980,7 +8189,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
/*
* XXX pick_eevdf(cfs_rq) != se ?
*/
@ -566,7 +619,7 @@ index d6042543c..e52c14232 100644
goto preempt;
return;
@@ -8260,6 +8444,9 @@ static void yield_task_fair(struct rq *rq)
@@ -8193,6 +8402,9 @@ static void yield_task_fair(struct rq *rq)
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se;
@ -577,10 +630,10 @@ index d6042543c..e52c14232 100644
/*
* Are we the only task in the tree?
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7d65b4029..bd274f7c7 100644
index 54334ca5c5c6..416ec4bcdb0f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -13,7 +13,11 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
@@ -12,7 +12,11 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
* wakeup-preemption), since its likely going to consume data we
* touched, increases cache locality.
*/
@ -593,10 +646,10 @@ index 7d65b4029..bd274f7c7 100644
/*
* Consider buddies to be cache hot, decreases the likeliness of a
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4236c4c89..714cc6ad9 100644
index 67cd7e1fd501..04d065015d6c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2510,6 +2510,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate;
@@ -2506,6 +2506,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_base_slice;
@ -605,4 +658,4 @@ index 4236c4c89..714cc6ad9 100644
#ifdef CONFIG_SCHED_DEBUG
extern int sysctl_resched_latency_warn_ms;
--
2.41.0.159.g0bfa463d37
2.41.0

patches/0006-AMD-cppc.patch Normal file

@ -0,0 +1,573 @@
From ab6268d199fa749e274a48b00c443538ae492b16 Mon Sep 17 00:00:00 2001
From: Piotr Gorski <lucjan.lucjanov@gmail.com>
Date: Wed, 9 Aug 2023 14:07:31 +0200
Subject: [PATCH] amd-6.5: merge changes from dev tree
Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
---
.../admin-guide/kernel-parameters.txt | 5 +
Documentation/admin-guide/pm/amd-pstate.rst | 55 +++++
drivers/acpi/cppc_acpi.c | 13 ++
drivers/acpi/processor_driver.c | 6 +
drivers/cpufreq/amd-pstate.c | 191 ++++++++++++++++--
drivers/cpufreq/cpufreq.c | 13 ++
include/acpi/cppc_acpi.h | 5 +
include/linux/amd-pstate.h | 1 +
include/linux/cpufreq.h | 4 +
9 files changed, 272 insertions(+), 21 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a1457995f..1f53c395a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -363,6 +363,11 @@
selects a performance level in this range and appropriate
to the current workload.
+ amd_prefcore=
+ [X86]
+ enable
+ Enable AMD Pstate Preferred Core.
+
amijoy.map= [HW,JOY] Amiga joystick support
Map of devices attached to JOY0DAT and JOY1DAT
Format: <a>,<b>
diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
index 1cf40f692..4a30cf235 100644
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -353,6 +353,49 @@ is activated. In this mode, driver requests minimum and maximum performance
level and the platform autonomously selects a performance level in this range
and appropriate to the current workload.
+AMD Pstate Preferred Core
+=================================
+
+The core frequency is subject to process variation in semiconductors.
+Not all cores can reach the maximum frequency within the infrastructure
+limits. Consequently, AMD has redefined the concept of the maximum
+frequency of a part: only a fraction of cores can reach it. To find the
+best process scheduling policy for a given scenario, the OS needs to know
+the core ordering, which the platform reports through the highest
+performance capability register of the CPPC interface.
+
+``AMD Pstate Preferred Core`` uses the ITMT architecture, which provides the
+functions and data structures that let the scheduler favor cores that can
+reach a higher frequency at lower voltage. It can also dynamically change
+the preferred core based on workload and platform conditions, accounting
+for thermals and aging.
+
+The priority metric will be initialized by the AMD Pstate driver. The AMD Pstate
+driver will also determine whether or not ``AMD Pstate Preferred Core`` is
+supported by the platform.
+
+The AMD Pstate driver provides an initial core ordering when the system
+boots. The platform uses the CPPC interfaces to communicate the core
+ranking to the operating system and scheduler, so that the OS schedules
+processes on the highest-performance cores first. When the AMD Pstate
+driver receives a notification that the highest performance has changed,
+it updates the core ranking and the CPU's priority.
+
+AMD Preferred Core Switch
+=================================
+Kernel Parameters
+-----------------
+
+``AMD Pstate Preferred Core`` has two states: enabled and disabled.
+The state can be selected with a kernel parameter.
+``AMD Pstate Preferred Core`` is disabled by default.
+
+``amd_prefcore=enable``
+
+If ``amd_prefcore=enable`` is passed on the kernel command line,
+``AMD Pstate Preferred Core`` is enabled, provided the processor and power
+firmware support the preferred core feature.
+
User Space Interface in ``sysfs`` - General
===========================================
@@ -385,6 +428,18 @@ control its functionality at the system level. They are located in the
to the operation mode represented by that string - or to be
unregistered in the "disable" case.
+``prefcore_state``
+ Preferred Core state of the driver: "enabled" or "disabled".
+
+ "enabled"
+ Enable the AMD Preferred Core.
+
+ "disabled"
+ Disable the AMD Preferred Core.
+
+
+ This attribute is read-only and reports the current state of Preferred Core.
+
``cpupower`` tool support for ``amd-pstate``
===============================================
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 7ff269a78..ad388a0e8 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -1154,6 +1154,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf)
return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf);
}
+/**
+ * cppc_get_highest_perf - Get the highest performance register value.
+ * @cpunum: CPU from which to get highest performance.
+ * @highest_perf: Return address.
+ *
+ * Return: 0 for success, -EIO otherwise.
+ */
+int cppc_get_highest_perf(int cpunum, u64 *highest_perf)
+{
+ return cppc_get_perf(cpunum, HIGHEST_PERF, highest_perf);
+}
+EXPORT_SYMBOL_GPL(cppc_get_highest_perf);
+
/**
* cppc_get_epp_perf - Get the epp register value.
* @cpunum: CPU from which to get epp preference value.
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index 4bd16b3f0..29b2fb68a 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -27,6 +27,7 @@
#define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80
#define ACPI_PROCESSOR_NOTIFY_POWER 0x81
#define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82
+#define ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED 0x85
MODULE_AUTHOR("Paul Diefenbaugh");
MODULE_DESCRIPTION("ACPI Processor Driver");
@@ -83,6 +84,11 @@ static void acpi_processor_notify(acpi_handle handle, u32 event, void *data)
acpi_bus_generate_netlink_event(device->pnp.device_class,
dev_name(&device->dev), event, 0);
break;
+ case ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED:
+ cpufreq_update_highest_perf(pr->id);
+ acpi_bus_generate_netlink_event(device->pnp.device_class,
+ dev_name(&device->dev), event, 0);
+ break;
default:
acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event);
break;
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 81fba0dcb..ba10aa971 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -37,6 +37,7 @@
#include <linux/uaccess.h>
#include <linux/static_call.h>
#include <linux/amd-pstate.h>
+#include <linux/topology.h>
#include <acpi/processor.h>
#include <acpi/cppc_acpi.h>
@@ -49,6 +50,8 @@
#define AMD_PSTATE_TRANSITION_LATENCY 20000
#define AMD_PSTATE_TRANSITION_DELAY 1000
+#define AMD_PSTATE_PREFCORE_THRESHOLD 166
+#define AMD_PSTATE_MAX_CPPC_PERF 255
/*
* TODO: We need more time to fine tune processors with shared memory solution
@@ -65,6 +68,14 @@ static struct cpufreq_driver amd_pstate_epp_driver;
static int cppc_state = AMD_PSTATE_UNDEFINED;
static bool cppc_enabled;
+/*
+ * CPPC Preferred Core feature is supported by power firmware
+ */
+static bool prefcore_enabled = false;
+
+/* Disable AMD Pstate Preferred Core loading */
+static bool no_prefcore __read_mostly = true;
+
/*
* AMD Energy Preference Performance (EPP)
* The EPP is used in the CCLK DPM controller to drive
@@ -290,27 +301,26 @@ static inline int amd_pstate_enable(bool enable)
static int pstate_init_perf(struct amd_cpudata *cpudata)
{
u64 cap1;
- u32 highest_perf;
int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
&cap1);
if (ret)
return ret;
- /*
- * TODO: Introduce AMD specific power feature.
- *
- * CPPC entry doesn't indicate the highest performance in some ASICs.
+ /* For platforms that do not support the preferred core feature, the
+ * highest_perf may be configured as 166 or 255. To avoid the max frequency
+ * being calculated wrongly, we take the AMD_CPPC_HIGHEST_PERF(cap1) value
+ * as the default max perf.
*/
- highest_perf = amd_get_highest_perf();
- if (highest_perf > AMD_CPPC_HIGHEST_PERF(cap1))
- highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
-
- WRITE_ONCE(cpudata->highest_perf, highest_perf);
+ if (!prefcore_enabled)
+ WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
+ else
+ WRITE_ONCE(cpudata->highest_perf, AMD_PSTATE_PREFCORE_THRESHOLD);
WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1));
+ WRITE_ONCE(cpudata->prefcore_highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
return 0;
}
@@ -318,22 +328,21 @@ static int pstate_init_perf(struct amd_cpudata *cpudata)
static int cppc_init_perf(struct amd_cpudata *cpudata)
{
struct cppc_perf_caps cppc_perf;
- u32 highest_perf;
int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
if (ret)
return ret;
- highest_perf = amd_get_highest_perf();
- if (highest_perf > cppc_perf.highest_perf)
- highest_perf = cppc_perf.highest_perf;
-
- WRITE_ONCE(cpudata->highest_perf, highest_perf);
+ if (!prefcore_enabled)
+ WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf);
+ else
+ WRITE_ONCE(cpudata->highest_perf, AMD_PSTATE_PREFCORE_THRESHOLD);
WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
WRITE_ONCE(cpudata->lowest_nonlinear_perf,
cppc_perf.lowest_nonlinear_perf);
WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf);
+ WRITE_ONCE(cpudata->prefcore_highest_perf, cppc_perf.highest_perf);
if (cppc_state == AMD_PSTATE_ACTIVE)
return 0;
@@ -676,6 +685,118 @@ static void amd_perf_ctl_reset(unsigned int cpu)
wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0);
}
+/*
+ * Enabling AMD Pstate Preferred Core can't be done directly from cpufreq callbacks
+ * due to locking, so queue the work for later.
+ */
+static void amd_pstste_sched_prefcore_workfn(struct work_struct *work)
+{
+ sched_set_itmt_support();
+}
+static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn);
+
+/**
+ * amd_pstate_get_highest_perf - Get the highest performance register value.
+ * @cpu: CPU from which to get highest performance.
+ * @highest_perf: Return address.
+ *
+ * Return: 0 for success, -EIO otherwise.
+ */
+static int amd_pstate_get_highest_perf(int cpu, u64 *highest_perf)
+{
+ int ret;
+
+ if (boot_cpu_has(X86_FEATURE_CPPC)) {
+ u64 cap1;
+
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1);
+ if (ret)
+ return ret;
+ WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
+ } else {
+ ret = cppc_get_highest_perf(cpu, highest_perf);
+ }
+
+ return (ret);
+}
+
+static void amd_pstate_init_prefcore(void)
+{
+ int cpu, ret;
+ u64 highest_perf;
+
+ if (no_prefcore)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ ret = amd_pstate_get_highest_perf(cpu, &highest_perf);
+ if (ret)
+ break;
+
+ sched_set_itmt_core_prio(highest_perf, cpu);
+ }
+
+ /*
+ * This code can be run during CPU online under the
+ * CPU hotplug locks, so sched_set_itmt_support()
+ * cannot be called from here. Queue up a work item
+ * to invoke it.
+ */
+ schedule_work(&sched_prefcore_work);
+}
+
+static void amd_pstate_update_highest_perf(unsigned int cpu)
+{
+ struct cpufreq_policy *policy;
+ struct amd_cpudata *cpudata;
+ u32 prev_high = 0, cur_high = 0;
+ u64 highest_perf;
+ int ret;
+
+ if (!prefcore_enabled)
+ return;
+
+ ret = amd_pstate_get_highest_perf(cpu, &highest_perf);
+ if (ret)
+ return;
+
+ policy = cpufreq_cpu_get(cpu);
+ cpudata = policy->driver_data;
+ cur_high = highest_perf;
+ prev_high = READ_ONCE(cpudata->prefcore_highest_perf);
+
+ if (prev_high != cur_high) {
+ WRITE_ONCE(cpudata->prefcore_highest_perf, cur_high);
+ sched_set_itmt_core_prio(cur_high, cpu);
+ }
+
+ cpufreq_cpu_put(policy);
+}
+
+/*
+ * Check whether the AMD Pstate Preferred Core feature is supported and enabled.
+ * 1) no_prefcore lets the user enable or disable loading of AMD Pstate
+ * Preferred Core; unless overridden on the command line, AMD Pstate
+ * Preferred Core stays disabled even if the processor and power firmware
+ * support the preferred core feature.
+ * 2) prefcore_enabled indicates whether the CPPC preferred core is enabled.
+ */
+static void check_prefcore_supported(int cpu)
+{
+ u64 highest_perf;
+ int ret;
+
+ if (no_prefcore)
+ return;
+
+ ret = amd_pstate_get_highest_perf(cpu, &highest_perf);
+ if (ret)
+ return;
+
+ if (highest_perf < AMD_PSTATE_MAX_CPPC_PERF)
+ prefcore_enabled = true;
+}
+
static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
{
int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret;
@@ -697,6 +818,9 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
cpudata->cpu = policy->cpu;
+ /* check if CPPC preferred core feature is enabled */
+ check_prefcore_supported(policy->cpu);
+
ret = amd_pstate_init_perf(cpudata);
if (ret)
goto free_cpudata1;
@@ -1012,8 +1136,8 @@ static int amd_pstate_update_status(const char *buf, size_t size)
return 0;
}
-static ssize_t show_status(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t status_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
ssize_t ret;
@@ -1024,7 +1148,7 @@ static ssize_t show_status(struct kobject *kobj,
return ret;
}
-static ssize_t store_status(struct kobject *a, struct kobj_attribute *b,
+static ssize_t status_store(struct device *a, struct device_attribute *b,
const char *buf, size_t count)
{
char *p = memchr(buf, '\n', count);
@@ -1037,13 +1161,20 @@ static ssize_t store_status(struct kobject *a, struct kobj_attribute *b,
return ret < 0 ? ret : count;
}
+static ssize_t prefcore_state_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", prefcore_enabled ? "enabled" : "disabled");
+}
+
cpufreq_freq_attr_ro(amd_pstate_max_freq);
cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq);
cpufreq_freq_attr_ro(amd_pstate_highest_perf);
cpufreq_freq_attr_rw(energy_performance_preference);
cpufreq_freq_attr_ro(energy_performance_available_preferences);
-define_one_global_rw(status);
+static DEVICE_ATTR_RW(status);
+static DEVICE_ATTR_RO(prefcore_state);
static struct freq_attr *amd_pstate_attr[] = {
&amd_pstate_max_freq,
@@ -1062,7 +1193,8 @@ static struct freq_attr *amd_pstate_epp_attr[] = {
};
static struct attribute *pstate_global_attributes[] = {
- &status.attr,
+ &dev_attr_status.attr,
+ &dev_attr_prefcore_state.attr,
NULL
};
@@ -1114,6 +1246,9 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
cpudata->cpu = policy->cpu;
cpudata->epp_policy = 0;
+ /* check if CPPC preferred core feature is supported */
+ check_prefcore_supported(policy->cpu);
+
ret = amd_pstate_init_perf(cpudata);
if (ret)
goto free_cpudata1;
@@ -1392,6 +1527,7 @@ static struct cpufreq_driver amd_pstate_driver = {
.suspend = amd_pstate_cpu_suspend,
.resume = amd_pstate_cpu_resume,
.set_boost = amd_pstate_set_boost,
+ .update_highest_perf = amd_pstate_update_highest_perf,
.name = "amd-pstate",
.attr = amd_pstate_attr,
};
@@ -1406,6 +1542,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = {
.online = amd_pstate_epp_cpu_online,
.suspend = amd_pstate_epp_suspend,
.resume = amd_pstate_epp_resume,
+ .update_highest_perf = amd_pstate_update_highest_perf,
.name = "amd-pstate-epp",
.attr = amd_pstate_epp_attr,
};
@@ -1506,6 +1643,8 @@ static int __init amd_pstate_init(void)
}
}
+ amd_pstate_init_prefcore();
+
return ret;
global_attr_free:
@@ -1527,7 +1666,17 @@ static int __init amd_pstate_param(char *str)
return amd_pstate_set_driver(mode_idx);
}
+
+static int __init amd_prefcore_param(char *str)
+{
+ if (!strcmp(str, "enable"))
+ no_prefcore = false;
+
+ return 0;
+}
+
early_param("amd_pstate", amd_pstate_param);
+early_param("amd_prefcore", amd_prefcore_param);
MODULE_AUTHOR("Huang Rui <ray.huang@amd.com>");
MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver");
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 50bbc969f..842357abf 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2675,6 +2675,19 @@ void cpufreq_update_limits(unsigned int cpu)
}
EXPORT_SYMBOL_GPL(cpufreq_update_limits);
+/**
+ * cpufreq_update_highest_perf - Update highest performance for a given CPU.
+ * @cpu: CPU to update the highest performance for.
+ *
+ * Invoke the driver's ->update_highest_perf callback if present.
+ */
+void cpufreq_update_highest_perf(unsigned int cpu)
+{
+ if (cpufreq_driver->update_highest_perf)
+ cpufreq_driver->update_highest_perf(cpu);
+}
+EXPORT_SYMBOL_GPL(cpufreq_update_highest_perf);
+
/*********************************************************************
* BOOST *
*********************************************************************/
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 6126c977e..c0b69ffe7 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -139,6 +139,7 @@ struct cppc_cpudata {
#ifdef CONFIG_ACPI_CPPC_LIB
extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf);
extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf);
+extern int cppc_get_highest_perf(int cpunum, u64 *highest_perf);
extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs);
extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls);
extern int cppc_set_enable(int cpu, bool enable);
@@ -165,6 +166,10 @@ static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf)
{
return -ENOTSUPP;
}
+static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf)
+{
+ return -ENOTSUPP;
+}
static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs)
{
return -ENOTSUPP;
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index 446394f84..fa86bc953 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -70,6 +70,7 @@ struct amd_cpudata {
u32 nominal_perf;
u32 lowest_nonlinear_perf;
u32 lowest_perf;
+ u32 prefcore_highest_perf;
u32 max_freq;
u32 min_freq;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 172ff51c1..766c83a4f 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -231,6 +231,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
void refresh_frequency_limits(struct cpufreq_policy *policy);
void cpufreq_update_policy(unsigned int cpu);
void cpufreq_update_limits(unsigned int cpu);
+void cpufreq_update_highest_perf(unsigned int cpu);
bool have_governor_per_policy(void);
bool cpufreq_supports_freq_invariance(void);
struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
@@ -376,6 +377,9 @@ struct cpufreq_driver {
/* Called to update policy limits on firmware notifications. */
void (*update_limits)(unsigned int cpu);
+ /* Called to update highest performance on firmware notifications. */
+ void (*update_highest_perf)(unsigned int cpu);
+
/* optional */
int (*bios_limit)(int cpu, unsigned int *limit);
--
2.42.0.rc0.25.ga82fb66fed
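The patch above exposes the preferred-core feature through the ``amd_prefcore=`` kernel parameter and the read-only ``prefcore_state`` sysfs attribute. A minimal sketch of how one might enable and verify it on a kernel built with this patch (assumptions: amd-pstate is the active cpufreq driver, and the new attribute is registered alongside the existing global ``status`` attribute):

# Enable AMD Pstate Preferred Core by appending this to the kernel command line:
#   amd_prefcore=enable
# After rebooting, check whether the driver actually enabled the feature:
cat /sys/devices/system/cpu/amd_pstate/prefcore_state   # prints "enabled" or "disabled"
cat /sys/devices/system/cpu/amd_pstate/status           # current driver operation mode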

View File

@@ -15,4 +15,6 @@ patch -Np1 < "../patches/0002-eevdfbore.patch"
# Allow setting custom pollrates for usb devices
patch -Np1 < "../patches/0004-Allow-to-set-custom-USB-pollrate-for-specific-device.patch"
# Allow pre polaris cards to use the amdgpu kernel module
patch -Np1 < "../patches/0005-amdgpu-si-cik-default.patch"
patch -Np1 < "../patches/0005-amdgpu-si-cik-default.patch"
# AMD Patch for CPPC
patch -Np1 < "../patches/0006-AMD-cppc.patch"

View File

@@ -2,7 +2,7 @@
echo "Pika Kernel - Getting source"
wget -nv https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/snapshot/linux-6.5-rc3.tar.gz
tar -xf ./linux-6.5-rc3.tar.gz
wget -nv https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/snapshot/linux-6.5-rc5.tar.gz
tar -xf ./linux-6.5-rc5.tar.gz
cd linux-6.5-rc3
cd linux-6.5-rc5