diff --git a/config b/config index 9449157..7aa2860 100644 --- a/config +++ b/config @@ -7067,7 +7067,7 @@ CONFIG_SND_SERIAL_U16550=m CONFIG_SND_MPU401=m CONFIG_SND_PORTMAN2X4=m CONFIG_SND_AC97_POWER_SAVE=y -CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=10 CONFIG_SND_SB_COMMON=m CONFIG_SND_PCI=y CONFIG_SND_AD1889=m @@ -7172,7 +7172,7 @@ CONFIG_SND_HDA_CODEC_CA0132_DSP=y CONFIG_SND_HDA_CODEC_CMEDIA=m CONFIG_SND_HDA_CODEC_SI3054=m CONFIG_SND_HDA_GENERIC=m -CONFIG_SND_HDA_POWER_SAVE_DEFAULT=1 +CONFIG_SND_HDA_POWER_SAVE_DEFAULT=10 CONFIG_SND_HDA_INTEL_HDMI_SILENT_STREAM=y # CONFIG_SND_HDA_CTL_DEV_ID is not set # end of HD-Audio diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index bc7f710..aa7ecc2 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,4 +1,4 @@ -From e1cfa351424a722e33443e5c9a6a937034eb18bd Mon Sep 17 00:00:00 2001 +From 8b27eca196447f74bfa5a346df10212b900ce82a Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 9 May 2023 18:38:36 +0200 Subject: [PATCH 1/8] bbr2 @@ -51,7 +51,7 @@ index c2b15f7e5516..d85858efa571 100644 }; diff --git a/include/net/tcp.h b/include/net/tcp.h -index 18a038d16434..070d0aad5204 100644 +index 5066e4586cf0..b34661204315 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, @@ -62,7 +62,7 @@ index 18a038d16434..070d0aad5204 100644 enum tcp_tw_status { TCP_TW_SUCCESS = 0, -@@ -823,6 +824,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) +@@ -824,6 +825,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } @@ -74,7 +74,7 @@ index 18a038d16434..070d0aad5204 100644 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); -@@ -898,9 +904,14 @@ struct tcp_skb_cb { +@@ -899,9 +905,14 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -91,7 +91,7 @@ index 18a038d16434..070d0aad5204 100644 } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; -@@ -1026,7 +1037,11 @@ enum tcp_ca_ack_event_flags { +@@ -1027,7 +1038,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 @@ -104,7 +104,7 @@ index 18a038d16434..070d0aad5204 100644 union tcp_cc_info; -@@ -1046,8 +1061,11 @@ struct ack_sample { +@@ -1047,8 +1062,11 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ @@ -116,7 +116,7 @@ index 18a038d16434..070d0aad5204 100644 s32 delivered; /* number of packets delivered over interval */ s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ -@@ -1061,6 +1079,7 @@ struct rate_sample { +@@ -1062,6 +1080,7 @@ struct rate_sample { bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? 
*/ @@ -124,7 +124,7 @@ index 18a038d16434..070d0aad5204 100644 }; struct tcp_congestion_ops { -@@ -1084,8 +1103,11 @@ struct tcp_congestion_ops { +@@ -1085,8 +1104,11 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); @@ -138,7 +138,7 @@ index 18a038d16434..070d0aad5204 100644 /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) -@@ -1151,6 +1173,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) +@@ -1152,6 +1174,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif @@ -153,7 +153,7 @@ index 18a038d16434..070d0aad5204 100644 static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); -@@ -1170,6 +1200,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) +@@ -1171,6 +1201,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ @@ -268,10 +268,10 @@ index b18ba8ef93ad..b4e3dcb27a20 100644 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index a60f6f4e7cd9..158d0ed5a7c4 100644 +index 8d20d9221238..99c2e0357dec 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c -@@ -3186,6 +3186,7 @@ int tcp_disconnect(struct sock *sk, int flags) +@@ -3192,6 +3192,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; @@ -3037,7 +3037,7 @@ index 1b34050a7538..66d40449b3f4 100644 icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 61b6710f337a..2efb52fbeee3 100644 +index bf8b22218dd4..3ae56b0676a8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) @@ -3269,10 +3269,10 @@ index a8f6d9d06f2e..a8b4c9504570 100644 rs->interval_us = max(snd_us, ack_us); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index b839c2f91292..ae272ae2b707 100644 +index 39eb947fe392..61ab4ee55b22 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c -@@ -605,6 +605,7 @@ void tcp_write_timer_handler(struct sock *sk) +@@ -615,6 +615,7 @@ void tcp_write_timer_handler(struct sock *sk) return; } @@ -3283,7 +3283,7 @@ index b839c2f91292..ae272ae2b707 100644 -- 2.41.0 -From 2414bafa00ccf9c4dca1327546ff9cfa2f87676f Mon Sep 17 00:00:00 2001 +From 4b326373e0295ad142c417ef510d28cc491e0e73 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 1 Jun 2023 16:35:02 +0200 Subject: [PATCH 2/8] cachy @@ -3411,7 +3411,7 @@ index 3c399f132e2d..a62ad01e6d11 100644 vmlinuz voffset.h diff --git a/Makefile b/Makefile -index 836643eaefee..161c4a3c9e3a 100644 +index 0d3a9d3e73c1..f6ec2f6c57ca 100644 --- a/Makefile +++ b/Makefile @@ -818,6 +818,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) @@ -9419,7 +9419,7 @@ index 38ef6d06888e..0f78364efd4f 100644 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c -index ed4e01daccaa..ee9b6e4cf16a 100644 +index 41c964104b58..915ad6dae416 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,10 @@ @@ -9444,7 +9444,7 @@ index ed4e01daccaa..ee9b6e4cf16a 100644 /* * Thread groups must share signals as well, and detached threads * can only be started up within 
the thread group. -@@ -3422,6 +3430,12 @@ int ksys_unshare(unsigned long unshare_flags) +@@ -3419,6 +3427,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -9758,7 +9758,7 @@ index ab0c5bd1a60f..f4989f706d7f 100644 -- 2.41.0 -From c6c9513db571d0b72d3a7c37aa010db70992b6a6 Mon Sep 17 00:00:00 2001 +From d66ae67f1a8580742fdd5cda2e6dcade3cc770e0 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 9 May 2023 18:39:03 +0200 Subject: [PATCH 3/8] ddcci @@ -12558,7 +12558,7 @@ index 000000000000..a219f031e584 -- 2.41.0 -From 7341d3f2d650ef7c81ace77bbaed7aeedf6d124b Mon Sep 17 00:00:00 2001 +From 72c060c5d2883853d8530a436380a788f74248b1 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 1 Jun 2023 16:35:21 +0200 Subject: [PATCH 4/8] fixes @@ -14250,7 +14250,7 @@ index b5210abb5141..4d8936e1f769 100644 -- 2.41.0 -From 2b82b34c90d5a0b7f64f438ae45a77777059a810 Mon Sep 17 00:00:00 2001 +From 5cce371c10c8c702dda5f1f0ca4428ff0d336662 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 1 Jun 2023 16:35:38 +0200 Subject: [PATCH 5/8] ksm @@ -14702,7 +14702,7 @@ index 860b2dcf3ac4..96fe36a6d0f5 100644 -- 2.41.0 -From d9705b7f78a157575856ee08474297f3abe38dfd Mon Sep 17 00:00:00 2001 +From 21d2dc84c885c791c47d30838a265cf6c532d567 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 1 Jun 2023 16:35:57 +0200 Subject: [PATCH 6/8] sched @@ -14806,7 +14806,7 @@ index 57bde66d95f7..fad77b5172e2 100644 /* * Prefer to place tasks in a sibling domain diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c -index 625d7483951c..b26ae200abef 100644 +index 245cf62ce85a..2d3d13e52333 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3877,6 +3877,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, @@ -15515,9 +15515,9 @@ index 6682535e37c8..ca4472281c28 100644 -- 2.41.0 -From b205c598effc456c3c47800294a7e3c4f110e0c2 Mon Sep 17 00:00:00 2001 +From de932d02ee465828c5f4902165e38d9fb74f7758 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Thu, 1 Jun 2023 16:36:10 +0200 +Date: Wed, 14 Jun 2023 19:42:38 +0200 Subject: [PATCH 7/8] vma Signed-off-by: Peter Jung @@ -15526,20 +15526,24 @@ Signed-off-by: Peter Jung arch/powerpc/mm/fault.c | 3 ++- arch/s390/mm/fault.c | 3 ++- arch/x86/mm/fault.c | 3 ++- - include/linux/mm_types.h | 6 +++++- - include/linux/pagemap.h | 14 ++++++++++---- - mm/filemap.c | 37 +++++++++++++++++++++++-------------- - mm/memory.c | 39 ++++++++++++++++++++++++++++----------- - 8 files changed, 74 insertions(+), 34 deletions(-) + fs/userfaultfd.c | 42 ++++++++++++++++++----------------- + include/linux/mm_types.h | 7 +++++- + include/linux/pagemap.h | 14 ++++++++---- + mm/filemap.c | 37 +++++++++++++++++++------------ + mm/madvise.c | 4 ++-- + mm/memory.c | 48 ++++++++++++++++++++++------------------ + mm/swap.h | 1 - + mm/swap_state.c | 12 +++++----- + 12 files changed, 103 insertions(+), 74 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c -index cb21ccd7940d..92ecac055e4d 100644 +index 6045a5117ac1..8f59badbffb5 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c -@@ -602,7 +602,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, +@@ -601,7 +601,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + goto lock_mmap; } - fault = handle_mm_fault(vma, addr & PAGE_MASK, - mm_flags | FAULT_FLAG_VMA_LOCK, regs); + fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & 
VM_FAULT_VMA_UNLOCKED)) + vma_end_read(vma); @@ -15588,19 +15592,119 @@ index e4399983c50c..ef62ab2fd211 100644 if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); +diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c +index 0fd96d6e39ce..23c3a4ce45d9 100644 +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -277,17 +277,17 @@ static inline struct uffd_msg userfault_msg(unsigned long address, + * hugepmd ranges. + */ + static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, +- struct vm_area_struct *vma, +- unsigned long address, +- unsigned long flags, +- unsigned long reason) ++ struct vm_fault *vmf, ++ unsigned long reason) + { ++ struct vm_area_struct *vma = vmf->vma; + pte_t *ptep, pte; + bool ret = true; + +- mmap_assert_locked(ctx->mm); ++ if (!(vmf->flags & FAULT_FLAG_VMA_LOCK)) ++ mmap_assert_locked(ctx->mm); + +- ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma)); ++ ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); + if (!ptep) + goto out; + +@@ -308,10 +308,8 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + } + #else + static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, +- struct vm_area_struct *vma, +- unsigned long address, +- unsigned long flags, +- unsigned long reason) ++ struct vm_fault *vmf, ++ unsigned long reason) + { + return false; /* should never get here */ + } +@@ -325,11 +323,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + * threads. + */ + static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, +- unsigned long address, +- unsigned long flags, ++ struct vm_fault *vmf, + unsigned long reason) + { + struct mm_struct *mm = ctx->mm; ++ unsigned long address = vmf->address; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; +@@ -337,7 +335,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, + pte_t *pte; + bool ret = true; + +- mmap_assert_locked(mm); ++ if (!(vmf->flags & FAULT_FLAG_VMA_LOCK)) ++ mmap_assert_locked(mm); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) +@@ -445,7 +444,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) + * Coredumping runs without mmap_lock so we can only check that + * the mmap_lock is held, if PF_DUMPCORE was not set. 
+ */ +- mmap_assert_locked(mm); ++ if (!(vmf->flags & FAULT_FLAG_VMA_LOCK)) ++ mmap_assert_locked(mm); + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx) +@@ -561,15 +561,17 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) + spin_unlock_irq(&ctx->fault_pending_wqh.lock); + + if (!is_vm_hugetlb_page(vma)) +- must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, +- reason); ++ must_wait = userfaultfd_must_wait(ctx, vmf, reason); + else +- must_wait = userfaultfd_huge_must_wait(ctx, vma, +- vmf->address, +- vmf->flags, reason); ++ must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_unlock_read(vma); +- mmap_read_unlock(mm); ++ if (vmf->flags & FAULT_FLAG_VMA_LOCK) { ++ vma_end_read(vma); ++ /* WARNING: VMA can't be used after this */ ++ ret |= VM_FAULT_VMA_UNLOCKED; ++ } else ++ mmap_read_unlock(mm); + + if (likely(must_wait && !READ_ONCE(ctx->released))) { + wake_up_poll(&ctx->fd_wqh, EPOLLIN); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 306a3d1a0fa6..b3b57c6da0e1 100644 +index 306a3d1a0fa6..bd6b95c82f7a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h -@@ -1030,6 +1030,7 @@ typedef __bitwise unsigned int vm_fault_t; +@@ -1030,6 +1030,8 @@ typedef __bitwise unsigned int vm_fault_t; * fsync() to complete (for synchronous page faults * in DAX) * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released -+ * @VM_FAULT_VMA_UNLOCKED: VMA lock was released ++ * @VM_FAULT_VMA_UNLOCKED: VMA lock was released, vmf->vma should no longer ++ * be accessed * @VM_FAULT_HINDEX_MASK: mask HINDEX value * */ -@@ -1047,6 +1048,7 @@ enum vm_fault_reason { +@@ -1047,6 +1049,7 @@ enum vm_fault_reason { VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, @@ -15608,7 +15712,7 @@ index 306a3d1a0fa6..b3b57c6da0e1 100644 VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, }; -@@ -1070,7 +1072,9 @@ enum vm_fault_reason { +@@ -1070,7 +1073,9 @@ enum vm_fault_reason { { VM_FAULT_RETRY, "RETRY" }, \ { VM_FAULT_FALLBACK, "FALLBACK" }, \ { VM_FAULT_DONE_COW, "DONE_COW" }, \ @@ -15726,8 +15830,30 @@ index b4c9bd368b7e..838955635fbc 100644 return true; } +diff --git a/mm/madvise.c b/mm/madvise.c +index b5ffbaf616f5..b1e8adf1234e 100644 +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -215,7 +215,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, + continue; + + page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, +- vma, index, false, &splug); ++ vma, index, &splug); + if (page) + put_page(page); + } +@@ -252,7 +252,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, + rcu_read_unlock(); + + page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, +- NULL, 0, false, &splug); ++ NULL, 0, &splug); + if (page) + put_page(page); + diff --git a/mm/memory.c b/mm/memory.c -index f69fbc251198..e1cd39f00756 100644 +index f69fbc251198..b2ea015dcb87 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3568,6 +3568,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) @@ -15750,12 +15876,11 @@ index f69fbc251198..e1cd39f00756 100644 return VM_FAULT_RETRY; } mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, -@@ -3704,27 +3707,40 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) +@@ -3704,27 +3707,39 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) bool exclusive = false; swp_entry_t entry; pte_t pte; - int locked; -+ bool locked; 
+ bool lock_dropped; vm_fault_t ret = 0; void *shadow = NULL; @@ -15779,7 +15904,7 @@ index f69fbc251198..e1cd39f00756 100644 + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + /* No need to hold VMA lock for migration */ + vma_end_read(vma); -+ /* CAUTION! VMA can't be used after this */ ++ /* WARNING: VMA can't be used after this */ + ret |= VM_FAULT_VMA_UNLOCKED; + } + migration_entry_wait(mm, vmf->pmd, vmf->address); @@ -15799,23 +15924,108 @@ index f69fbc251198..e1cd39f00756 100644 vmf->page = pfn_swap_entry_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); -@@ -3825,9 +3841,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) +@@ -3825,9 +3840,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags); - -+ locked = folio_lock_or_retry(folio, vma, vmf->flags, &lock_dropped); - if (!locked) { +- if (!locked) { ++ if (!folio_lock_or_retry(folio, vma, vmf->flags, &lock_dropped)) { + if (lock_dropped && vmf->flags & FAULT_FLAG_VMA_LOCK) + ret |= VM_FAULT_VMA_UNLOCKED; ret |= VM_FAULT_RETRY; goto out_release; } +@@ -5291,15 +5306,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + if (!vma_start_read(vma)) + goto inval; + +- /* +- * Due to the possibility of userfault handler dropping mmap_lock, avoid +- * it for now and fall back to page fault handling under mmap_lock. +- */ +- if (userfaultfd_armed(vma)) { +- vma_end_read(vma); +- goto inval; +- } +- + /* Check since vm_start/vm_end might change before we lock the VMA */ + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { + vma_end_read(vma); +diff --git a/mm/swap.h b/mm/swap.h +index 7c033d793f15..8a3c7a0ace4f 100644 +--- a/mm/swap.h ++++ b/mm/swap.h +@@ -46,7 +46,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, + struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, + unsigned long addr, +- bool do_poll, + struct swap_iocb **plug); + struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, +diff --git a/mm/swap_state.c b/mm/swap_state.c +index b76a65ac28b3..a3839de71f3f 100644 +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -517,15 +517,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + */ + struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, +- unsigned long addr, bool do_poll, +- struct swap_iocb **plug) ++ unsigned long addr, struct swap_iocb **plug) + { + bool page_was_allocated; + struct page *retpage = __read_swap_cache_async(entry, gfp_mask, + vma, addr, &page_was_allocated); + + if (page_was_allocated) +- swap_readpage(retpage, do_poll, plug); ++ swap_readpage(retpage, false, plug); + + return retpage; + } +@@ -620,7 +619,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct swap_info_struct *si = swp_swap_info(entry); + struct blk_plug plug; + struct swap_iocb *splug = NULL; +- bool do_poll = true, page_allocated; ++ bool page_allocated; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address; + +@@ -628,7 +627,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, + if (!mask) + goto skip; + +- do_poll = false; + /* Read a page_cluster sized and aligned cluster around offset. 
*/ + start_offset = offset & ~mask; + end_offset = offset | mask; +@@ -660,7 +658,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, + lru_add_drain(); /* Push any new pages onto the LRU now */ + skip: + /* The page was likely read above, so no need for plugging here */ +- return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL); ++ return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL); + } + + int init_swap_address_space(unsigned int type, unsigned long nr_pages) +@@ -825,7 +823,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, + skip: + /* The page was likely read above, so no need for plugging here */ + return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, +- ra_info.win == 1, NULL); ++ NULL); + } + + /** -- 2.41.0 -From 4c79598323457b04cefa717a2adbf82586477840 Mon Sep 17 00:00:00 2001 +From 8af5970aac59ebe84af695501c91e8c7f2a1b91d Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 9 May 2023 18:40:05 +0200 Subject: [PATCH 8/8] zstd 1.5.5 diff --git a/patches/0002-eevdf.patch b/patches/0002-eevdf.patch index ae65e0d..e9814d3 100644 --- a/patches/0002-eevdf.patch +++ b/patches/0002-eevdf.patch @@ -1,2230 +1,543 @@ -From 5e4ded34523fcaf5aea5c77d45239b6dd33f1c91 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Thu, 1 Jun 2023 16:37:55 +0200 -Subject: [PATCH] EEVDF +From 74b3a8a51481e8c38adb4954398fc7325cf42634 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Thu, 8 Jun 2023 10:59:00 +0200 +Subject: [PATCH] bore-eevdf -Signed-off-by: Peter Jung +Signed-off-by: Piotr Gorski --- - Documentation/admin-guide/cgroup-v2.rst | 10 + - include/linux/rbtree_augmented.h | 26 + - include/linux/sched.h | 8 +- - include/uapi/linux/sched.h | 4 +- - include/uapi/linux/sched/types.h | 19 + - init/init_task.c | 3 +- - kernel/sched/core.c | 65 +- - kernel/sched/debug.c | 49 +- - kernel/sched/fair.c | 1152 +++++++++++------------ - kernel/sched/features.h | 24 +- - kernel/sched/sched.h | 22 +- - tools/include/uapi/linux/sched.h | 4 +- - 12 files changed, 726 insertions(+), 660 deletions(-) + include/linux/sched.h | 10 +++ + init/Kconfig | 20 +++++ + kernel/sched/core.c | 62 +++++++++++++ + kernel/sched/debug.c | 4 + + kernel/sched/fair.c | 193 ++++++++++++++++++++++++++++++++++++++-- + kernel/sched/features.h | 4 + + kernel/sched/sched.h | 1 + + 7 files changed, 286 insertions(+), 8 deletions(-) -diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst -index f67c0829350b..a39dfda3d032 100644 ---- a/Documentation/admin-guide/cgroup-v2.rst -+++ b/Documentation/admin-guide/cgroup-v2.rst -@@ -1121,6 +1121,16 @@ All time durations are in microseconds. - values similar to the sched_setattr(2). This maximum utilization - value is used to clamp the task specific maximum utilization clamp. - -+ cpu.latency.nice -+ A read-write single value file which exists on non-root -+ cgroups. The default is "0". -+ -+ The nice value is in the range [-20, 19]. -+ -+ This interface file allows reading and setting latency using the -+ same values used by sched_setattr(2). The latency_nice of a group is -+ used to limit the impact of the latency_nice of a task outside the -+ group. 
- - - Memory -diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h -index 7ee7ed5de722..6dbc5a1bf6a8 100644 ---- a/include/linux/rbtree_augmented.h -+++ b/include/linux/rbtree_augmented.h -@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node, - rb_insert_augmented(node, &root->rb_root, augment); - } - -+static __always_inline struct rb_node * -+rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, -+ bool (*less)(struct rb_node *, const struct rb_node *), -+ const struct rb_augment_callbacks *augment) -+{ -+ struct rb_node **link = &tree->rb_root.rb_node; -+ struct rb_node *parent = NULL; -+ bool leftmost = true; -+ -+ while (*link) { -+ parent = *link; -+ if (less(node, parent)) { -+ link = &parent->rb_left; -+ } else { -+ link = &parent->rb_right; -+ leftmost = false; -+ } -+ } -+ -+ rb_link_node(node, parent, link); -+ augment->propagate(parent, NULL); /* suboptimal */ -+ rb_insert_augmented_cached(node, tree, leftmost, augment); -+ -+ return leftmost ? node : NULL; -+} -+ - /* - * Template for declaring augmented rbtree callbacks (generic case) - * diff --git a/include/linux/sched.h b/include/linux/sched.h -index eed5d65b8d1f..63ac38d66ec6 100644 +index 63ac38d66..63a2205a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -550,13 +550,18 @@ struct sched_entity { - /* For load-balancing: */ - struct load_weight load; - struct rb_node run_node; -+ u64 deadline; -+ u64 min_deadline; -+ - struct list_head group_node; - unsigned int on_rq; - - u64 exec_start; +@@ -560,6 +560,12 @@ struct sched_entity { u64 sum_exec_runtime; -- u64 vruntime; u64 prev_sum_exec_runtime; -+ u64 vruntime; -+ s64 vlag; -+ u64 slice; + u64 vruntime; ++#ifdef CONFIG_SCHED_BORE ++ u64 prev_burst_time; ++ u64 burst_time; ++ u64 max_burst_time; ++ u8 penalty_score; ++#endif // CONFIG_SCHED_BORE + s64 vlag; + u64 slice; - u64 nr_migrations; +@@ -991,6 +997,10 @@ struct task_struct { + struct list_head children; + struct list_head sibling; + struct task_struct *group_leader; ++#ifdef CONFIG_SCHED_BORE ++ u64 child_burst_cache; ++ u64 child_burst_last_cached; ++#endif // CONFIG_SCHED_BORE -@@ -786,6 +791,7 @@ struct task_struct { - int static_prio; - int normal_prio; - unsigned int rt_priority; -+ int latency_prio; + /* + * 'ptraced' is the list of tasks this task is using ptrace() on. +diff --git a/init/Kconfig b/init/Kconfig +index 0147b4a33..4ab7e154b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1290,6 +1290,26 @@ config CHECKPOINT_RESTORE - struct sched_entity se; - struct sched_rt_entity rt; -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab2..b2e932c25be6 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -132,6 +132,7 @@ struct clone_args { - #define SCHED_FLAG_KEEP_PARAMS 0x10 - #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 - #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 -+#define SCHED_FLAG_LATENCY_NICE 0x80 + If unsure, say N here. 
- #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ - SCHED_FLAG_KEEP_PARAMS) -@@ -143,6 +144,7 @@ struct clone_args { - SCHED_FLAG_RECLAIM | \ - SCHED_FLAG_DL_OVERRUN | \ - SCHED_FLAG_KEEP_ALL | \ -- SCHED_FLAG_UTIL_CLAMP) -+ SCHED_FLAG_UTIL_CLAMP | \ -+ SCHED_FLAG_LATENCY_NICE) - - #endif /* _UAPI_LINUX_SCHED_H */ -diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h -index f2c4589d4dbf..db1e8199e8c8 100644 ---- a/include/uapi/linux/sched/types.h -+++ b/include/uapi/linux/sched/types.h -@@ -10,6 +10,7 @@ struct sched_param { - - #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ - #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ -+#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */ - - /* - * Extended scheduling parameters data structure. -@@ -98,6 +99,22 @@ struct sched_param { - * scheduled on a CPU with no more capacity than the specified value. - * - * A task utilization boundary can be reset by setting the attribute to -1. -+ * -+ * Latency Tolerance Attributes -+ * =========================== -+ * -+ * A subset of sched_attr attributes allows to specify the relative latency -+ * requirements of a task with respect to the other tasks running/queued in the -+ * system. -+ * -+ * @ sched_latency_nice task's latency_nice value -+ * -+ * The latency_nice of a task can have any value in a range of -+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE]. -+ * -+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be -+ * taken for a task requiring a lower latency as opposed to the task with -+ * higher latency_nice. - */ - struct sched_attr { - __u32 size; -@@ -120,6 +137,8 @@ struct sched_attr { - __u32 sched_util_min; - __u32 sched_util_max; - -+ /* latency requirement hints */ -+ __s32 sched_latency_nice; - }; - - #endif /* _UAPI_LINUX_SCHED_TYPES_H */ -diff --git a/init/init_task.c b/init/init_task.c -index ff6c4b9bfe6b..511cbcf3510d 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -78,6 +78,7 @@ struct task_struct init_task - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+ .latency_prio = DEFAULT_PRIO, - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .user_cpus_ptr = NULL, -@@ -89,7 +90,7 @@ struct task_struct init_task - .fn = do_no_restart_syscall, - }, - .se = { -- .group_node = LIST_HEAD_INIT(init_task.se.group_node), -+ .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, - .rt = { - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ You can turn it off by setting the sysctl kernel.sched_bore = 0. ++ Enabling this feature implies NO_GENTLE_FAIR_SLEEPERS by default. ++ ++ If unsure say Y here. 
++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index bcb3a7e684ca..3bcb77b00e5b 100644 +index 3bcb77b00..c7ceffc14 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load) - } +@@ -4490,6 +4490,57 @@ int wake_up_state(struct task_struct *p, unsigned int state) + return try_to_wake_up(p, state, 0); } -+static inline void set_latency_prio(struct task_struct *p, int prio) -+{ -+ p->latency_prio = prio; -+ set_latency_fair(&p->se, prio - MAX_RT_PRIO); ++#ifdef CONFIG_SCHED_BORE ++#define CHILD_BURST_CUTOFF_BITS 9 ++extern unsigned int sched_burst_cache_lifetime; ++ ++void __init sched_init_bore(void) { ++ init_task.child_burst_cache = 0; ++ init_task.child_burst_last_cached = 0; ++ init_task.se.prev_burst_time = 0; ++ init_task.se.burst_time = 0; ++ init_task.se.max_burst_time = 0; +} + - #ifdef CONFIG_UCLAMP_TASK ++void inline __sched_fork_bore(struct task_struct *p) { ++ p->child_burst_cache = 0; ++ p->child_burst_last_cached = 0; ++ p->se.burst_time = 0; ++} ++ ++static inline void update_task_child_burst_time_cache(struct task_struct *p) { ++ u64 sum = 0, avg_burst_time = 0; ++ u32 cnt = 0; ++ struct task_struct *child; ++ ++ read_lock(&tasklist_lock); ++ list_for_each_entry(child, &p->children, sibling) { ++ cnt++; ++ sum += child->se.max_burst_time >> CHILD_BURST_CUTOFF_BITS; ++ } ++ read_unlock(&tasklist_lock); ++ ++ if (cnt) avg_burst_time = div_u64(sum, cnt) << CHILD_BURST_CUTOFF_BITS; ++ p->child_burst_cache = max(avg_burst_time, p->se.max_burst_time); ++} ++ ++static void update_task_initial_burst_time(struct task_struct *task) { ++ struct sched_entity *se = &task->se; ++ struct task_struct *par = task->real_parent; ++ u64 ktime = ktime_to_ns(ktime_get()); ++ ++ if (likely(par)) { ++ if (par->child_burst_last_cached + sched_burst_cache_lifetime < ktime) { ++ par->child_burst_last_cached = ktime; ++ update_task_child_burst_time_cache(par); ++ } ++ se->prev_burst_time = max(se->prev_burst_time, par->child_burst_cache); ++ } ++ ++ se->max_burst_time = se->prev_burst_time; ++} ++#endif // CONFIG_SCHED_BORE ++ /* - * Serializes updates of utilization clamp values -@@ -4500,8 +4506,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. +@@ -4506,6 +4557,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; -+ p->se.vlag = 0; ++#ifdef CONFIG_SCHED_BORE ++ __sched_fork_bore(p); ++#endif // CONFIG_SCHED_BORE + p->se.vlag = 0; INIT_LIST_HEAD(&p->se.group_node); -+ set_latency_prio(p, p->latency_prio); +@@ -4735,6 +4789,9 @@ late_initcall(sched_core_sysctl_init); + int sched_fork(unsigned long clone_flags, struct task_struct *p) + { + __sched_fork(clone_flags, p); ++#ifdef CONFIG_SCHED_BORE ++ update_task_initial_burst_time(p); ++#endif // CONFIG_SCHED_BORE + /* + * We mark the process as NEW here. 
This guarantees that + * nobody will actually run it, and a signal or other external +@@ -9955,6 +10012,11 @@ void __init sched_init(void) + BUG_ON(&dl_sched_class != &stop_sched_class + 1); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ sched_init_bore(); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 2.4.0 by Masahito Suzuki"); ++#endif // CONFIG_SCHED_BORE + + wait_bit_init(); + #ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = NULL; - #endif -@@ -4753,6 +4762,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - - p->prio = p->normal_prio = p->static_prio; - set_load_weight(p, false); -+ set_latency_prio(p, NICE_TO_PRIO(0)); - - /* - * We don't need the reset flag anymore after the fork. It has -@@ -7512,7 +7522,7 @@ static struct task_struct *find_process_by_pid(pid_t pid) - #define SETPARAM_POLICY -1 - - static void __setscheduler_params(struct task_struct *p, -- const struct sched_attr *attr) -+ const struct sched_attr *attr) - { - int policy = attr->sched_policy; - -@@ -7536,6 +7546,13 @@ static void __setscheduler_params(struct task_struct *p, - set_load_weight(p, true); - } - -+static void __setscheduler_latency(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) -+ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice)); -+} -+ - /* - * Check the target process has a UID that matches the current process's: - */ -@@ -7676,6 +7693,13 @@ static int __sched_setscheduler(struct task_struct *p, - return retval; - } - -+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { -+ if (attr->sched_latency_nice > MAX_NICE) -+ return -EINVAL; -+ if (attr->sched_latency_nice < MIN_NICE) -+ return -EINVAL; -+ } -+ - if (pi) - cpuset_read_lock(); - -@@ -7710,6 +7734,9 @@ static int __sched_setscheduler(struct task_struct *p, - goto change; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) - goto change; -+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && -+ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio)) -+ goto change; - - p->sched_reset_on_fork = reset_on_fork; - retval = 0; -@@ -7798,6 +7825,7 @@ static int __sched_setscheduler(struct task_struct *p, - __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); - } -+ __setscheduler_latency(p, attr); - __setscheduler_uclamp(p, attr); - - if (queued) { -@@ -8008,6 +8036,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a - size < SCHED_ATTR_SIZE_VER1) - return -EINVAL; - -+ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) && -+ size < SCHED_ATTR_SIZE_VER2) -+ return -EINVAL; - /* - * XXX: Do we want to be lenient like existing syscalls; or do we want - * to be strict and return an error on out-of-bounds values? 
-@@ -8245,6 +8276,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - get_params(p, &kattr); - kattr.sched_flags &= SCHED_FLAG_ALL; - -+ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio); -+ - #ifdef CONFIG_UCLAMP_TASK - /* - * This could race with another potential updater, but this is fine -@@ -11181,6 +11214,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, - { - return sched_group_set_idle(css_tg(css), idle); - } -+ -+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css, -+ struct cftype *cft) -+{ -+ return PRIO_TO_NICE(css_tg(css)->latency_prio); -+} -+ -+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css, -+ struct cftype *cft, s64 nice) -+{ -+ int prio; -+ -+ if (nice < MIN_NICE || nice > MAX_NICE) -+ return -ERANGE; -+ -+ prio = NICE_TO_PRIO(nice); -+ -+ return sched_group_set_latency(css_tg(css), prio); -+} - #endif - - static struct cftype cpu_legacy_files[] = { -@@ -11195,6 +11247,11 @@ static struct cftype cpu_legacy_files[] = { - .read_s64 = cpu_idle_read_s64, - .write_s64 = cpu_idle_write_s64, - }, -+ { -+ .name = "latency.nice", -+ .read_s64 = cpu_latency_nice_read_s64, -+ .write_s64 = cpu_latency_nice_write_s64, -+ }, - #endif - #ifdef CONFIG_CFS_BANDWIDTH - { -@@ -11412,6 +11469,12 @@ static struct cftype cpu_files[] = { - .read_s64 = cpu_idle_read_s64, - .write_s64 = cpu_idle_write_s64, - }, -+ { -+ .name = "latency.nice", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .read_s64 = cpu_latency_nice_read_s64, -+ .write_s64 = cpu_latency_nice_write_s64, -+ }, - #endif - #ifdef CONFIG_CFS_BANDWIDTH - { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 066ff1c8ae4e..e7e83181fbb6 100644 +index e7e83181f..c29500314 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c -@@ -347,10 +347,7 @@ static __init int sched_init_debug(void) - debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); +@@ -348,6 +348,7 @@ static __init int sched_init_debug(void) #endif -- debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); -- debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); -- debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); -- debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); -+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); ++ debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -@@ -581,9 +578,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) - else - SEQ_printf(m, " %c", task_state_to_char(p)); +@@ -594,6 +595,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); -- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", -+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", - p->comm, task_pid_nr(p), - SPLIT_NS(p->se.vruntime), -+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 
'E' : 'N', -+ SPLIT_NS(p->se.deadline), -+ SPLIT_NS(p->se.slice), -+ SPLIT_NS(p->se.sum_exec_runtime), - (long long)(p->nvcsw + p->nivcsw), - p->prio); - -@@ -626,10 +627,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) - - void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) - { -- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, -- spread, rq0_min_vruntime, spread0; -+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread; -+ struct sched_entity *last, *first; - struct rq *rq = cpu_rq(cpu); -- struct sched_entity *last; - unsigned long flags; - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -643,26 +643,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) - SPLIT_NS(cfs_rq->exec_clock)); - - raw_spin_rq_lock_irqsave(rq, flags); -- if (rb_first_cached(&cfs_rq->tasks_timeline)) -- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; -+ first = __pick_first_entity(cfs_rq); -+ if (first) -+ left_vruntime = first->vruntime; - last = __pick_last_entity(cfs_rq); - if (last) -- max_vruntime = last->vruntime; -+ right_vruntime = last->vruntime; - min_vruntime = cfs_rq->min_vruntime; -- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; - raw_spin_rq_unlock_irqrestore(rq, flags); -- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", -- SPLIT_NS(MIN_vruntime)); -+ -+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", -+ SPLIT_NS(left_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", - SPLIT_NS(min_vruntime)); -- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", -- SPLIT_NS(max_vruntime)); -- spread = max_vruntime - MIN_vruntime; -- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", -- SPLIT_NS(spread)); -- spread0 = min_vruntime - rq0_min_vruntime; -- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", -- SPLIT_NS(spread0)); -+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", -+ SPLIT_NS(avg_vruntime(cfs_rq))); -+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", -+ SPLIT_NS(right_vruntime)); -+ spread = right_vruntime - left_vruntime; -+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", - cfs_rq->nr_spread_over); - SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); -@@ -863,10 +862,7 @@ static void sched_debug_header(struct seq_file *m) - SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) - #define PN(x) \ - SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) -- PN(sysctl_sched_latency); -- PN(sysctl_sched_min_granularity); -- PN(sysctl_sched_idle_min_granularity); -- PN(sysctl_sched_wakeup_granularity); -+ PN(sysctl_sched_base_slice); - P(sysctl_sched_child_runs_first); - P(sysctl_sched_features); - #undef PN -@@ -1089,6 +1085,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++#ifdef CONFIG_SCHED_BORE ++ SEQ_printf(m, " %2d", p->se.penalty_score); ++#endif + #ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif - P(policy); - P(prio); -+ P(latency_prio); - if (task_has_dl_policy(p)) { - P(dl.runtime); - P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 9fe8288b1b1f..97678b9b4023 100644 +index 97678b9b4..b0acc7126 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c -@@ -47,6 +47,7 @@ - #include - #include - #include -+#include - - #include - -@@ -56,26 +57,6 @@ - #include "stats.h" - #include "autogroup.h" - --/* -- * Targeted preemption latency for CPU-bound tasks: -- * -- * NOTE: this latency 
value is not the same as the concept of -- * 'timeslice length' - timeslices in CFS are of variable length -- * and have no persistent notion like in traditional, time-slice -- * based scheduling concepts. -- * -- * (to see the precise effective timeslice length of your workload, -- * run vmstat and monitor the context-switches (cs) field) -- * -- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) -- */ --#ifdef CONFIG_CACHY --unsigned int sysctl_sched_latency = 3000000ULL; --static unsigned int normalized_sysctl_sched_latency = 3000000ULL; --#else --unsigned int sysctl_sched_latency = 6000000ULL; --static unsigned int normalized_sysctl_sched_latency = 6000000ULL; --#endif - /* - * The initial- and re-scaling of tunables is configurable +@@ -19,6 +19,9 @@ * -@@ -94,26 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; - * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra ++ * ++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler ++ * Copyright (C) 2021-2023 Masahito Suzuki */ --#ifdef CONFIG_CACHY --unsigned int sysctl_sched_min_granularity = 400000ULL; --static unsigned int normalized_sysctl_sched_min_granularity = 400000ULL; --#else --unsigned int sysctl_sched_min_granularity = 750000ULL; --static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; --#endif -- --/* -- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. -- * Applies only when SCHED_IDLE tasks compete with normal tasks. -- * -- * (default: 0.75 msec) -- */ --unsigned int sysctl_sched_idle_min_granularity = 750000ULL; -- --/* -- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity -- */ --static unsigned int sched_nr_latency = 8; -+unsigned int sysctl_sched_base_slice = 750000ULL; -+static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; + #include + #include +@@ -66,17 +69,17 @@ + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * +- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) ++ * (default SCHED_TUNABLESCALING_NONE = *1) + */ +-unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; ++unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; + + /* + * Minimal preemption granularity for CPU-bound tasks: + * +- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) ++ * (default: 3 msec * 1, units: nanoseconds) + */ +-unsigned int sysctl_sched_base_slice = 750000ULL; +-static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; ++unsigned int sysctl_sched_base_slice = 3000000ULL; ++static unsigned int normalized_sysctl_sched_base_slice = 3000000ULL; /* * After fork, child runs first. If set to 0 (default) then -@@ -121,23 +84,6 @@ static unsigned int sched_nr_latency = 8; +@@ -84,8 +87,75 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; */ unsigned int sysctl_sched_child_runs_first __read_mostly; --/* -- * SCHED_OTHER wake-up granularity. -- * -- * This option delays the preemption effects of decoupled workloads -- * and reduces their over-scheduling. Synchronous workloads will still -- * have immediate wakeup/sleep latencies. 
-- * -- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) -- */ --#ifdef CONFIG_CACHY --unsigned int sysctl_sched_wakeup_granularity = 500000UL; --static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; --#else --unsigned int sysctl_sched_wakeup_granularity = 1000000UL; --static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; --#endif -- ++/* ++ * SCHED_OTHER wake-up granularity. ++ * ++ * This option delays the preemption effects of decoupled workloads ++ * and reduces their over-scheduling. Synchronous workloads will still ++ * have immediate wakeup/sleep latencies. ++ * ++ * (default: 3.2 msec * 1, units: nanoseconds) ++ */ ++unsigned int sysctl_sched_wakeup_granularity = 3200000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 3200000UL; ++ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; ++#ifdef CONFIG_SCHED_BORE ++unsigned int __read_mostly sched_bore = 1; ++unsigned int __read_mostly sched_burst_cache_lifetime = 15000000; ++unsigned int __read_mostly sched_burst_penalty_offset = 12; ++unsigned int __read_mostly sched_burst_penalty_scale = 1292; ++unsigned int __read_mostly sched_burst_smoothness = 1; ++static int three = 3; ++static int sixty_four = 64; ++static int maxval_12_bits = 4095; ++ ++#define FIXED_SHIFT 10 ++#define FIXED_ONE (1 << FIXED_SHIFT) ++typedef u32 fixed; ++ ++static void update_burst_score(struct sched_entity *se) { ++ u64 burst_time = se->max_burst_time; ++ ++ int msb = fls64(burst_time); ++ fixed integer_part = msb << FIXED_SHIFT; ++ fixed fractional_part = burst_time << (64 - msb) << 1 >> (64 - FIXED_SHIFT); ++ fixed greed = integer_part | fractional_part; ++ ++ fixed tolerance = sched_burst_penalty_offset << FIXED_SHIFT; ++ fixed penalty = max(0, (s32)greed - (s32)tolerance); ++ fixed scaled_penalty = penalty * sched_burst_penalty_scale >> 10; ++ ++ u8 score = min(39U, scaled_penalty >> FIXED_SHIFT); ++ se->penalty_score = score; ++} ++ ++static inline u64 penalty_scale(u64 delta, struct sched_entity *se) { ++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->penalty_score], 22); ++} ++ ++static inline u64 __binary_smooth(u64 new, u64 old, unsigned int smoothness) { ++ return (new + old * ((1 << smoothness) - 1)) >> smoothness; ++} ++ ++void restart_burst(struct sched_entity *se) { ++ se->max_burst_time = se->prev_burst_time = __binary_smooth( ++ se->burst_time, se->prev_burst_time, sched_burst_smoothness); ++ se->burst_time = 0; ++} ++ ++#define calc_delta_fair(delta, se) __calc_delta_fair(delta, se, true) ++#define calc_delta_fair_unscaled(delta, se) __calc_delta_fair(delta, se, false) ++static inline u64 ++__calc_delta_fair(u64 delta, struct sched_entity *se, bool bscale); ++ ++static s64 wakeup_preempt_backstep_delta(u64 rtime, struct sched_entity *se) { ++ u64 delta = calc_delta_fair_unscaled(rtime, se); ++ return delta - penalty_scale(delta, se); ++} ++#endif // CONFIG_SCHED_BORE ++ int sched_thermal_decay_shift; -@@ -189,12 +135,8 @@ int __weak arch_asym_cpu_priority(int cpu) - * - * (default: 5 msec, units: microseconds) - */ --#ifdef CONFIG_CACHY --static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; --#else - static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; - #endif --#endif - - #ifdef CONFIG_NUMA_BALANCING - /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ -@@ -295,9 +237,7 @@ static void update_sysctl(void) + static int __init setup_sched_thermal_decay_shift(char *str) + { +@@ -145,6 +215,51 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + #ifdef CONFIG_SYSCTL + static struct ctl_table sched_fair_sysctls[] = { ++#ifdef CONFIG_SCHED_BORE ++ { ++ .procname = "sched_bore", ++ .data = &sched_bore, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_cache_lifetime", ++ .data = &sched_burst_cache_lifetime, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { ++ .procname = "sched_burst_penalty_offset", ++ .data = &sched_burst_penalty_offset, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &sixty_four, ++ }, ++ { ++ .procname = "sched_burst_penalty_scale", ++ .data = &sched_burst_penalty_scale, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_12_bits, ++ }, ++ { ++ .procname = "sched_burst_smoothness", ++ .data = &sched_burst_smoothness, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &three, ++ }, ++#endif // CONFIG_SCHED_BORE + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -238,6 +353,7 @@ static void update_sysctl(void) #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) -- SET_SYSCTL(sched_min_granularity); -- SET_SYSCTL(sched_latency); -- SET_SYSCTL(sched_wakeup_granularity); -+ SET_SYSCTL(sched_base_slice); + SET_SYSCTL(sched_base_slice); ++ SET_SYSCTL(sched_wakeup_granularity); #undef SET_SYSCTL } -@@ -365,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight - return mul_u64_u32_shr(delta_exec, fact, shift); - } - -+/* -+ * delta /= w -+ */ -+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) -+{ -+ if (unlikely(se->load.weight != NICE_0_LOAD)) -+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load); -+ -+ return delta; -+} - - const struct sched_class fair_sched_class; - -@@ -619,13 +569,200 @@ static inline bool entity_before(const struct sched_entity *a, - return (s64)(a->vruntime - b->vruntime) < 0; - } - -+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) -+{ -+ return (s64)(se->vruntime - cfs_rq->min_vruntime); -+} -+ - #define __node_2_se(node) \ - rb_entry((node), struct sched_entity, run_node) - -+/* -+ * Compute virtual time from the per-task service numbers: -+ * -+ * Fair schedulers conserve lag: -+ * -+ * \Sum lag_i = 0 -+ * -+ * Where lag_i is given by: -+ * -+ * lag_i = S - s_i = w_i * (V - v_i) -+ * -+ * Where S is the ideal service time and V is it's virtual time counterpart. -+ * Therefore: -+ * -+ * \Sum lag_i = 0 -+ * \Sum w_i * (V - v_i) = 0 -+ * \Sum w_i * V - w_i * v_i = 0 -+ * -+ * From which we can solve an expression for V in v_i (which we have in -+ * se->vruntime): -+ * -+ * \Sum v_i * w_i \Sum v_i * w_i -+ * V = -------------- = -------------- -+ * \Sum w_i W -+ * -+ * Specifically, this is the weighted average of all entity virtual runtimes. 
-+ * -+ * [[ NOTE: this is only equal to the ideal scheduler under the condition -+ * that join/leave operations happen at lag_i = 0, otherwise the -+ * virtual time has non-continguous motion equivalent to: -+ * -+ * V +-= lag_i / W -+ * -+ * Also see the comment in place_entity() that deals with this. ]] -+ * -+ * However, since v_i is u64, and the multiplcation could easily overflow -+ * transform it into a relative form that uses smaller quantities: -+ * -+ * Substitute: v_i == (v_i - v0) + v0 -+ * -+ * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i -+ * V = ---------------------------- = --------------------- + v0 -+ * W W -+ * -+ * Which we track using: -+ * -+ * v0 := cfs_rq->min_vruntime -+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime -+ * \Sum w_i := cfs_rq->avg_load -+ * -+ * Since min_vruntime is a monotonic increasing variable that closely tracks -+ * the per-task service, these deltas: (v_i - v), will be in the order of the -+ * maximal (virtual) lag induced in the system due to quantisation. -+ * -+ * Also, we use scale_load_down() to reduce the size. -+ * -+ * As measured, the max (key * weight) value was ~44 bits for a kernel build. -+ */ -+static void -+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) -+{ -+ unsigned long weight = scale_load_down(se->load.weight); -+ s64 key = entity_key(cfs_rq, se); -+ -+ cfs_rq->avg_vruntime += key * weight; -+ cfs_rq->avg_slice += se->slice * weight; -+ cfs_rq->avg_load += weight; -+} -+ -+static void -+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) -+{ -+ unsigned long weight = scale_load_down(se->load.weight); -+ s64 key = entity_key(cfs_rq, se); -+ -+ cfs_rq->avg_vruntime -= key * weight; -+ cfs_rq->avg_slice -= se->slice * weight; -+ cfs_rq->avg_load -= weight; -+} -+ -+static inline -+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) -+{ -+ /* -+ * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load -+ */ -+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; -+} -+ -+u64 avg_vruntime(struct cfs_rq *cfs_rq) -+{ -+ struct sched_entity *curr = cfs_rq->curr; -+ s64 avg = cfs_rq->avg_vruntime; -+ long load = cfs_rq->avg_load; -+ -+ if (curr && curr->on_rq) { -+ unsigned long weight = scale_load_down(curr->load.weight); -+ -+ avg += entity_key(cfs_rq, curr) * weight; -+ load += weight; -+ } -+ -+ if (load) -+ avg = div_s64(avg, load); -+ -+ return cfs_rq->min_vruntime + avg; -+} -+ -+/* -+ * lag_i = S - s_i = w_i * (V - v_i) -+ * -+ * However, since V is approximated by the weighted average of all entities it -+ * is possible -- by addition/removal/reweight to the tree -- to move V around -+ * and end up with a larger lag than we started with. -+ * -+ * Limit this to either double the slice length with a minimum of TICK_NSEC -+ * since that is the timing granularity. -+ * -+ * EEVDF gives the following limit for a steady state system: -+ * -+ * -r_max < lag < max(r_max, q) -+ * -+ * XXX could add max_slice to the augmented data to track this. -+ */ -+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) -+{ -+ s64 lag, limit; -+ -+ SCHED_WARN_ON(!se->on_rq); -+ lag = avg_vruntime(cfs_rq) - se->vruntime; -+ -+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); -+ se->vlag = clamp(lag, -limit, limit); -+} -+ -+/* -+ * Entity is eligible once it received less service than it ought to have, -+ * eg. lag >= 0. 
-+ * -+ * lag_i = S - s_i = w_i*(V - v_i) -+ * -+ * lag_i >= 0 -> V >= v_i -+ * -+ * \Sum (v_i - v)*w_i -+ * V = ------------------ + v -+ * \Sum w_i -+ * -+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) -+ * -+ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due -+ * to the loss in precision caused by the division. -+ */ -+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) -+{ -+ struct sched_entity *curr = cfs_rq->curr; -+ s64 avg = cfs_rq->avg_vruntime; -+ long load = cfs_rq->avg_load; -+ -+ if (curr && curr->on_rq) { -+ unsigned long weight = scale_load_down(curr->load.weight); -+ -+ avg += entity_key(cfs_rq, curr) * weight; -+ load += weight; -+ } -+ -+ return avg >= entity_key(cfs_rq, se) * load; -+} -+ -+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) -+{ -+ u64 min_vruntime = cfs_rq->min_vruntime; -+ /* -+ * open coded max_vruntime() to allow updating avg_vruntime -+ */ -+ s64 delta = (s64)(vruntime - min_vruntime); -+ if (delta > 0) { -+ avg_vruntime_update(cfs_rq, delta); -+ min_vruntime = vruntime; -+ } -+ return min_vruntime; -+} -+ - static void update_min_vruntime(struct cfs_rq *cfs_rq) - { -+ struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *curr = cfs_rq->curr; -- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); - - u64 vruntime = cfs_rq->min_vruntime; - -@@ -636,9 +773,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) - curr = NULL; - } - -- if (leftmost) { /* non-empty tree */ -- struct sched_entity *se = __node_2_se(leftmost); -- -+ if (se) { - if (!curr) - vruntime = se->vruntime; - else -@@ -647,7 +782,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) - - /* ensure we never gain time by being placed backwards. 
*/ - u64_u32_store(cfs_rq->min_vruntime, -- max_vruntime(cfs_rq->min_vruntime, vruntime)); -+ __update_min_vruntime(cfs_rq, vruntime)); - } - - static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) -@@ -655,17 +790,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) - return entity_before(__node_2_se(a), __node_2_se(b)); - } - -+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) -+ -+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node) -+{ -+ if (node) { -+ struct sched_entity *rse = __node_2_se(node); -+ if (deadline_gt(min_deadline, se, rse)) -+ se->min_deadline = rse->min_deadline; -+ } -+} -+ -+/* -+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) -+ */ -+static inline bool min_deadline_update(struct sched_entity *se, bool exit) -+{ -+ u64 old_min_deadline = se->min_deadline; -+ struct rb_node *node = &se->run_node; -+ -+ se->min_deadline = se->deadline; -+ __update_min_deadline(se, node->rb_right); -+ __update_min_deadline(se, node->rb_left); -+ -+ return se->min_deadline == old_min_deadline; -+} -+ -+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, -+ run_node, min_deadline, min_deadline_update); -+ +@@ -308,11 +424,19 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight /* - * Enqueue an entity into the rb-tree: + * delta /= w */ - static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) ++#ifdef CONFIG_SCHED_BORE ++static inline u64 ++__calc_delta_fair(u64 delta, struct sched_entity *se, bool bscale) ++#else // CONFIG_SCHED_BORE + static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) ++#endif // CONFIG_SCHED_BORE { -- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); -+ avg_vruntime_add(cfs_rq, se); -+ se->min_deadline = se->deadline; -+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, -+ __entity_less, &min_deadline_cb); + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); + ++#ifdef CONFIG_SCHED_BORE ++ if (bscale && sched_bore) delta = penalty_scale(delta, se); ++#endif // CONFIG_SCHED_BORE + return delta; } - static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) - { -- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); -+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, -+ &min_deadline_cb); -+ avg_vruntime_sub(cfs_rq, se); +@@ -708,7 +832,11 @@ void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) + SCHED_WARN_ON(!se->on_rq); + lag = avg_vruntime(cfs_rq) - se->vruntime; + ++#ifdef CONFIG_SCHED_BORE ++ limit = calc_delta_fair_unscaled(max_t(u64, 2*se->slice, TICK_NSEC), se); ++#else // CONFIG_SCHED_BORE + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); ++#endif // CONFIG_SCHED_BORE + se->vlag = clamp(lag, -limit, limit); } - struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) -@@ -678,14 +847,81 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) - return __node_2_se(left); - } - --static struct sched_entity *__pick_next_entity(struct sched_entity *se) -+/* -+ * Earliest Eligible Virtual Deadline First -+ * -+ * In order to provide latency guarantees for different request sizes -+ * EEVDF selects the best runnable task from two criteria: -+ * -+ * 1) the task must be eligible (must be owed service) -+ * -+ * 2) from those tasks that meet 1), we select the 
one -+ * with the earliest virtual deadline. -+ * -+ * We can do this in O(log n) time due to an augmented RB-tree. The -+ * tree keeps the entries sorted on service, but also functions as a -+ * heap based on the deadline by keeping: -+ * -+ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline) -+ * -+ * Which allows an EDF like search on (sub)trees. -+ */ -+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) - { -- struct rb_node *next = rb_next(&se->run_node); -+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; -+ struct sched_entity *curr = cfs_rq->curr; -+ struct sched_entity *best = NULL; - -- if (!next) -- return NULL; -+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) -+ curr = NULL; -+ -+ while (node) { -+ struct sched_entity *se = __node_2_se(node); - -- return __node_2_se(next); -+ /* -+ * If this entity is not eligible, try the left subtree. -+ */ -+ if (!entity_eligible(cfs_rq, se)) { -+ node = node->rb_left; -+ continue; -+ } -+ -+ /* -+ * If this entity has an earlier deadline than the previous -+ * best, take this one. If it also has the earliest deadline -+ * of its subtree, we're done. -+ */ -+ if (!best || deadline_gt(deadline, best, se)) { -+ best = se; -+ if (best->deadline == best->min_deadline) -+ break; -+ } -+ -+ /* -+ * If the earlest deadline in this subtree is in the fully -+ * eligible left half of our space, go there. -+ */ -+ if (node->rb_left && -+ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { -+ node = node->rb_left; -+ continue; -+ } -+ -+ node = node->rb_right; -+ } -+ -+ if (!best || (curr && deadline_gt(deadline, best, curr))) -+ best = curr; -+ -+ if (unlikely(!best)) { -+ struct sched_entity *left = __pick_first_entity(cfs_rq); -+ if (left) { -+ pr_err("EEVDF scheduling fail, picking leftmost\n"); -+ return left; -+ } -+ } -+ -+ return best; - } - - #ifdef CONFIG_SCHED_DEBUG -@@ -707,104 +943,53 @@ int sched_update_scaling(void) - { - unsigned int factor = get_update_sysctl_factor(); - -- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, -- sysctl_sched_min_granularity); -- +@@ -946,6 +1074,7 @@ int sched_update_scaling(void) #define WRT_SYSCTL(name) \ (normalized_sysctl_##name = sysctl_##name / (factor)) -- WRT_SYSCTL(sched_min_granularity); -- WRT_SYSCTL(sched_latency); -- WRT_SYSCTL(sched_wakeup_granularity); -+ WRT_SYSCTL(sched_base_slice); + WRT_SYSCTL(sched_base_slice); ++ WRT_SYSCTL(sched_wakeup_granularity); #undef WRT_SYSCTL return 0; - } - #endif - --/* -- * delta /= w -- */ --static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) -+void set_latency_fair(struct sched_entity *se, int prio) - { -- if (unlikely(se->load.weight != NICE_0_LOAD)) -- delta = __calc_delta(delta, NICE_0_LOAD, &se->load); -+ u32 weight = sched_prio_to_weight[prio]; -+ u64 base = sysctl_sched_base_slice; - -- return delta; --} -- --/* -- * The idea is to set a period in which each task runs once. -- * -- * When there are too many tasks (sched_nr_latency) we have to stretch -- * this period because otherwise the slices get too small. -- * -- * p = (nr <= nl) ? l : l*nr/nl -- */ --static u64 __sched_period(unsigned long nr_running) --{ -- if (unlikely(nr_running > sched_nr_latency)) -- return nr_running * sysctl_sched_min_granularity; -- else -- return sysctl_sched_latency; -+ /* -+ * For EEVDF the virtual time slope is determined by w_i (iow. -+ * nice) while the request time r_i is determined by -+ * latency-nice. -+ * -+ * Smaller request gets better latency. 
-+ */ -+ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight); - } - --static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq); -+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); - - /* -- * We calculate the wall-time slice from the period by taking a part -- * proportional to the weight. -- * -- * s = p*P[w/rw] -+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i -+ * this is probably good enough. - */ --static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) -+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) - { -- unsigned int nr_running = cfs_rq->nr_running; -- struct sched_entity *init_se = se; -- unsigned int min_gran; -- u64 slice; -- -- if (sched_feat(ALT_PERIOD)) -- nr_running = rq_of(cfs_rq)->cfs.h_nr_running; -- -- slice = __sched_period(nr_running + !se->on_rq); -- -- for_each_sched_entity(se) { -- struct load_weight *load; -- struct load_weight lw; -- struct cfs_rq *qcfs_rq; -- -- qcfs_rq = cfs_rq_of(se); -- load = &qcfs_rq->load; -- -- if (unlikely(!se->on_rq)) { -- lw = qcfs_rq->load; -- -- update_load_add(&lw, se->load.weight); -- load = &lw; -- } -- slice = __calc_delta(slice, se->load.weight, load); -- } -+ if ((s64)(se->vruntime - se->deadline) < 0) -+ return; - -- if (sched_feat(BASE_SLICE)) { -- if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq)) -- min_gran = sysctl_sched_idle_min_granularity; -- else -- min_gran = sysctl_sched_min_granularity; -+ /* -+ * EEVDF: vd_i = ve_i + r_i / w_i -+ */ -+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se); - -- slice = max_t(u64, slice, min_gran); -+ /* -+ * The task has consumed its request, reschedule. -+ */ -+ if (cfs_rq->nr_running > 1) { -+ resched_curr(rq_of(cfs_rq)); -+ clear_buddies(cfs_rq, se); - } -- -- return slice; --} -- --/* -- * We calculate the vruntime slice of a to-be-inserted task. -- * -- * vs = s/w -- */ --static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) --{ -- return calc_delta_fair(sched_slice(cfs_rq, se), se); - } - - #include "pelt.h" -@@ -939,6 +1124,7 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -1123,6 +1252,11 @@ static void update_curr(struct cfs_rq *cfs_rq) + curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); ++#ifdef CONFIG_SCHED_BORE ++ curr->burst_time += delta_exec; ++ curr->max_burst_time = max(curr->max_burst_time, curr->burst_time); ++ update_burst_score(curr); ++#endif // CONFIG_SCHED_BORE curr->vruntime += calc_delta_fair(delta_exec, curr); -+ update_deadline(cfs_rq, curr); + update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); - - if (entity_is_task(curr)) { -@@ -3393,16 +3579,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } - static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) - { -+ unsigned long old_weight = se->load.weight; -+ - if (se->on_rq) { - /* commit outstanding execution time */ - if (cfs_rq->curr == se) - update_curr(cfs_rq); -+ else -+ avg_vruntime_sub(cfs_rq, se); - update_load_sub(&cfs_rq->load, se->load.weight); - } - dequeue_load_avg(cfs_rq, se); - - update_load_set(&se->load, weight); - -+ if (!se->on_rq) { -+ /* -+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), -+ * we need to scale se->vlag when w_i changes. 
-+ */ -+ se->vlag = div_s64(se->vlag * old_weight, weight); -+ } else { -+ s64 deadline = se->deadline - se->vruntime; -+ /* -+ * When the weight changes, the virtual time slope changes and -+ * we should adjust the relative virtual deadline accordingly. -+ */ -+ deadline = div_s64(deadline * old_weight, weight); -+ se->deadline = se->vruntime + deadline; -+ } -+ - #ifdef CONFIG_SMP - do { - u32 divider = get_pelt_divider(&se->avg); -@@ -3412,9 +3618,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - #endif - - enqueue_load_avg(cfs_rq, se); -- if (se->on_rq) -+ if (se->on_rq) { - update_load_add(&cfs_rq->load, se->load.weight); -- -+ if (cfs_rq->curr != se) -+ avg_vruntime_add(cfs_rq, se); -+ } - } - - void reweight_task(struct task_struct *p, int prio) -@@ -4710,158 +4918,151 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - - #endif /* CONFIG_SMP */ - --static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) --{ --#ifdef CONFIG_SCHED_DEBUG -- s64 d = se->vruntime - cfs_rq->min_vruntime; -- -- if (d < 0) -- d = -d; -- -- if (d > 3*sysctl_sched_latency) -- schedstat_inc(cfs_rq->nr_spread_over); --#endif --} -- --static inline bool entity_is_long_sleeper(struct sched_entity *se) -+static inline bool -+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - { -- struct cfs_rq *cfs_rq; -- u64 sleep_time; -+ u64 now; - -- if (se->exec_start == 0) -+ if (!(flags & ENQUEUE_WAKEUP)) - return false; - -- cfs_rq = cfs_rq_of(se); -- -- sleep_time = rq_clock_task(rq_of(cfs_rq)); -- -- /* Happen while migrating because of clock task divergence */ -- if (sleep_time <= se->exec_start) -- return false; -- -- sleep_time -= se->exec_start; -- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) -+ if (flags & ENQUEUE_MIGRATED) - return true; - -- return false; -+ now = rq_clock_task(rq_of(cfs_rq)); -+ return (s64)(se->exec_start - now) >= se->slice; - } - - static void --place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) -+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - { -- u64 vruntime = cfs_rq->min_vruntime; -+ u64 vslice = calc_delta_fair(se->slice, se); -+ u64 vruntime = avg_vruntime(cfs_rq); -+ s64 lag = 0; - - /* -- * The 'current' period is already promised to the current tasks, -- * however the extra weight of the new task will slow them down a -- * little, place the new task so that it fits in the slot that -- * stays open at the end. -+ * Due to how V is constructed as the weighted average of entities, -+ * adding tasks with positive lag, or removing tasks with negative lag -+ * will move 'time' backwards, this can screw around with the lag of -+ * other tasks. -+ * -+ * EEVDF: placement strategy #1 / #2 - */ -- if (initial && sched_feat(START_DEBIT)) -- vruntime += sched_vslice(cfs_rq, se); -+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { -+ struct sched_entity *curr = cfs_rq->curr; -+ unsigned long load; - -- /* sleeps up to a single latency don't count. */ -- if (!initial) { -- unsigned long thresh; -+ lag = se->vlag; - -- if (se_is_idle(se)) -- thresh = sysctl_sched_min_granularity; -- else -- thresh = sysctl_sched_latency; -+ /* -+ * For latency sensitive tasks; those that have a shorter than -+ * average slice and do not fully consume the slice, transition -+ * to EEVDF placement strategy #2. 
-+ */ -+ if (sched_feat(PLACE_FUDGE) && -+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) && -+ entity_has_slept(cfs_rq, se, flags)) { -+ lag += vslice; -+ if (lag > 0) -+ lag = 0; -+ } - - /* -- * Halve their sleep time's effect, to allow -- * for a gentler effect of sleepers: -+ * If we want to place a task and preserve lag, we have to -+ * consider the effect of the new entity on the weighted -+ * average and compensate for this, otherwise lag can quickly -+ * evaporate. -+ * -+ * Lag is defined as: -+ * -+ * lag_i = S - s_i = w_i * (V - v_i) -+ * -+ * To avoid the 'w_i' term all over the place, we only track -+ * the virtual lag: -+ * -+ * vl_i = V - v_i <=> v_i = V - vl_i -+ * -+ * And we take V to be the weighted average of all v: -+ * -+ * V = (\Sum w_j*v_j) / W -+ * -+ * Where W is: \Sum w_j -+ * -+ * Then, the weighted average after adding an entity with lag -+ * vl_i is given by: -+ * -+ * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) -+ * = (W*V + w_i*(V - vl_i)) / (W + w_i) -+ * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) -+ * = (V*(W + w_i) - w_i*l) / (W + w_i) -+ * = V - w_i*vl_i / (W + w_i) -+ * -+ * And the actual lag after adding an entity with vl_i is: -+ * -+ * vl'_i = V' - v_i -+ * = V - w_i*vl_i / (W + w_i) - (V - vl_i) -+ * = vl_i - w_i*vl_i / (W + w_i) -+ * -+ * Which is strictly less than vl_i. So in order to preserve lag -+ * we should inflate the lag before placement such that the -+ * effective lag after placement comes out right. -+ * -+ * As such, invert the above relation for vl'_i to get the vl_i -+ * we need to use such that the lag after placement is the lag -+ * we computed before dequeue. -+ * -+ * vl'_i = vl_i - w_i*vl_i / (W + w_i) -+ * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i) -+ * -+ * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i -+ * = W*vl_i -+ * -+ * vl_i = (W + w_i)*vl'_i / W - */ -- if (sched_feat(GENTLE_FAIR_SLEEPERS)) -- thresh >>= 1; -- -- vruntime -= thresh; -- } -- -- /* -- * Pull vruntime of the entity being placed to the base level of -- * cfs_rq, to prevent boosting it if placed backwards. -- * However, min_vruntime can advance much faster than real time, with -- * the extreme being when an entity with the minimal weight always runs -- * on the cfs_rq. If the waking entity slept for a long time, its -- * vruntime difference from min_vruntime may overflow s64 and their -- * comparison may get inversed, so ignore the entity's original -- * vruntime in that case. -- * The maximal vruntime speedup is given by the ratio of normal to -- * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES. -- * When placing a migrated waking entity, its exec_start has been set -- * from a different rq. In order to take into account a possible -- * divergence between new and prev rq's clocks task because of irq and -- * stolen time, we take an additional margin. -- * So, cutting off on the sleep time of -- * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days -- * should be safe. -- */ -- if (entity_is_long_sleeper(se)) -- se->vruntime = vruntime; -- else -- se->vruntime = max_vruntime(se->vruntime, vruntime); -+ load = cfs_rq->avg_load; -+ if (curr && curr->on_rq) -+ load += scale_load_down(curr->load.weight); -+ -+ lag *= load + scale_load_down(se->load.weight); -+ if (WARN_ON_ONCE(!load)) -+ load = 1; -+ lag = div_s64(lag, load); -+ } -+ -+ se->vruntime = vruntime - lag; -+ -+ /* -+ * When joining the competition; the exisiting tasks will be, -+ * on average, halfway through their slice, as such start tasks -+ * off with half a slice to ease into the competition. 
-+ */ -+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) -+ vslice /= 2; -+ -+ /* -+ * EEVDF: vd_i = ve_i + r_i/w_i -+ */ -+ se->deadline = se->vruntime + vslice; - } - - static void check_enqueue_throttle(struct cfs_rq *cfs_rq); - - static inline bool cfs_bandwidth_used(void); - --/* -- * MIGRATION -- * -- * dequeue -- * update_curr() -- * update_min_vruntime() -- * vruntime -= min_vruntime -- * -- * enqueue -- * update_curr() -- * update_min_vruntime() -- * vruntime += min_vruntime -- * -- * this way the vruntime transition between RQs is done when both -- * min_vruntime are up-to-date. -- * -- * WAKEUP (remote) -- * -- * ->migrate_task_rq_fair() (p->state == TASK_WAKING) -- * vruntime -= min_vruntime -- * -- * enqueue -- * update_curr() -- * update_min_vruntime() -- * vruntime += min_vruntime -- * -- * this way we don't have the most up-to-date min_vruntime on the originating -- * CPU and an up-to-date min_vruntime on the destination CPU. -- */ -- - static void - enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - { -- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); - bool curr = cfs_rq->curr == se; - - /* - * If we're the current task, we must renormalise before calling - * update_curr(). - */ -- if (renorm && curr) -- se->vruntime += cfs_rq->min_vruntime; -+ if (curr) -+ place_entity(cfs_rq, se, flags); - - update_curr(cfs_rq); - -- /* -- * Otherwise, renormalise after, such that we're placed at the current -- * moment in time, instead of some random moment in the past. Being -- * placed in the past could significantly boost this task to the -- * fairness detriment of existing tasks. -- */ -- if (renorm && !curr) -- se->vruntime += cfs_rq->min_vruntime; -- - /* - * When enqueuing a sched_entity, we must: - * - Update loads to have both entity and cfs_rq synced with now. -@@ -4873,18 +5074,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - */ - update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); - se_update_runnable(se); -+ /* -+ * XXX update_load_avg() above will have attached us to the pelt sum; -+ * but update_cfs_group() here will re-adjust the weight and have to -+ * undo/redo all that. Seems wasteful. -+ */ - update_cfs_group(se); -+ -+ /* -+ * XXX now that the entity has been re-weighted, and it's lag adjusted, -+ * we can place the entity. 
-+ */ -+ if (!curr) -+ place_entity(cfs_rq, se, flags); -+ - account_entity_enqueue(cfs_rq, se); - -- if (flags & ENQUEUE_WAKEUP) -- place_entity(cfs_rq, se, 0); - /* Entity has migrated, no longer consider this task hot */ - if (flags & ENQUEUE_MIGRATED) - se->exec_start = 0; - - check_schedstat_required(); - update_stats_enqueue_fair(cfs_rq, se, flags); -- check_spread(cfs_rq, se); - if (!curr) - __enqueue_entity(cfs_rq, se); - se->on_rq = 1; -@@ -4896,17 +5107,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - } - } - --static void __clear_buddies_last(struct sched_entity *se) --{ -- for_each_sched_entity(se) { -- struct cfs_rq *cfs_rq = cfs_rq_of(se); -- if (cfs_rq->last != se) -- break; -- -- cfs_rq->last = NULL; -- } --} -- - static void __clear_buddies_next(struct sched_entity *se) - { - for_each_sched_entity(se) { -@@ -4918,27 +5118,10 @@ static void __clear_buddies_next(struct sched_entity *se) - } - } - --static void __clear_buddies_skip(struct sched_entity *se) --{ -- for_each_sched_entity(se) { -- struct cfs_rq *cfs_rq = cfs_rq_of(se); -- if (cfs_rq->skip != se) -- break; -- -- cfs_rq->skip = NULL; -- } --} -- - static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) - { -- if (cfs_rq->last == se) -- __clear_buddies_last(se); -- - if (cfs_rq->next == se) - __clear_buddies_next(se); -- -- if (cfs_rq->skip == se) -- __clear_buddies_skip(se); - } - - static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -4972,20 +5155,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - - clear_buddies(cfs_rq, se); - -+ update_entity_lag(cfs_rq, se); - if (se != cfs_rq->curr) - __dequeue_entity(cfs_rq, se); - se->on_rq = 0; - account_entity_dequeue(cfs_rq, se); - -- /* -- * Normalize after update_curr(); which will also have moved -- * min_vruntime if @se is the one holding it back. But before doing -- * update_min_vruntime() again, which will discount @se's position and -- * can move min_vruntime forward still more. -- */ -- if (!(flags & DEQUEUE_SLEEP)) -- se->vruntime -= cfs_rq->min_vruntime; -- - /* return excess runtime on last dequeue */ - return_cfs_rq_runtime(cfs_rq); - -@@ -5004,52 +5179,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - update_idle_cfs_rq_clock_pelt(cfs_rq); - } - --/* -- * Preempt the current task with a newly woken task if needed: -- */ --static void --check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) --{ -- unsigned long ideal_runtime, delta_exec; -- struct sched_entity *se; -- s64 delta; -- -- /* -- * When many tasks blow up the sched_period; it is possible that -- * sched_slice() reports unusually large results (when many tasks are -- * very light for example). Therefore impose a maximum. -- */ -- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); -- -- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; -- if (delta_exec > ideal_runtime) { -- resched_curr(rq_of(cfs_rq)); -- /* -- * The current task ran long enough, ensure it doesn't get -- * re-elected due to buddy favours. -- */ -- clear_buddies(cfs_rq, curr); -- return; -- } -- -- /* -- * Ensure that a task that missed wakeup preemption by a -- * narrow margin doesn't have to wait for a full slice. -- * This also mitigates buddy induced latencies under load. 
-- */ -- if (delta_exec < sysctl_sched_min_granularity) -- return; -- -- se = __pick_first_entity(cfs_rq); -- delta = curr->vruntime - se->vruntime; -- -- if (delta < 0) -- return; -- -- if (delta > ideal_runtime) -- resched_curr(rq_of(cfs_rq)); --} -- - static void - set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) - { -@@ -5088,9 +5217,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -5217,6 +5351,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } --static int --wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); -- ++static int ++wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); ++ /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups -@@ -5101,50 +5227,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +@@ -5227,14 +5364,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { -- struct sched_entity *left = __pick_first_entity(cfs_rq); -- struct sched_entity *se; -- ++ struct sched_entity *candidate = pick_eevdf(cfs_rq); /* -- * If curr is set we have to see if its left of the leftmost entity -- * still in the tree, provided there was anything in the tree at all. -+ * Enabling NEXT_BUDDY will affect latency but not fairness. + * Enabling NEXT_BUDDY will affect latency but not fairness. */ -- if (!left || (curr && entity_before(curr, left))) -- left = curr; -- -- se = left; /* ideally we run the leftmost entity */ -- -- /* -- * Avoid running the skip buddy, if running something else can -- * be done without getting too unfair. -- */ -- if (cfs_rq->skip && cfs_rq->skip == se) { -- struct sched_entity *second; -- -- if (se == curr) { -- second = __pick_first_entity(cfs_rq); -- } else { -- second = __pick_next_entity(se); -- if (!second || (curr && entity_before(curr, second))) -- second = curr; -- } -- -- if (second && wakeup_preempt_entity(second, left) < 1) -- se = second; -- } -- -- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { -- /* -- * Someone really wants this to run. If it's not unfair, run it. -- */ -- se = cfs_rq->next; -- } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { -- /* -- * Prefer last buddy, try to return the CPU to a preempted task. -- */ -- se = cfs_rq->last; -- } -+ if (sched_feat(NEXT_BUDDY) && -+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) -+ return cfs_rq->next; + if (sched_feat(NEXT_BUDDY) && +- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next) && ++ wakeup_preempt_entity(cfs_rq->next, candidate) < 1) + return cfs_rq->next; -- return se; -+ return pick_eevdf(cfs_rq); +- return pick_eevdf(cfs_rq); ++ return candidate; } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -5161,8 +5251,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) - /* throttle cfs_rqs exceeding runtime */ - check_cfs_rq_runtime(cfs_rq); - -- check_spread(cfs_rq, prev); -- - if (prev->on_rq) { - update_stats_wait_start_fair(cfs_rq, prev); - /* Put 'current' back into the tree. 
*/ -@@ -5203,9 +5291,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) - hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) - return; - #endif -- -- if (cfs_rq->nr_running > 1) -- check_preempt_tick(cfs_rq, curr); +@@ -6464,6 +6603,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + hrtick_update(rq); } - -@@ -6210,13 +6295,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} - static void hrtick_start_fair(struct rq *rq, struct task_struct *p) - { - struct sched_entity *se = &p->se; -- struct cfs_rq *cfs_rq = cfs_rq_of(se); - - SCHED_WARN_ON(task_rq(p) != rq); - - if (rq->cfs.h_nr_running > 1) { -- u64 slice = sched_slice(cfs_rq, se); - u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; -+ u64 slice = se->slice; - s64 delta = slice - ran; - - if (delta < 0) { -@@ -6240,8 +6324,7 @@ static void hrtick_update(struct rq *rq) - if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) - return; - -- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) -- hrtick_start_fair(rq, curr); -+ hrtick_start_fair(rq, curr); - } - #else /* !CONFIG_SCHED_HRTICK */ - static inline void -@@ -6282,17 +6365,6 @@ static int sched_idle_rq(struct rq *rq) - rq->nr_running); - } - --/* -- * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use -- * of idle_nr_running, which does not consider idle descendants of normal -- * entities. -- */ --static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq) --{ -- return cfs_rq->nr_running && -- cfs_rq->nr_running == cfs_rq->idle_nr_running; --} -- - #ifdef CONFIG_SMP - static int sched_idle_cpu(int cpu) - { -@@ -7778,18 +7850,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) - { - struct sched_entity *se = &p->se; - -- /* -- * As blocked tasks retain absolute vruntime the migration needs to -- * deal with this by subtracting the old and adding the new -- * min_vruntime -- the latter is done by enqueue_entity() when placing -- * the task on the new runqueue. -- */ -- if (READ_ONCE(p->__state) == TASK_WAKING) { -- struct cfs_rq *cfs_rq = cfs_rq_of(se); -- -- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); -- } -- - if (!task_on_rq_migrating(p)) { - remove_entity_load_avg(se); - -@@ -7827,66 +7887,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - } - #endif /* CONFIG_SMP */ - --static unsigned long wakeup_gran(struct sched_entity *se) --{ -- unsigned long gran = sysctl_sched_wakeup_granularity; -- -- /* -- * Since its curr running now, convert the gran from real-time -- * to virtual-time in his units. -- * -- * By using 'se' instead of 'curr' we penalize light tasks, so -- * they get preempted easier. That is, if 'se' < 'curr' then -- * the resulting gran will be larger, therefore penalizing the -- * lighter, if otoh 'se' > 'curr' then the resulting gran will -- * be smaller, again penalizing the lighter task. -- * -- * This is especially important for buddies when the leftmost -- * task is higher priority than the buddy. -- */ -- return calc_delta_fair(gran, se); --} -- --/* -- * Should 'se' preempt 'curr'. 
-- * -- * |s1 -- * |s2 -- * |s3 -- * g -- * |<--->|c -- * -- * w(c, s1) = -1 -- * w(c, s2) = 0 -- * w(c, s3) = 1 -- * -- */ --static int --wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) --{ -- s64 gran, vdiff = curr->vruntime - se->vruntime; -- -- if (vdiff <= 0) -- return -1; -- -- gran = wakeup_gran(se); -- if (vdiff > gran) -- return 1; -- -- return 0; --} -- --static void set_last_buddy(struct sched_entity *se) --{ -- for_each_sched_entity(se) { -- if (SCHED_WARN_ON(!se->on_rq)) -- return; -- if (se_is_idle(se)) -- return; -- cfs_rq_of(se)->last = se; -- } --} -- - static void set_next_buddy(struct sched_entity *se) - { - for_each_sched_entity(se) { -@@ -7898,12 +7898,6 @@ static void set_next_buddy(struct sched_entity *se) - } - } - --static void set_skip_buddy(struct sched_entity *se) --{ -- for_each_sched_entity(se) -- cfs_rq_of(se)->skip = se; --} -- - /* - * Preempt the current task with a newly woken task if needed: - */ -@@ -7912,7 +7906,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); -- int scale = cfs_rq->nr_running >= sched_nr_latency; - int next_buddy_marked = 0; - int cse_is_idle, pse_is_idle; - -@@ -7928,7 +7921,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ - if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) - return; - -- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { -+ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { - set_next_buddy(pse); - next_buddy_marked = 1; - } -@@ -7973,35 +7966,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ - if (cse_is_idle != pse_is_idle) - return; - -- update_curr(cfs_rq_of(se)); -- if (wakeup_preempt_entity(se, pse) == 1) { -- /* -- * Bias pick_next to pick the sched entity that is -- * triggering this preemption. -- */ -- if (!next_buddy_marked) -- set_next_buddy(pse); -+ cfs_rq = cfs_rq_of(se); -+ update_curr(cfs_rq); -+ -+ /* -+ * XXX pick_eevdf(cfs_rq) != se ? -+ */ -+ if (pick_eevdf(cfs_rq) == pse) - goto preempt; -- } - - return; - - preempt: - resched_curr(rq); -- /* -- * Only set the backward buddy when the current task is still -- * on the rq. This can happen when a wakeup gets interleaved -- * with schedule on the ->pre_schedule() or idle_balance() -- * point, either of which can * drop the rq lock. -- * -- * Also, during early boot the idle thread is in the fair class, -- * for obvious reasons its a bad idea to schedule back to it. -- */ -- if (unlikely(!se->on_rq || curr == rq->idle)) -- return; -- -- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) -- set_last_buddy(se); - } - - #ifdef CONFIG_SMP -@@ -8202,8 +8179,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) - - /* - * sched_yield() is very simple -- * -- * The magic of dealing with the ->skip buddy is in pick_next_entity. - */ - static void yield_task_fair(struct rq *rq) - { -@@ -8219,21 +8194,19 @@ static void yield_task_fair(struct rq *rq) - - clear_buddies(cfs_rq, se); - -- if (curr->policy != SCHED_BATCH) { -- update_rq_clock(rq); -- /* -- * Update run-time statistics of the 'current'. -- */ -- update_curr(cfs_rq); -- /* -- * Tell update_rq_clock() that we've just updated, -- * so we don't do microscopic update in schedule() -- * and double the fastpath cost. 
-- */ -- rq_clock_skip_update(rq); -- } -+ update_rq_clock(rq); -+ /* -+ * Update run-time statistics of the 'current'. -+ */ -+ update_curr(cfs_rq); -+ /* -+ * Tell update_rq_clock() that we've just updated, -+ * so we don't do microscopic update in schedule() -+ * and double the fastpath cost. -+ */ -+ rq_clock_skip_update(rq); - -- set_skip_buddy(se); -+ se->deadline += calc_delta_fair(se->slice, se); - } - - static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) -@@ -8476,8 +8449,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) - * Buddy candidates are cache hot: - */ - if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && -- (&p->se == cfs_rq_of(&p->se)->next || -- &p->se == cfs_rq_of(&p->se)->last)) -+ (&p->se == cfs_rq_of(&p->se)->next)) - return 1; - - if (sysctl_sched_migration_cost == -1) -@@ -11987,8 +11959,8 @@ static void rq_offline_fair(struct rq *rq) - static inline bool - __entity_slice_used(struct sched_entity *se, int min_nr_tasks) - { -- u64 slice = sched_slice(cfs_rq_of(se), se); - u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; -+ u64 slice = se->slice; - - return (rtime * min_nr_tasks > slice); - } -@@ -12144,8 +12116,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) - */ - static void task_fork_fair(struct task_struct *p) - { -- struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, *curr; -+ struct cfs_rq *cfs_rq; - struct rq *rq = this_rq(); - struct rq_flags rf; - -@@ -12154,22 +12126,9 @@ static void task_fork_fair(struct task_struct *p) - - cfs_rq = task_cfs_rq(current); - curr = cfs_rq->curr; -- if (curr) { -+ if (curr) - update_curr(cfs_rq); -- se->vruntime = curr->vruntime; -- } -- place_entity(cfs_rq, se, 1); -- -- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { -- /* -- * Upon rescheduling, sched_class::put_prev_task() will place -- * 'current' within the tree based on its new key value. -- */ -- swap(curr->vruntime, se->vruntime); -- resched_curr(rq); -- } -- -- se->vruntime -= cfs_rq->min_vruntime; -+ place_entity(cfs_rq, se, ENQUEUE_INITIAL); - rq_unlock(rq, &rf); - } - -@@ -12198,34 +12157,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) - check_preempt_curr(rq, p, 0); - } - --static inline bool vruntime_normalized(struct task_struct *p) --{ -- struct sched_entity *se = &p->se; -- -- /* -- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, -- * the dequeue_entity(.flags=0) will already have normalized the -- * vruntime. -- */ -- if (p->on_rq) -- return true; -- -- /* -- * When !on_rq, vruntime of the task has usually NOT been normalized. -- * But there are some cases where it has already been normalized: -- * -- * - A forked child which is waiting for being woken up by -- * wake_up_new_task(). -- * - A task which has been woken up by try_to_wake_up() and -- * waiting for actually being woken up by sched_ttwu_pending(). 
-- */ -- if (!se->sum_exec_runtime || -- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup)) -- return true; -- -- return false; --} -- - #ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Propagate the changes of the sched_entity across the tg tree to make it -@@ -12296,16 +12227,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) - static void detach_task_cfs_rq(struct task_struct *p) - { - struct sched_entity *se = &p->se; -- struct cfs_rq *cfs_rq = cfs_rq_of(se); -- -- if (!vruntime_normalized(p)) { -- /* -- * Fix up our vruntime so that the current sleep doesn't -- * cause 'unlimited' sleep bonus. -- */ -- place_entity(cfs_rq, se, 0); -- se->vruntime -= cfs_rq->min_vruntime; -- } - - detach_entity_cfs_rq(se); - } -@@ -12313,12 +12234,8 @@ static void detach_task_cfs_rq(struct task_struct *p) - static void attach_task_cfs_rq(struct task_struct *p) - { - struct sched_entity *se = &p->se; -- struct cfs_rq *cfs_rq = cfs_rq_of(se); - - attach_entity_cfs_rq(se); -- -- if (!vruntime_normalized(p)) -- se->vruntime += cfs_rq->min_vruntime; - } - - static void switched_from_fair(struct rq *rq, struct task_struct *p) -@@ -12429,6 +12346,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) - goto err; - - tg->shares = NICE_0_LOAD; -+ tg->latency_prio = DEFAULT_PRIO; - - init_cfs_bandwidth(tg_cfs_bandwidth(tg)); - -@@ -12527,6 +12445,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - } - - se->my_q = cfs_rq; -+ -+ set_latency_fair(se, tg->latency_prio - MAX_RT_PRIO); -+ - /* guarantee group entities always have weight */ - update_load_set(&se->load, NICE_0_LOAD); - se->parent = parent; -@@ -12657,6 +12578,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) - return 0; - } - -+int sched_group_set_latency(struct task_group *tg, int prio) ++static unsigned long wakeup_gran(struct sched_entity *se) +{ -+ int i; ++ unsigned long gran = sysctl_sched_wakeup_granularity; ++#ifdef CONFIG_SCHED_BORE ++ return calc_delta_fair_unscaled(gran, se); ++#else // CONFIG_SCHED_BORE ++ return calc_delta_fair(gran, se); ++#endif // CONFIG_SCHED_BORE ++} + -+ if (tg == &root_task_group) -+ return -EINVAL; -+ -+ mutex_lock(&shares_mutex); -+ -+ if (tg->latency_prio == prio) { -+ mutex_unlock(&shares_mutex); -+ return 0; ++static int ++wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) ++{ ++ s64 gran, vdiff = curr->vruntime - se->vruntime; ++#ifdef CONFIG_SCHED_BORE ++ if (sched_bore) { ++ u64 rtime = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; ++ vdiff += wakeup_preempt_backstep_delta(rtime, curr) ++ - wakeup_preempt_backstep_delta(rtime, se); + } ++#endif // CONFIG_SCHED_BORE + -+ tg->latency_prio = prio; ++ if (vdiff <= 0) ++ return -1; + -+ for_each_possible_cpu(i) -+ set_latency_fair(tg->se[i], prio - MAX_RT_PRIO); ++ gran = wakeup_gran(se); ++ if (vdiff > gran) ++ return 1; + -+ mutex_unlock(&shares_mutex); + return 0; +} + - #else /* CONFIG_FAIR_GROUP_SCHED */ + static void set_next_buddy(struct sched_entity *se); - void free_fair_sched_group(struct task_group *tg) { } -@@ -12683,7 +12627,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task - * idle runqueue: + /* +@@ -6482,6 +6653,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + util_est_dequeue(&rq->cfs, p); + + for_each_sched_entity(se) { ++#ifdef CONFIG_SCHED_BORE ++ if (task_sleep) restart_burst(se); ++#endif // CONFIG_SCHED_BORE + cfs_rq = cfs_rq_of(se); + 
dequeue_entity(cfs_rq, se, flags); + +@@ -7972,7 +8146,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + /* + * XXX pick_eevdf(cfs_rq) != se ? */ - if (rq->cfs.load.weight) -- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); -+ rr_interval = NS_TO_JIFFIES(se->slice); +- if (pick_eevdf(cfs_rq) == pse) ++ if ((pick_eevdf(cfs_rq) == pse) && (wakeup_preempt_entity(se, pse) == 1)) + goto preempt; - return rr_interval; - } + return; +@@ -8185,6 +8359,9 @@ static void yield_task_fair(struct rq *rq) + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; ++#ifdef CONFIG_SCHED_BORE ++ restart_burst(se); ++#endif // CONFIG_SCHED_BORE + + /* + * Are we the only task in the tree? diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 9e390eb82e38..ca95044a7479 100644 +index ca95044a7..a7d34d1b2 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h -@@ -1,16 +1,12 @@ - /* SPDX-License-Identifier: GPL-2.0 */ --/* -- * Only give sleepers 50% of their service deficit. This allows -- * them to run sooner, but does not allow tons of sleepers to -- * rip the spread apart. -- */ --SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) - - /* -- * Place new tasks ahead so that they do not starve already running -- * tasks -+ * Using the avg_vruntime, do the right thing and preserve lag across -+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. - */ --SCHED_FEAT(START_DEBIT, true) -+SCHED_FEAT(PLACE_LAG, true) -+SCHED_FEAT(PLACE_FUDGE, true) -+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) - - /* - * Prefer to schedule the task we woke last (assuming it failed -@@ -19,13 +15,6 @@ SCHED_FEAT(START_DEBIT, true) +@@ -13,7 +13,11 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. */ ++#ifdef CONFIG_SCHED_BORE ++SCHED_FEAT(NEXT_BUDDY, true) ++#else // CONFIG_SCHED_BORE SCHED_FEAT(NEXT_BUDDY, false) ++#endif // CONFIG_SCHED_BORE --/* -- * Prefer to schedule the task that ran last (when we did -- * wake-preempt) as that likely will touch the same data, increases -- * cache locality. -- */ --SCHED_FEAT(LAST_BUDDY, true) -- /* * Consider buddies to be cache hot, decreases the likeliness of a - * cache buddy being migrated away, increases cache locality. -@@ -99,6 +88,3 @@ SCHED_FEAT(UTIL_EST, true) - SCHED_FEAT(UTIL_EST_FASTUP, true) - - SCHED_FEAT(LATENCY_WARN, false) -- --SCHED_FEAT(ALT_PERIOD, true) --SCHED_FEAT(BASE_SLICE, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index d8ba81c66579..0ea13cfac95b 100644 +index 0ea13cfac..34cb2fbbb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -372,6 +372,8 @@ struct task_group { - - /* A positive value indicates that this is a SCHED_IDLE group. */ - int idle; -+ /* latency priority of the group. 
*/ -+ int latency_prio; - - #ifdef CONFIG_SMP - /* -@@ -482,6 +484,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); - - extern int sched_group_set_idle(struct task_group *tg, long idle); - -+extern int sched_group_set_latency(struct task_group *tg, int prio); -+ - #ifdef CONFIG_SMP - extern void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next); -@@ -548,6 +552,10 @@ struct cfs_rq { - unsigned int idle_nr_running; /* SCHED_IDLE */ - unsigned int idle_h_nr_running; /* SCHED_IDLE */ - -+ s64 avg_vruntime; -+ u64 avg_slice; -+ u64 avg_load; -+ - u64 exec_clock; - u64 min_vruntime; - #ifdef CONFIG_SCHED_CORE -@@ -567,8 +575,6 @@ struct cfs_rq { - */ - struct sched_entity *curr; - struct sched_entity *next; -- struct sched_entity *last; -- struct sched_entity *skip; - - #ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -@@ -2167,6 +2173,7 @@ extern const u32 sched_prio_to_wmult[40]; - #else - #define ENQUEUE_MIGRATED 0x00 - #endif -+#define ENQUEUE_INITIAL 0x80 - - #define RETRY_TASK ((void *)-1UL) - -@@ -2471,11 +2478,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); - extern const_debug unsigned int sysctl_sched_nr_migrate; +@@ -2479,6 +2479,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; -+extern unsigned int sysctl_sched_base_slice; -+ + extern unsigned int sysctl_sched_base_slice; ++extern unsigned int sysctl_sched_wakeup_granularity; + #ifdef CONFIG_SCHED_DEBUG --extern unsigned int sysctl_sched_latency; --extern unsigned int sysctl_sched_min_granularity; --extern unsigned int sysctl_sched_idle_min_granularity; --extern unsigned int sysctl_sched_wakeup_granularity; extern int sysctl_resched_latency_warn_ms; - extern int sysctl_resched_latency_warn_once; - -@@ -2488,6 +2493,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; - extern unsigned int sysctl_numa_balancing_hot_threshold; - #endif - -+extern void set_latency_fair(struct sched_entity *se, int prio); -+ - #ifdef CONFIG_SCHED_HRTICK - - /* -@@ -3496,4 +3503,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } - static inline void init_sched_mm_cid(struct task_struct *t) { } - #endif - -+extern u64 avg_vruntime(struct cfs_rq *cfs_rq); -+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); -+ - #endif /* _KERNEL_SCHED_SCHED_H */ -diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h -index 3bac0a8ceab2..b2e932c25be6 100644 ---- a/tools/include/uapi/linux/sched.h -+++ b/tools/include/uapi/linux/sched.h -@@ -132,6 +132,7 @@ struct clone_args { - #define SCHED_FLAG_KEEP_PARAMS 0x10 - #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 - #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 -+#define SCHED_FLAG_LATENCY_NICE 0x80 - - #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ - SCHED_FLAG_KEEP_PARAMS) -@@ -143,6 +144,7 @@ struct clone_args { - SCHED_FLAG_RECLAIM | \ - SCHED_FLAG_DL_OVERRUN | \ - SCHED_FLAG_KEEP_ALL | \ -- SCHED_FLAG_UTIL_CLAMP) -+ SCHED_FLAG_UTIL_CLAMP | \ -+ SCHED_FLAG_LATENCY_NICE) - - #endif /* _UAPI_LINUX_SCHED_H */ -- -2.41.0 +2.41.0.rc2 diff --git a/patches/0003-bore.patch b/patches/0003-bore.patch new file mode 100644 index 0000000..64bf802 --- /dev/null +++ b/patches/0003-bore.patch @@ -0,0 +1,421 @@ +From 32c617afc05751783be3eb0f5a1d15e31dfc7919 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Thu, 8 Jun 2023 11:14:23 +0200 +Subject: [PATCH] 
bore-cachy + +Signed-off-by: Piotr Gorski +--- + include/linux/sched.h | 10 +++ + init/Kconfig | 20 ++++++ + kernel/sched/core.c | 62 ++++++++++++++++++ + kernel/sched/debug.c | 3 + + kernel/sched/fair.c | 136 ++++++++++++++++++++++++++++++++++++++++ + kernel/sched/features.h | 8 +++ + 6 files changed, 239 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index eed5d65b8..38fbebe4d 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -557,6 +557,12 @@ struct sched_entity { + u64 sum_exec_runtime; + u64 vruntime; + u64 prev_sum_exec_runtime; ++#ifdef CONFIG_SCHED_BORE ++ u64 prev_burst_time; ++ u64 burst_time; ++ u64 max_burst_time; ++ u8 penalty_score; ++#endif // CONFIG_SCHED_BORE + + u64 nr_migrations; + +@@ -985,6 +991,10 @@ struct task_struct { + struct list_head children; + struct list_head sibling; + struct task_struct *group_leader; ++#ifdef CONFIG_SCHED_BORE ++ u64 child_burst_cache; ++ u64 child_burst_last_cached; ++#endif // CONFIG_SCHED_BORE + + /* + * 'ptraced' is the list of tasks this task is using ptrace() on. +diff --git a/init/Kconfig b/init/Kconfig +index 0147b4a33..4ab7e154b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1290,6 +1290,26 @@ config CHECKPOINT_RESTORE + + If unsure, say N here. + ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ You can turn it off by setting the sysctl kernel.sched_bore = 0. ++ Enabling this feature implies NO_GENTLE_FAIR_SLEEPERS by default. ++ ++ If unsure say Y here. 
++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index bcb3a7e68..a0f227344 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4484,6 +4484,57 @@ int wake_up_state(struct task_struct *p, unsigned int state) + return try_to_wake_up(p, state, 0); + } + ++#ifdef CONFIG_SCHED_BORE ++#define CHILD_BURST_CUTOFF_BITS 9 ++extern unsigned int sched_burst_cache_lifetime; ++ ++void __init sched_init_bore(void) { ++ init_task.child_burst_cache = 0; ++ init_task.child_burst_last_cached = 0; ++ init_task.se.prev_burst_time = 0; ++ init_task.se.burst_time = 0; ++ init_task.se.max_burst_time = 0; ++} ++ ++void inline __sched_fork_bore(struct task_struct *p) { ++ p->child_burst_cache = 0; ++ p->child_burst_last_cached = 0; ++ p->se.burst_time = 0; ++} ++ ++static inline void update_task_child_burst_time_cache(struct task_struct *p) { ++ u64 sum = 0, avg_burst_time = 0; ++ u32 cnt = 0; ++ struct task_struct *child; ++ ++ read_lock(&tasklist_lock); ++ list_for_each_entry(child, &p->children, sibling) { ++ cnt++; ++ sum += child->se.max_burst_time >> CHILD_BURST_CUTOFF_BITS; ++ } ++ read_unlock(&tasklist_lock); ++ ++ if (cnt) avg_burst_time = div_u64(sum, cnt) << CHILD_BURST_CUTOFF_BITS; ++ p->child_burst_cache = max(avg_burst_time, p->se.max_burst_time); ++} ++ ++static void update_task_initial_burst_time(struct task_struct *task) { ++ struct sched_entity *se = &task->se; ++ struct task_struct *par = task->real_parent; ++ u64 ktime = ktime_to_ns(ktime_get()); ++ ++ if (likely(par)) { ++ if (par->child_burst_last_cached + sched_burst_cache_lifetime < ktime) { ++ par->child_burst_last_cached = ktime; ++ update_task_child_burst_time_cache(par); ++ } ++ se->prev_burst_time = max(se->prev_burst_time, par->child_burst_cache); ++ } ++ ++ se->max_burst_time = se->prev_burst_time; ++} ++#endif // CONFIG_SCHED_BORE ++ + /* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. +@@ -4500,6 +4551,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; ++#ifdef CONFIG_SCHED_BORE ++ __sched_fork_bore(p); ++#endif // CONFIG_SCHED_BORE + INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -4726,6 +4780,9 @@ late_initcall(sched_core_sysctl_init); + int sched_fork(unsigned long clone_flags, struct task_struct *p) + { + __sched_fork(clone_flags, p); ++#ifdef CONFIG_SCHED_BORE ++ update_task_initial_burst_time(p); ++#endif // CONFIG_SCHED_BORE + /* + * We mark the process as NEW here. 
This guarantees that
+ * nobody will actually run it, and a signal or other external
+@@ -9922,6 +9979,11 @@ void __init sched_init(void)
+ BUG_ON(&dl_sched_class != &stop_sched_class + 1);
+ #endif
+ 
++#ifdef CONFIG_SCHED_BORE
++ sched_init_bore();
++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 2.4.0 by Masahito Suzuki");
++#endif // CONFIG_SCHED_BORE
++
+ wait_bit_init();
+ 
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
+index 066ff1c8a..4bc07d405 100644
+--- a/kernel/sched/debug.c
++++ b/kernel/sched/debug.c
+@@ -593,6 +593,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
+ 
++#ifdef CONFIG_SCHED_BORE
++ SEQ_printf(m, " %2d", p->se.penalty_score);
++#endif
+ #ifdef CONFIG_NUMA_BALANCING
+ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
+ #endif
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 9fe8288b1..ac29ac350 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -19,6 +19,9 @@
+ *
+ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
++ *
++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler
++ * Copyright (C) 2021-2023 Masahito Suzuki
+ */
+ #include
+ #include
+@@ -140,6 +143,61 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+ 
+ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+ 
++#ifdef CONFIG_SCHED_BORE
++unsigned int __read_mostly sched_bore = 1;
++unsigned int __read_mostly sched_burst_cache_lifetime = 15000000;
++unsigned int __read_mostly sched_burst_penalty_offset = 12;
++unsigned int __read_mostly sched_burst_penalty_scale = 1292;
++unsigned int __read_mostly sched_burst_smoothness = 1;
++static int three = 3;
++static int sixty_four = 64;
++static int maxval_12_bits = 4095;
++
++#define FIXED_SHIFT 10
++#define FIXED_ONE (1 << FIXED_SHIFT)
++typedef u32 fixed;
++
++static void update_burst_score(struct sched_entity *se) {
++ u64 burst_time = se->max_burst_time;
++
++ int msb = fls64(burst_time);
++ fixed integer_part = msb << FIXED_SHIFT;
++ fixed fractional_part = burst_time << (64 - msb) << 1 >> (64 - FIXED_SHIFT);
++ fixed greed = integer_part | fractional_part;
++
++ fixed tolerance = sched_burst_penalty_offset << FIXED_SHIFT;
++ fixed penalty = max(0, (s32)greed - (s32)tolerance);
++ fixed scaled_penalty = penalty * sched_burst_penalty_scale >> 10;
++
++ u8 score = min(39U, scaled_penalty >> FIXED_SHIFT);
++ se->penalty_score = score;
++}
++
++static inline u64 penalty_scale(u64 delta, struct sched_entity *se) {
++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->penalty_score], 22);
++}
++
++static inline u64 __binary_smooth(u64 new, u64 old, unsigned int smoothness) {
++ return (new + old * ((1 << smoothness) - 1)) >> smoothness;
++}
++
++void restart_burst(struct sched_entity *se) {
++ se->max_burst_time = se->prev_burst_time = __binary_smooth(
++ se->burst_time, se->prev_burst_time, sched_burst_smoothness);
++ se->burst_time = 0;
++}
++
++#define calc_delta_fair(delta, se) __calc_delta_fair(delta, se, true)
++#define calc_delta_fair_unscaled(delta, se) __calc_delta_fair(delta, se, false)
++static inline u64
++__calc_delta_fair(u64 delta, struct sched_entity *se, bool bscale);
++
++static s64 wakeup_preempt_backstep_delta(u64 rtime, struct sched_entity *se) {
++ u64 delta = calc_delta_fair_unscaled(rtime, se);
++ return delta - penalty_scale(delta, se);
++}
++#endif // CONFIG_SCHED_BORE
++
+ int sched_thermal_decay_shift;
+ static int __init setup_sched_thermal_decay_shift(char *str)
+ {
+@@ -203,6 +261,51 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+ 
+ #ifdef CONFIG_SYSCTL
+ static struct ctl_table sched_fair_sysctls[] = {
++#ifdef CONFIG_SCHED_BORE
++ {
++ .procname = "sched_bore",
++ .data = &sched_bore,
++ .maxlen = sizeof(unsigned int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec_minmax,
++ .extra1 = SYSCTL_ZERO,
++ .extra2 = SYSCTL_ONE,
++ },
++ {
++ .procname = "sched_burst_cache_lifetime",
++ .data = &sched_burst_cache_lifetime,
++ .maxlen = sizeof(unsigned int),
++ .mode = 0644,
++ .proc_handler = proc_dointvec,
++ },
++ {
++ .procname = "sched_burst_penalty_offset",
++ .data = &sched_burst_penalty_offset,
++ .maxlen = sizeof(unsigned int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec_minmax,
++ .extra1 = SYSCTL_ZERO,
++ .extra2 = &sixty_four,
++ },
++ {
++ .procname = "sched_burst_penalty_scale",
++ .data = &sched_burst_penalty_scale,
++ .maxlen = sizeof(unsigned int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec_minmax,
++ .extra1 = SYSCTL_ZERO,
++ .extra2 = &maxval_12_bits,
++ },
++ {
++ .procname = "sched_burst_smoothness",
++ .data = &sched_burst_smoothness,
++ .maxlen = sizeof(unsigned int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec_minmax,
++ .extra1 = SYSCTL_ZERO,
++ .extra2 = &three,
++ },
++#endif // CONFIG_SCHED_BORE
+ {
+ .procname = "sched_child_runs_first",
+ .data = &sysctl_sched_child_runs_first,
+@@ -724,11 +827,19 @@ int sched_update_scaling(void)
+ /*
+ * delta /= w
+ */
++#ifdef CONFIG_SCHED_BORE
++static inline u64
++__calc_delta_fair(u64 delta, struct sched_entity *se, bool bscale)
++#else // CONFIG_SCHED_BORE
+ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
++#endif // CONFIG_SCHED_BORE
+ {
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+ 
++#ifdef CONFIG_SCHED_BORE
++ if (bscale && sched_bore) delta = penalty_scale(delta, se);
++#endif // CONFIG_SCHED_BORE
+ return delta;
+ }
+ 
+@@ -938,6 +1049,14 @@ static void update_curr(struct cfs_rq *cfs_rq)
+ curr->sum_exec_runtime += delta_exec;
+ schedstat_add(cfs_rq->exec_clock, delta_exec);
+ 
++#ifdef CONFIG_SCHED_BORE
++ curr->burst_time += delta_exec;
++ curr->max_burst_time = max(curr->max_burst_time, curr->burst_time);
++ update_burst_score(curr);
++ if (sched_bore)
++ curr->vruntime += penalty_scale(calc_delta_fair(delta_exec, curr), curr);
++ else
++#endif // CONFIG_SCHED_BORE
+ curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_min_vruntime(cfs_rq);
+ 
+@@ -6410,6 +6529,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ util_est_dequeue(&rq->cfs, p);
+ 
+ for_each_sched_entity(se) {
++#ifdef CONFIG_SCHED_BORE
++ if (task_sleep) restart_burst(se);
++#endif // CONFIG_SCHED_BORE
+ cfs_rq = cfs_rq_of(se);
+ dequeue_entity(cfs_rq, se, flags);
+ 
+@@ -7844,7 +7966,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
+ */
++#ifdef CONFIG_SCHED_BORE
++ return calc_delta_fair_unscaled(gran, se);
++#else // CONFIG_SCHED_BORE
+ return calc_delta_fair(gran, se);
++#endif // CONFIG_SCHED_BORE
+ }
+ 
+ /*
+@@ -7865,6 +7991,13 @@ static int
+ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+ {
+ s64 gran, vdiff = curr->vruntime - se->vruntime;
++#ifdef CONFIG_SCHED_BORE
++ if (sched_bore) {
++ u64 rtime = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
++ vdiff += wakeup_preempt_backstep_delta(rtime, curr)
++ - wakeup_preempt_backstep_delta(rtime, se);
++ }
++#endif // CONFIG_SCHED_BORE
+ 
+ if (vdiff <= 0)
+ return -1;
+@@ -8210,6 +8343,9 @@ static void yield_task_fair(struct rq *rq)
+ struct task_struct *curr = rq->curr;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ struct sched_entity *se = &curr->se;
++#ifdef CONFIG_SCHED_BORE
++ restart_burst(se);
++#endif // CONFIG_SCHED_BORE
+ 
+ /*
+ * Are we the only task in the tree?
+diff --git a/kernel/sched/features.h b/kernel/sched/features.h
+index 9e390eb82..696ea7081 100644
+--- a/kernel/sched/features.h
++++ b/kernel/sched/features.h
+@@ -4,7 +4,11 @@
+ * them to run sooner, but does not allow tons of sleepers to
+ * rip the spread apart.
+ */
++#ifdef CONFIG_SCHED_BORE
++SCHED_FEAT(GENTLE_FAIR_SLEEPERS, false)
++#else // CONFIG_SCHED_BORE
+ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
++#endif // CONFIG_SCHED_BORE
+ 
+ /*
+ * Place new tasks ahead so that they do not starve already running
+@@ -17,7 +21,11 @@ SCHED_FEAT(START_DEBIT, true)
+ * wakeup-preemption), since its likely going to consume data we
+ * touched, increases cache locality.
+ */
++#ifdef CONFIG_SCHED_BORE
++SCHED_FEAT(NEXT_BUDDY, true)
++#else // CONFIG_SCHED_BORE
+ SCHED_FEAT(NEXT_BUDDY, false)
++#endif // CONFIG_SCHED_BORE
+ 
+ /*
+ * Prefer to schedule the task that ran last (when we did
+-- 
+2.41.0.rc2
diff --git a/patches/0003-Allow-to-set-custom-USB-pollrate-for-specific-device.patch b/patches/0004-Allow-to-set-custom-USB-pollrate-for-specific-device.patch
similarity index 100%
rename from patches/0003-Allow-to-set-custom-USB-pollrate-for-specific-device.patch
rename to patches/0004-Allow-to-set-custom-USB-pollrate-for-specific-device.patch
diff --git a/patches/0004-amdgpu-si-cik-default.patch b/patches/0005-amdgpu-si-cik-default.patch
similarity index 100%
rename from patches/0004-amdgpu-si-cik-default.patch
rename to patches/0005-amdgpu-si-cik-default.patch
diff --git a/scripts/build.sh b/scripts/build.sh
index 7b8b207..c4c2314 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -2,4 +2,4 @@
 
 echo "Pika Kernel - Building"
 
-make -j`nproc` bindeb-pkg LOCALVERSION=-pikaos KDEB_PKGVERSION=$(make kernelversion)-5
+make -j`nproc` bindeb-pkg LOCALVERSION=-pikaos KDEB_PKGVERSION=$(make kernelversion)-1
diff --git a/scripts/patch.sh b/scripts/patch.sh
index 9a9e251..cd2c0b3 100755
--- a/scripts/patch.sh
+++ b/scripts/patch.sh
@@ -7,8 +7,10 @@ echo "Pika Kernel - Applying patches"
 patch -Np1 < "../patches/0001-cachy-all.patch"
 # orig patch from cachy
 patch -Np1 < "../patches/0002-eevdf.patch"
+# orig patch from cachy
+patch -Np1 < "../patches/0003-bore.patch"
 # Nobara patches are here: https://github.com/sammilucia/nobara-kernel-fork
 # Allow setting custom pollrates for usb devices
-patch -Np1 < "../patches/0003-Allow-to-set-custom-USB-pollrate-for-specific-device.patch"
+patch -Np1 < "../patches/0004-Allow-to-set-custom-USB-pollrate-for-specific-device.patch"
 # Allow pre polaris cards to use the amdgpu kernel module
-patch -Np1 < "../patches/0004-amdgpu-si-cik-default.patch"
+patch -Np1 < "../patches/0005-amdgpu-si-cik-default.patch"
diff --git a/scripts/source.sh b/scripts/source.sh
index 6156c9f..43f38ab 100755
--- a/scripts/source.sh
+++ b/scripts/source.sh
@@ -2,7 +2,7 @@
 
 echo "Pika Kernel - Getting source"
 
-wget -nv https://git.kernel.org/torvalds/t/linux-6.4-rc5.tar.gz
-tar -xf ./linux-6.4-rc5.tar.gz
+wget -nv https://git.kernel.org/torvalds/t/linux-6.4-rc6.tar.gz
+tar -xf ./linux-6.4-rc6.tar.gz
 
-cd linux-6.4-rc5
+cd linux-6.4-rc6